In [3]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

file_path = "data.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "sbhatti/financial-sentiment-analysis",
  file_path
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


First 5 records:                                             Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral


In [4]:
import pandas as pd

df = pd.read_csv("data.csv")

print("First 5 rows of the DataFrame:\n", df.head())

print("\nShape of the DataFrame:", df.shape)

print("\nInformation about the DataFrame:")
df.info()

print("\nMissing values in the DataFrame:\n", df.isnull().sum())

df = df.dropna()

print("\nDistribution of sentiment labels:\n", df['Sentiment'].value_counts())

First 5 rows of the DataFrame:
                                             Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral

Shape of the DataFrame: (5842, 2)

Information about the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB

Missing values in the DataFrame:
 Sentence     0
Sentiment    0
dtype: int64

Distribution of sentiment labels:
 Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64


In [5]:
import re

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

df['cleaned_sentence'] = df['Sentence'].apply(clean_text)
print("Cleaned sentences:\n", df['cleaned_sentence'].head())


Cleaned sentences:
 0    the geosolutions technology will leverage bene...
1          esi on lows down  to  bk a real possibility
2    for the last quarter of   componenta s net sal...
3    according to the finnishrussian chamber of com...
4    the swedish buyout firm has sold its remaining...
Name: cleaned_sentence, dtype: object


In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Sentiment'])
print("Encoded labels:\n", df['label'].value_counts())
#1 is for neutral
#2 is for positive
#0 is for negative

Encoded labels:
 label
1    3130
2    1852
0     860
Name: count, dtype: int64


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # Limit features for efficiency
X_tfidf = vectorizer.fit_transform(df['cleaned_sentence'])

print("TF-IDF feature matrix shape:", X_tfidf.shape)

TF-IDF feature matrix shape: (5842, 5000)


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['label'], test_size=0.2, random_state=42)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.3f}")

# Print classification report for detailed metrics
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.728

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.15      0.24       175
           1       0.71      0.91      0.80       622
           2       0.79      0.70      0.74       372

    accuracy                           0.73      1169
   macro avg       0.68      0.59      0.59      1169
weighted avg       0.71      0.73      0.70      1169



In [10]:
import pickle

# Save the model
with open("sentiment_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Save the vectorizer
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [11]:
!pip install flask pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [12]:
!pip install pyngrok --quiet

In [13]:
!ngrok config add-authtoken 2uU6McoAccmroZOncSzMuTfYeiH_5YBg47TufHEzJAiUP82q4

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [14]:
from flask import Flask, request, jsonify
from pyngrok import ngrok
import pickle
import nest_asyncio

# Apply nest_asyncio for Jupyter/Colab compatibility
nest_asyncio.apply()

# Load the saved model and vectorizer
with open("sentiment_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Initialize Flask app
app = Flask(__name__)

# Home route
@app.route('/')
def home():
    return "Financial News Sentiment Analyzer is running!"

# Sentiment analysis route
@app.route('/analyze', methods=['POST'])
def analyze_sentiment():
    data = request.json['headline']
    processed_data = vectorizer.transform([data])
    prediction = model.predict(processed_data)[0]
    return jsonify({"sentiment": prediction})

# Start ngrok and expose the Flask app
if __name__ == '__main__':
    # Start ngrok tunnel
    public_url = ngrok.connect(5000).public_url
    print(f" * ngrok tunnel: {public_url}")

    # Run the Flask app
    app.run(host='0.0.0.0', port=5000)

 * ngrok tunnel: https://6a55-34-168-31-17.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [23/Mar/2025 10:51:43] "[31m[1mPOST / HTTP/1.1[0m" 405 -
INFO:werkzeug:127.0.0.1 - - [23/Mar/2025 11:03:19] "[31m[1mPOST / HTTP/1.1[0m" 405 -
INFO:werkzeug:127.0.0.1 - - [23/Mar/2025 11:04:07] "[31m[1mPOST / HTTP/1.1[0m" 405 -
