In [289]:
# Import necessary libraries
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib

In [290]:
#!pip install --upgrade pandas

In [None]:
df = pd.read_csv(r"C:\Users\neeha\Downloads\stock_sentiment\data\apple_final_data.csv")

In [292]:
df = df.dropna()

In [293]:
df.head()

Unnamed: 0,scraped_at,title,vader_sentiment,finbert_sentiment,headline_date,date,close,next_close,label
0,2024-11-26 00:00:00+00:00,What Is a Stock Market Index?,neutral,neutral,2024-11-26,2024-11-26,234.801834,234.671982,0
1,2024-11-26 00:00:00+00:00,"Could Investing 1,000 in Apple Make You a Mill...",neutral,neutral,2024-11-26,2024-11-26,234.801834,234.671982,0
2,2024-11-26 00:00:00+00:00,Dow Jones Industrial Average,neutral,neutral,2024-11-26,2024-11-26,234.801834,234.671982,0
3,2024-11-26 00:00:00+00:00,What Is the SP 500 Index?,neutral,neutral,2024-11-26,2024-11-26,234.801834,234.671982,0
4,2024-11-25 00:00:00+00:00,A Complete Guide to College Savings Accounts i...,neutral,neutral,2024-11-25,2024-11-25,232.614243,234.801834,1


In [294]:
df.shape

(20923, 9)


### Feature Engineering

In [295]:
# Length of news headline
df["news_length"] = df["title"].astype(str).apply(len)

In [296]:
# Map sentiments from text labels to numeric
sentiment_map = {"positive" : 1 ,"negative" : -1, "neutral" : 0}
df['vader_sentiment'] = df['vader_sentiment'].map(sentiment_map)
df['finbert_sentiment'] = df['finbert_sentiment'].map(sentiment_map)

In [297]:
# Extract day of the week (0 = Monday, 6 = Sunday
df['day_of_week'] = pd.to_datetime(df['scraped_at']).dt.weekday

In [298]:
# Create sentiment agreement feature
df['sentiment_agreement'] = (df['vader_sentiment'] == df['finbert_sentiment']).astype(int)

In [299]:
# raw VADER sentiment score
vader = SentimentIntensityAnalyzer()

df["vader_score"] = df["title"].apply(lambda x: vader.polarity_scores(x)["compound"])

In [300]:
features = ['vader_score','finbert_sentiment','day_of_week','sentiment_agreement','news_length']
target = ['label']

In [301]:
x = df[features]
y = df[target]

In [302]:
x.head()

Unnamed: 0,vader_score,finbert_sentiment,day_of_week,sentiment_agreement,news_length
0,0.0,0,1,1,29
1,0.0,0,1,1,54
2,0.0,0,1,1,28
3,0.0,0,1,1,25
4,0.0,0,0,1,52


In [303]:
#!pip install xgboost

In [304]:
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [305]:
# Calculate scale ratio for XGBoost to handle imbalance
scale_ratio = len(y[y == 0]) / len(y[y == 1])

In [306]:
xgb_model = XGBClassifier(use_label_encoder = False, eval_metric = 'logloss', scale_pos_weight = scale_ratio)

In [307]:
xgb_model.fit(x_train,y_train)

Parameters: { "use_label_encoder" } are not used.



In [308]:
pred = model.predict(x_test)

In [309]:
print(" Model: XGBoost")
print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))
print("-" * 50)

 Model: XGBoost
Accuracy: 0.5328554360812425
              precision    recall  f1-score   support

           0       0.51      0.51      0.51      1978
           1       0.56      0.55      0.56      2207

    accuracy                           0.53      4185
   macro avg       0.53      0.53      0.53      4185
weighted avg       0.53      0.53      0.53      4185

--------------------------------------------------


In [310]:
# comparing with other models
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", scale_pos_weight=scale_ratio),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
}

for name, model in models.items():
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))
    print("-" * 50)


Parameters: { "use_label_encoder" } are not used.

  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


Model: XGBoost
Accuracy: 0.5302270011947431
              precision    recall  f1-score   support

           0       0.50      0.45      0.48      1978
           1       0.55      0.60      0.57      2207

    accuracy                           0.53      4185
   macro avg       0.53      0.53      0.53      4185
weighted avg       0.53      0.53      0.53      4185

--------------------------------------------------
Model: Logistic Regression
Accuracy: 0.5273596176821983
              precision    recall  f1-score   support

           0       0.50      0.00      0.00      1978
           1       0.53      1.00      0.69      2207

    accuracy                           0.53      4185
   macro avg       0.51      0.50      0.35      4185
weighted avg       0.51      0.53      0.37      4185

--------------------------------------------------
Model: Random Forest
Accuracy: 0.535483870967742
              precision    recall  f1-score   support

           0       0.51      0.51      0

In [311]:
# Save the trained XGBoost model
joblib.dump(xgb_model, "apple_stock_sentiment_model.pkl")

['apple_stock_sentiment_model.pkl']