In [None]:
# Mount Google Drive (Colab only) so the CSV files read below from /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import math
import time
from pickle import load
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import statsmodels.api as sm
import re
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from pickle import dump
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata
import warnings
from textblob import TextBlob
from math import sqrt

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# Load the two input CSVs from Google Drive: the stock tweets scraped from Twitter
# (tweets_data) and the Yahoo Finance price history (stocks_data).
tweets_data = pd.read_csv('/content/drive/MyDrive/Jatin/Stock_Sentiment_Analysis/stock_data_tweets.csv')
stocks_data = pd.read_csv('/content/drive/MyDrive/Jatin/Stock_Sentiment_Analysis/stock_data_yfinance.csv')

In [None]:
# Preview the first rows of the price data.
stocks_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA
1,2021-10-01,259.466675,260.26001,254.529999,258.406677,258.406677,51094200,TSLA
2,2021-10-04,265.5,268.98999,258.706665,260.51001,260.51001,91449900,TSLA
3,2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800,TSLA
4,2021-10-06,258.733337,262.220001,257.73999,260.916656,260.916656,43898400,TSLA


In [None]:
# Preview the first rows of the tweet data.
tweets_data.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."


In [None]:
# Print the (rows, columns) shape of the tweet frame, then of the price frame.
print(tweets_data.shape,stocks_data.shape)

(80793, 4) (6300, 8)


In [None]:
# Combine each tweet with its ticker symbol into a single string, then strip '$' signs
# and newlines from it.
# The columns are selected by name and processed with vectorised string methods. The
# previous per-row `tweets_data.iloc[row, 1:3]` comprehension depended on column position
# and performed a Python-level lookup for every one of the ~80k rows.
# `.map(str)` mirrors the original `map(str, ...)`, so missing values still become 'nan'.
combined_text = tweets_data['Tweet'].map(str) + ' ' + tweets_data['Stock Name'].map(str)
tweets_list = combined_text.tolist()
print(tweets_list[0])

# Same substitution as re.sub(r"(\$|\n)", '', tweet), applied to the whole column at once.
cleaned_text = combined_text.str.replace(r"[$\n]", '', regex=True)
cleaned_tweets = cleaned_text.tolist()
print(cleaned_tweets[20])

# Store the cleaned text next to the raw tweet (cleaned_text shares tweets_data's index).
tweets_data['Cleaned_tweets'] = cleaned_text
print(tweets_data.head(3))

# Keep only the calendar date of each timestamp so tweets can later be matched to the
# daily price rows.
tweets_data['Date'] = pd.to_datetime(tweets_data['Date']).dt.date
print(tweets_data.head())

Mainstream media has done an amazing job at brainwashing people. Today at work, we were asked what companies we believe in &amp; I said @Tesla because they make the safest cars &amp; EVERYONE disagreed with me because they heard“they catch on fire &amp; the batteries cost 20k to replace” TSLA
According to California rules that are now also being adopted by New York state, 69% of all new cars sold will have to be EVs by 2030. @elonmusk TSLA TSLA
         Date                                              Tweet Stock Name  \
0  2022-09-29  Mainstream media has done an amazing job at br...       TSLA   
1  2022-09-29  Tesla delivery estimates are at around 364k fr...       TSLA   
2  2022-09-29  3/ Even if I include 63.0M unvested RSUs as of...       TSLA   

  Company Name                                     Cleaned_tweets  
0  Tesla, Inc.  Mainstream media has done an amazing job at br...  
1  Tesla, Inc.  Tesla delivery estimates are at around 364k fr...  
2  Tesla, Inc.  3/ Even if I i

In [None]:
# Sentiment features, part 1: TextBlob subjectivity and polarity.
def compute_subjectivity(text):
    """Return the TextBlob subjectivity score of ``text``."""
    return TextBlob(text).sentiment.subjectivity

def compute_polarity(text):
    """Return the TextBlob polarity score of ``text``."""
    return TextBlob(text).sentiment.polarity

# Analyse every tweet once and read both scores from the same result, instead of running
# TextBlob twice per tweet (once per column).
textblob_sentiments = [TextBlob(text).sentiment for text in tweets_data['Cleaned_tweets']]
tweets_data['Subjectivity'] = [sentiment.subjectivity for sentiment in textblob_sentiments]
tweets_data['Polarity'] = [sentiment.polarity for sentiment in textblob_sentiments]
print(tweets_data.head(3))

# Sentiment features, part 2: VADER scores from the nltk library.
# Make sure the lexicon is available before the analyzer is constructed.
nltk.download('vader_lexicon', quiet=True)

# One shared analyzer for the whole notebook: constructing a new
# SentimentIntensityAnalyzer inside analyze_sentiment repeated its setup for every tweet.
_VADER_ANALYZER = SentimentIntensityAnalyzer()

def analyze_sentiment(text, analyzer=None):
    """Return the VADER polarity scores of ``text``.

    Args:
        text: The string to score.
        analyzer: Optional SentimentIntensityAnalyzer to use; defaults to the shared
            module-level instance.

    Returns:
        The dict produced by ``polarity_scores`` (keys used below: 'compound', 'neg',
        'neu', 'pos').
    """
    if analyzer is None:
        analyzer = _VADER_ANALYZER
    return analyzer.polarity_scores(text)

# Score all tweets, then expand the per-tweet dicts into columns in a single step.
# Passing `columns` keeps the frame well-formed even when there are no tweets.
vader_scores = pd.DataFrame(
    [analyze_sentiment(tweet) for tweet in tweets_data['Cleaned_tweets']],
    index=tweets_data.index,
    columns=['compound', 'neg', 'neu', 'pos'],
)

tweets_data['Compound'] = vader_scores['compound']
tweets_data['Negative'] = vader_scores['neg']
tweets_data['Neutral'] = vader_scores['neu']
tweets_data['Positive'] = vader_scores['pos']
print(tweets_data.head(1))

         Date                                              Tweet Stock Name  \
0  2022-09-29  Mainstream media has done an amazing job at br...       TSLA   
1  2022-09-29  Tesla delivery estimates are at around 364k fr...       TSLA   
2  2022-09-29  3/ Even if I include 63.0M unvested RSUs as of...       TSLA   

  Company Name                                     Cleaned_tweets  \
0  Tesla, Inc.  Mainstream media has done an amazing job at br...   
1  Tesla, Inc.  Tesla delivery estimates are at around 364k fr...   
2  Tesla, Inc.  3/ Even if I include 63.0M unvested RSUs as of...   

   Subjectivity  Polarity  
0      0.900000  0.600000  
1      0.000000  0.000000  
2      0.277273  0.018182  
         Date                                              Tweet Stock Name  \
0  2022-09-29  Mainstream media has done an amazing job at br...       TSLA   

  Company Name                                     Cleaned_tweets  \
0  Tesla, Inc.  Mainstream media has done an amazing job at br... 

In [None]:
# Drop the raw text columns, binarise the sentiment label, then attach every tweet to the
# price row of the SAME ticker and trading day.
#
# 'Stock Name' is deliberately kept until after the merge. Joining on 'Date' alone paired
# each tweet with the price rows of every ticker traded that day (1,591,900 merged rows
# from 6,300 price rows), so sentiment about one company was attached to other
# companies' prices.
columns_to_remove = ['Cleaned_tweets', 'Tweet', 'Company Name']
tweets_data = tweets_data.drop(columns=[col for col in columns_to_remove if col in tweets_data.columns])

# Binarise the compound score BEFORE merging so that merged_data['Compound'] carries the
# 0/1 label used for modelling. Previously this ran after the merge and only changed
# tweets_data, leaving merged_data['Compound'] continuous.
threshold_value = 0.0
tweets_data['Compound'] = (tweets_data['Compound'] > threshold_value).astype(int)
print(tweets_data.head(3))

stocks_data['Date'] = pd.to_datetime(stocks_data['Date']).dt.date

# how='inner': a price row without any tweet for that ticker/day would have no label and
# only missing feature values, which the classifiers trained on merged_data cannot use.
# Tweets posted on days with no price row are dropped, as they were before.
merged_data = (
    stocks_data
    .merge(tweets_data, on=['Date', 'Stock Name'], how='inner')
    .drop(columns=['Stock Name'])  # keep merged_data purely numeric apart from 'Date'
)
print(merged_data.shape)
print(merged_data.head())


         Date  Subjectivity  Polarity  Compound  Negative  Neutral  Positive
0  2022-09-29      0.900000  0.600000    0.0772     0.125    0.763     0.113
1  2022-09-29      0.000000  0.000000    0.0000     0.000    1.000     0.000
2  2022-09-29      0.277273  0.018182    0.2960     0.000    0.952     0.048
(1591900, 13)
         Date        Open        High         Low       Close   Adj Close  \
0  2021-09-30  260.333344  263.043335  258.333344  258.493347  258.493347   
1  2021-09-30  260.333344  263.043335  258.333344  258.493347  258.493347   
2  2021-09-30  260.333344  263.043335  258.333344  258.493347  258.493347   
3  2021-09-30  260.333344  263.043335  258.333344  258.493347  258.493347   
4  2021-09-30  260.333344  263.043335  258.333344  258.493347  258.493347   

     Volume  Subjectivity  Polarity  Compound  Negative  Neutral  Positive  
0  53868000      0.300000  0.125000    0.5093     0.000    0.914     0.086  
1  53868000      0.000000  0.000000   -0.1531     0.046    0.

In [None]:
# Prepare the data for model training, then split it into training and test sets.
# Features: every merged_data column except the 'Compound' target and the 'Date' key.
# NOTE(review): the remaining sentiment columns (Polarity, Negative, Neutral, Positive) are
# computed from the same tweet text as 'Compound' — presumably this leaks the label into X;
# confirm that is intended.
X = merged_data.drop(columns=['Compound', 'Date']).values
# Target: the 'Compound' column.
y = merged_data['Compound'].values

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Encode the target labels as consecutive integers.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Learn the label mapping from the training labels only, then reuse it for the test labels.
y_train = encoder.fit_transform(y_train)
# NOTE(review): this assumes every test label also occurs in y_train — confirm that y is a
# small discrete label set at this point.
y_test = encoder.transform(y_test)

In [None]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Train an XGBoost model and a Logistic Regression model on the same split.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Hard class predictions: used for accuracy / precision / recall / F1.
xgb_pred = xgb_model.predict(X_test)
lr_pred = lr_model.predict(X_test)

# Positive-class probabilities: used for ROC AUC and the ROC curve. Feeding hard 0/1
# predictions to roc_curve yields a single operating point, not a curve.
# Column 1 is the probability of the second class, i.e. label 1 in a binary problem.
xgb_proba = xgb_model.predict_proba(X_test)[:, 1]
lr_proba = lr_model.predict_proba(X_test)[:, 1]

def evaluate_model(y_test, y_pred, model_name, y_score=None):
    """Print classification metrics for one model and return its ROC AUC.

    Args:
        y_test: True labels.
        y_pred: Predicted class labels.
        model_name: Name shown in the printed report.
        y_score: Optional positive-class scores/probabilities used for ROC AUC.
            When omitted, ``y_pred`` is used, which matches the previous behaviour.

    Returns:
        The ROC AUC value that was printed.
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred if y_score is None else y_score)

    print(f"{model_name} Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}\n")

    return roc_auc

# Evaluate both models, ranking by predicted probability for the ROC AUC.
xgb_roc_auc = evaluate_model(y_test, xgb_pred, "XGBoost", y_score=xgb_proba)

lr_roc_auc = evaluate_model(y_test, lr_pred, "Logistic Regression", y_score=lr_proba)

def plot_roc_curve(y_test, y_pred, model_name):
    """Add one model's ROC curve to the current matplotlib figure.

    Args:
        y_test: True labels.
        y_pred: Scores used to trace the curve. Pass positive-class probabilities;
            hard class labels give only a single threshold.
        model_name: Legend label; the AUC is appended to it.
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    plt.plot(fpr, tpr, label=f'{model_name} (area = {roc_auc_score(y_test, y_pred):.2f})')

# Plot both ROC curves against the chance diagonal.
plt.figure()
plot_roc_curve(y_test, xgb_proba, "XGBoost")
plot_roc_curve(y_test, lr_proba, "Logistic Regression")
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Turn the sentiment label into a trading signal: +1 (buy) when 'Compound' is positive,
# -1 (sell) when it is zero or negative. Rows where neither comparison is true (missing
# 'Compound') keep the initial 0.
merged_data['Signal'] = 0
is_positive = merged_data['Compound'] > 0
is_non_positive = merged_data['Compound'] <= 0
merged_data.loc[is_positive, 'Signal'] = 1
merged_data.loc[is_non_positive, 'Signal'] = -1

# Row-over-row return of the closing price, and the strategy return obtained by holding
# the PREVIOUS row's signal over that return (shift(1) avoids look-ahead).
# NOTE(review): merged_data has several rows per 'Date' and no ticker column, so
# consecutive rows are presumably not consecutive trading days of one stock — confirm
# that pct_change() over the raw row order is what is wanted here.
row_return = merged_data['Close'].pct_change()
merged_data['Return'] = row_return
merged_data['Strategy_Return'] = merged_data['Signal'].shift(1) * row_return

# Compound the per-row returns into growth-of-1 curves.
merged_data['Cumulative_Return'] = merged_data['Return'].add(1).cumprod()
merged_data['Cumulative_Strategy_Return'] = merged_data['Strategy_Return'].add(1).cumprod()

# Plot both cumulative return curves on one axis.
fig, ax = plt.subplots(figsize=(14, 7))
curves = (
    ('Cumulative_Return', 'Buy and Hold'),
    ('Cumulative_Strategy_Return', 'Sentiment Strategy'),
)
for column, label in curves:
    ax.plot(merged_data['Date'], merged_data[column], label=label)
ax.legend(loc='upper left')
ax.set_title('Cumulative Returns')
plt.show()

# The last point of the strategy curve is the final value of 1 unit invested;
# its distance from 1 is the total return in percent.
final_portfolio_value = merged_data['Cumulative_Strategy_Return'].iloc[-1]
total_return = (final_portfolio_value - 1) * 100
print(f"Final Portfolio Value: ${final_portfolio_value:.2f}")
print(f"Total Return: {total_return:.2f}%")


In [None]:
import numpy as np

# Calculate the Sharpe ratio, maximum drawdown, number of trades executed and win ratio.

# Rows whose strategy return is undefined (NaN produced by pct_change()/shift(1)) are not
# trades. They are removed up front so they cannot be counted in the win-ratio
# denominator: `NaN != 0` evaluates to True.
strategy_returns = merged_data['Strategy_Return'].dropna()

# Annualised Sharpe ratio, assuming 252 trading days per year. It is undefined (NaN) when
# there are fewer than two returns or the returns have no variance.
returns_std = strategy_returns.std()
if len(strategy_returns) > 1 and returns_std > 0:
    sharpe_ratio = strategy_returns.mean() / returns_std * np.sqrt(252)
else:
    sharpe_ratio = float('nan')

# Maximum drawdown: the largest relative drop of the strategy curve below its running peak.
rolling_max = merged_data['Cumulative_Strategy_Return'].cummax()
daily_drawdown = merged_data['Cumulative_Strategy_Return'] / rolling_max - 1.0
max_drawdown = daily_drawdown.min()

# Sum of absolute signal changes between consecutive rows. A flip between +1 and -1
# contributes 2 (closing one position and opening the opposite one).
num_trades = int(merged_data['Signal'].diff().abs().sum())

# Win ratio over rows with a non-zero, defined strategy return.
active_returns = strategy_returns[strategy_returns != 0]
winning_trades = int((active_returns > 0).sum())
total_trades = len(active_returns)
win_ratio = winning_trades / total_trades if total_trades else float('nan')

# Display the performance metrics.
print(f"Sharpe Ratio: {sharpe_ratio:.4f}")
print(f"Maximum Drawdown: {max_drawdown:.4f}")
print(f"Number of Trades Executed: {num_trades}")
print(f"Win Ratio: {win_ratio:.4f}")
