In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Dropout, GlobalAveragePooling1D, Concatenate, Lambda
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

**1. Data Extraction Module**

In [None]:
import os, json, re
import pandas as pd
from datetime import datetime

def read_tweets_from_directory(directory):
    """Load line-delimited tweet JSON files from ``directory`` into a DataFrame.

    Every regular file directly inside ``directory`` is read as UTF-8 text with
    one JSON-encoded tweet per line. Sub-directories and blank lines are skipped.
    Lines that cannot be parsed (invalid JSON, missing/ill-formatted
    ``created_at``, unexpected structure) are reported with ``print`` and skipped.

    Parameters
    ----------
    directory : str
        Path of the folder holding the tweet files.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (parsed from ``created_at``), ``Tweet`` (``text``) and
        the user counters ``Followers``, ``Friends``, ``Statuses``,
        ``Favorites``. Counters missing from a tweet are stored as missing
        values. The columns are present even when no tweet could be read.
    """
    columns = ['Date', 'Tweet', 'Followers', 'Friends', 'Statuses', 'Favorites']
    records = []

    # Sorted so that the row order does not depend on the filesystem listing order.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            # os.listdir also returns sub-directories, which cannot be opened as text.
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    continue
                try:
                    parsed_tweet = json.loads(line)
                    # 'user' may be absent or explicitly null; treat both as "no user info".
                    user = parsed_tweet.get('user') or {}
                    # Build the whole record before storing anything, so a failure
                    # half-way through can never leave the columns misaligned.
                    record = {
                        'Date': datetime.strptime(parsed_tweet.get('created_at'),
                                                  '%a %b %d %H:%M:%S %z %Y'),
                        'Tweet': parsed_tweet.get('text'),
                        'Followers': user.get('followers_count'),
                        'Friends': user.get('friends_count'),
                        'Statuses': user.get('statuses_count'),
                        'Favorites': user.get('favourites_count'),
                    }
                except (ValueError, TypeError, AttributeError) as e:
                    # ValueError covers json.JSONDecodeError and bad date strings.
                    print("Error:", e)
                    continue
                records.append(record)

    return pd.DataFrame(records, columns=columns)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Date,Tweet,Followers,Friends,Statuses,Favorites
0,2014-03-14 17:37:09,$MSFT Gaming Console Sales Data Is Mostly Nois...,46,0,57511,0
1,2014-03-14 22:31:15,$MSFT - Statement of Changes in Beneficial Own...,82,0,88569,0
2,2014-03-14 16:01:09,$MSFT Banks face additional Microsoft costs fr...,46,0,57483,0
3,2014-01-10 17:01:16,$MSFT IntraLinks Steps Up To The Plate - Becau...,4,0,21489,0
4,2014-01-10 18:10:57,$MSFT News Alert: http://t.co/9xOFoxdABD Upd...,54,123,606,0


**2. Text Cleaning Module**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Tokenizer models and the stop-word list used by clean_text.
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize look up 'punkt_tab' instead
# of 'punkt'; fetching both keeps the notebook runnable on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

def clean_text(text):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: strip URLs, strip ``@mentions`` and ``#`` characters,
    remove everything that is not an ASCII letter or whitespace, lower-case,
    tokenise with ``word_tokenize``, drop English stop words, and apply the
    Porter stemmer.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. ``None`` or NaN for a tweet
        without text) yield an empty string instead of raising.

    Returns
    -------
    str
        The cleaned, stemmed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text).lower()

    # Load the stop-word list once per call (it was previously re-read for
    # every token) and use a set for constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    tokens = word_tokenize(text)
    return ' '.join(stemmer.stem(word) for word in tokens if word not in english_stopwords)


**3. FinBERT Sentiment Scoring Module**

In [None]:
from transformers import pipeline

# Build the FinBERT-tone sentiment pipeline once at cell level; get_sentiment
# below reuses this single instance for every tweet.
finbert_sentiment = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", tokenizer="yiyanghkust/finbert-tone")

def get_sentiment(text):
    """Score one text with the module-level ``finbert_sentiment`` pipeline.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    tuple
        ``(label, score)`` taken from the first pipeline result. If the
        pipeline raises, the error is printed and ``('Neutral', 0.0)`` is
        returned so that one bad text does not abort a bulk run.
    """
    try:
        result = finbert_sentiment(text)
        return result[0]['label'], result[0]['score']
    except Exception as e:
        # Report the failure instead of hiding it. The fallback label is
        # capitalised like the model's own labels ('Neutral', 'Positive',
        # 'Negative') so it does not form a separate 'neutral' category.
        print("Error:", e)
        return 'Neutral', 0.0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]



In [None]:
# Preview the scored tweets ('mfst_tweets' was a typo for the 'msft_tweets'
# frame used by every other cell).
msft_tweets.head()

Unnamed: 0,Date,Tweet,Followers,Friends,Statuses,Favorites,Cleaned_Tweet,Sentiment,Sentiment_Score
0,2014-03-14 17:37:09,$MSFT Gaming Console Sales Data Is Mostly Nois...,46,0,57511,0,msft game consol sale data mostli nois,Neutral,0.999995
1,2014-03-14 22:31:15,$MSFT - Statement of Changes in Beneficial Own...,82,0,88569,0,msft statement chang benefici ownership,Neutral,0.999954
2,2014-03-14 16:01:09,$MSFT Banks face additional Microsoft costs fr...,46,0,57483,0,msft bank face addit microsoft cost outdat atm...,Neutral,0.997042
3,2014-01-10 17:01:16,$MSFT IntraLinks Steps Up To The Plate - Becau...,4,0,21489,0,msft intralink step plate world need anoth fil...,Neutral,0.99995
4,2014-01-10 18:10:57,$MSFT News Alert: http://t.co/9xOFoxdABD Upd...,54,123,606,0,msft news alert updat option view option view,Neutral,0.999833


**4. RBWTD Calculation Module**

In [None]:
def calculate_rbwtd_weights(df, current_time, decay_lambda=0.00001):
    """Add rank-weighted, exponentially time-decayed sentiment columns to ``df``.

    For every row:

    * the tweet age in hours is ``current_time - Date``;
    * the time weight is ``exp(-decay_lambda * age_hours)``;
    * ``Rank_Weight`` is the sum of the reciprocal descending ranks
      (ties share the highest rank number, ``method='max'``) of
      ``Followers``, ``Friends``, ``Statuses`` and ``Favorites``;
    * ``Weighted_Sentiment_Exp`` is
      ``Sentiment_Score * Rank_Weight * time weight``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Date``, ``Followers``, ``Friends``, ``Statuses``,
        ``Favorites`` and ``Sentiment_Score``. It is modified in place: the two
        result columns are added (or overwritten).
    current_time : datetime-like
        Reference time the tweet age is measured against; it must be
        subtractable from ``df['Date']``.
    decay_lambda : float, default 0.00001
        Exponential decay rate per hour.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    Intermediate values are kept in local variables rather than scratch
    columns, so existing columns of ``df`` such as ``Time_Diff`` are no longer
    overwritten and dropped. The linear and logarithmic time weights that used
    to be computed here were never used in the result and have been removed.
    """
    rank_features = ['Followers', 'Friends', 'Statuses', 'Favorites']

    # Tweet age in hours relative to the reference time.
    age_hours = (current_time - df['Date']).dt.total_seconds() / 3600
    exp_time_weight = np.exp(-decay_lambda * age_hours)

    # Rank 1 is the largest value, so larger accounts contribute a larger 1/rank.
    inverse_ranks = [1 / df[feature].rank(method='max', ascending=False)
                     for feature in rank_features]
    rank_weight = sum(inverse_ranks[1:], inverse_ranks[0])

    df['Rank_Weight'] = rank_weight
    df['Weighted_Sentiment_Exp'] = df['Sentiment_Score'] * rank_weight * exp_time_weight

    return df


In [None]:
# Preview the first rows of msft_tweets.
msft_tweets.head()

Unnamed: 0,Date,Tweet,Followers,Friends,Statuses,Favorites,Cleaned_Tweet,Sentiment,Sentiment_Score,Time_Diff,...,Linear_Time_Weight,Log_Time_Weight,Rank_Followers,Rank_Friends,Rank_Statuses,Rank_Favorites,Rank_Weight,Weighted_Sentiment_Exp,Weighted_Sentiment_Linear,Weighted_Sentiment_Log
0,2014-01-01 00:00:48,Dow #Stocks Trend $AXP $UTX $CSCO $KO $HD $DIS...,545,7,10133,0,dow stock trend axp utx csco ko hd di v ibm dd...,Neutral,0.999833,19703.986667,...,0.80296,9.888627,1562.0,2711.0,2024.0,3625.0,0.001779,0.001461,0.001428,0.017589
1,2014-01-01 14:49:40,RT @ACInvestorBlog: Stocks to Watch for Januar...,1154,1998,3133,7,rt acinvestorblog stock watch januari gure irm...,Neutral,0.999988,19689.172222,...,0.803108,9.887875,959.0,286.0,2594.0,1680.0,0.00552,0.004533,0.004433,0.05458
2,2014-01-01 17:36:01,@BenedictEvans shocking that $MSFT is missing ...,78,132,1557,124,benedictevan shock msft miss opportun,Negative,0.994826,19686.399722,...,0.803136,9.887734,2623.0,1850.0,2834.0,1025.0,0.00225,0.001839,0.001798,0.022135
3,2014-01-02 06:28:12,Corporate 'Miserliness' In The United States: ...,50,0,30189,0,corpor miserli unit state case microsoft msft ...,Neutral,0.999983,19673.53,...,0.803265,9.88708,2795.0,3625.0,1222.0,3625.0,0.001728,0.001419,0.001388,0.017083
4,2014-01-02 09:48:35,Microsoft Corporation : Microsoft Assigned Pat...,924,1,76124,0,microsoft corpor microsoft assign patent msft,Neutral,0.999989,19670.190278,...,0.803298,9.88691,1137.0,3263.0,659.0,3625.0,0.002979,0.002447,0.002393,0.029456


In [None]:
# Intermediate RBWTD columns that are not needed downstream
columns_to_drop = ['Time_Diff', 'Exp_Time_Weight', 'Linear_Time_Weight', 'Log_Time_Weight',
                   'Rank_Followers', 'Rank_Friends', 'Rank_Statuses', 'Rank_Favorites',
                   'Rank_Weight', 'Weighted_Sentiment_Linear', 'Weighted_Sentiment_Log']

# errors='ignore' keeps this cell re-runnable: columns that are already gone
# (from a previous run, or because they were never created) no longer raise
# a KeyError.
msft_tweets = msft_tweets.drop(columns=columns_to_drop, errors='ignore')

Unnamed: 0_level_0,Tweet,Followers,Friends,Statuses,Favorites,Cleaned_Tweet,Sentiment,Sentiment_Score,Time_Diff,Weighted_Sentiment_Exp
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2013-12-31 23:10:08,RT @philstockworld: Summary of Yesterday's Web...,6,105,19,3,rt philstockworld summari yesterday webcast fe...,Neutral,0.999991,19704.831111,0.000208
2014-01-01 01:18:36,RT @philstockworld: Summary of Yesterday's Web...,6,85,11,1,rt philstockworld summari yesterday webcast fe...,Neutral,0.999991,19702.69,0.000199
2014-01-01 01:52:31,#iPhone users are more intelligent than #Samsu...,17449,256,43179,1,iphon user intellig samsung blackberri htc own...,Neutral,0.997922,19702.124722,0.001603
2014-01-01 03:29:29,RT @philstockworld: Summary of Yesterday's Web...,3,69,11,1,rt philstockworld summari yesterday webcast fe...,Neutral,0.999991,19700.508611,0.000194
2014-01-01 03:59:03,RT @philstockworld: Summary of Yesterday's Web...,2,56,4,0,rt philstockworld summari yesterday webcast fe...,Neutral,0.999991,19700.015833,0.000173


In [None]:
# Count how many tweets fall under each sentiment label.
sentiment_counts = msft_tweets['Sentiment'].value_counts()
print("Sentiment Counts:\n", sentiment_counts)

Sentiment Counts:
 Sentiment
Neutral     17927
Positive     2405
Negative      582
Name: count, dtype: int64


In [None]:
# Specify the path in your Google Drive where you want to save the file
# NOTE(review): this absolute path assumes Google Drive is mounted at
# /content/drive in Colab — consider a configurable data directory.
file_path = '/content/drive/MyDrive/Colab Notebooks/Sentiment Price/Experiment/msft_finbert.csv'

# Save the DataFrame to the specified path
# NOTE(review): index=False does not write the index. If 'Date' is the index
# of msft_tweets at this point, the dates will be missing from the CSV —
# confirm against the frame's state when this cell runs.
msft_tweets.to_csv(file_path, index=False)

print(f"DataFrame saved to {file_path}")

DataFrame saved to /content/drive/MyDrive/Colab Notebooks/Sentiment Price/Experiment/msft_finbert.csv


**5. Daily Sentiment Aggregation, Price Merge, and Feature Preparation**

In [None]:
def aggregate_daily_sentiment(msft_tweets):
    """Collapse tweet-level sentiment into one row per calendar date.

    The aggregation runs in two stages:

    1. Rows are grouped by their exact ``Date`` value. Each group gets the
       sentiment code 1 if any row has ``Sentiment == 1``, otherwise 2 if any
       row has ``Sentiment == 2``, otherwise 0, plus the group means of
       ``Sentiment_Score`` and ``Weighted_Sentiment_Exp``.
    2. ``Date`` is truncated to a calendar date and, per date, the first
       stage-1 row carrying that date's largest sentiment code is kept.

    Parameters
    ----------
    msft_tweets : pandas.DataFrame
        Needs ``Date``, ``Sentiment``, ``Sentiment_Score`` and
        ``Weighted_Sentiment_Exp``. The code tests ``Sentiment`` for
        membership of 1 and 2, so it presumably expects integer codes rather
        than the text labels — confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (``datetime.date``), ``Sentiment``,
        ``Sentiment_Score`` and ``Weighted_Sentiment_Exp``; one row per date,
        ordered by date, with a fresh integer index.

    Notes
    -----
    NOTE(review): stage 1 prefers code 1 over 2, while stage 2 keeps the
    maximum code (2 over 1); and when ``Date`` holds full timestamps the
    means are taken per timestamp, not per day. Both behaviours are kept
    as they were — confirm that they are intended.
    """
    columns = ['Date', 'Sentiment', 'Sentiment_Score', 'Weighted_Sentiment_Exp']

    records = []
    for timestamp, group in msft_tweets.groupby('Date'):
        labels = group['Sentiment'].values
        if 1 in labels:
            sentiment = 1
        elif 2 in labels:
            sentiment = 2
        else:
            sentiment = 0
        records.append([timestamp,
                        sentiment,
                        group['Sentiment_Score'].mean(),
                        group['Weighted_Sentiment_Exp'].mean()])

    per_timestamp = pd.DataFrame(records, columns=columns)
    per_timestamp['Date'] = pd.to_datetime(per_timestamp['Date']).dt.date

    # Keep, for each date, the first row that carries that date's maximum
    # sentiment code. transform + drop_duplicates replaces groupby.apply,
    # whose handling of the grouping column is deprecated and would otherwise
    # drop 'Date' from the result on newer pandas versions.
    day_max = per_timestamp.groupby('Date')['Sentiment'].transform('max')
    winners = per_timestamp[per_timestamp['Sentiment'] == day_max]
    return winners.drop_duplicates(subset='Date', keep='first').reset_index(drop=True)

# ========== 2. Merge with Price Data ==========
def merge_price_sentiment(msft_price, sentiment_df):
    """Left-join daily sentiment onto the price rows by calendar date.

    The join key is derived from the index of ``msft_price`` (converted to
    ``datetime.date``) and matched against ``sentiment_df['Date']``. The key
    column is removed from the result, and ``msft_price`` itself is left
    unmodified. Price rows without a sentiment match keep missing values in
    the sentiment columns.
    """
    trading_dates = pd.to_datetime(msft_price.index).date
    prices_with_key = msft_price.assign(Date=trading_dates)
    joined = prices_with_key.merge(sentiment_df, how='left', on='Date')
    return joined.drop(columns=['Date'])

# ========== 3. Scale and Reduce Features ==========
def prepare_features(df, label_col='signal', n_components=10, seq_len=30):
    """Standardise the feature columns and cut them into sliding windows.

    Every column except ``label_col`` is scaled with ``StandardScaler``.
    Window ``i`` holds the scaled rows ``i .. i + seq_len - 1`` and is paired
    with the label of row ``i + seq_len``, i.e. the row right after the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the label column.
    label_col : str, default 'signal'
        Name of the label column.
    n_components : int, default 10
        Currently unused; kept so existing calls keep working.
        TODO(review): no dimensionality reduction is applied here even though
        the section comment says "Scale and Reduce" — confirm whether PCA was
        intended.
    seq_len : int, default 30
        Number of consecutive rows per window.

    Returns
    -------
    tuple of numpy.ndarray
        ``X_seq`` with shape ``(len(df) - seq_len, seq_len, n_features)`` and
        ``y_seq`` with shape ``(len(df) - seq_len,)``. When ``df`` has no more
        than ``seq_len`` rows both arrays have zero windows (``X_seq`` keeps
        the trailing ``(seq_len, n_features)`` dimensions).

    Notes
    -----
    NOTE(review): the scaler is fitted on all rows passed in, so any later
    train/test split sees statistics from the full frame.
    """
    X = df.drop(columns=[label_col])
    y = df[label_col].values

    scaler = StandardScaler()
    X_scaled = np.asarray(scaler.fit_transform(X))

    # Convert to sequences for RNN input. The previous loop iterated over and
    # sliced the (empty) output list instead of X_scaled, so it never produced
    # any window.
    n_windows = len(X_scaled) - seq_len
    if n_windows <= 0:
        return np.empty((0, seq_len, X_scaled.shape[1])), np.array(y[:0])

    X_seq = np.stack([X_scaled[start:start + seq_len] for start in range(n_windows)])
    y_seq = np.array(y[seq_len:])
    return X_seq, y_seq