In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential

# Make the project's src/ directory importable so the custom modules below resolve.
# The '../src' path is resolved against the kernel's current working directory.
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from preprocessing import TweetCleaner
from data_loader import DataLoader

# Plot styling: seaborn's 'whitegrid' theme, with figures rendered inline in the notebook
sns.set_style('whitegrid')
%matplotlib inline

**Load and Clean data**

In [2]:
# Sanity check: read the raw VIX file and list the column headers it actually contains
vix_test = pd.read_csv('../data/raw/vix_data.csv')
print("Columns found:", list(vix_test.columns))

Columns found: ['DATE', 'OPEN', 'HIGH', 'LOW', 'CLOSE']


In [3]:
# 1. Initialize the loader with the raw tweet and VIX file locations
loader = DataLoader(
    tweets_path='../data/raw/trump_tweets.csv',
    vix_path='../data/raw/vix_data.csv'
)

# 2. Load and clean tweets (this takes a moment)
cleaner = TweetCleaner()
raw_tweets = pd.read_csv(loader.tweets_path)
clean_tweets = cleaner.preprocess_dataframe(raw_tweets, text_column='content')

# 3. Merge with VIX (the date-alignment logic lives in src/data_loader.py).
# merge_data appears to assign df_tweets['date'] on the frame it receives, so
# hand it an explicit copy: clean_tweets stays untouched and re-running this
# cell is idempotent.
# NOTE(review): this may also silence the SettingWithCopyWarning if it is caused
# by clean_tweets being a slice of raw_tweets -- confirm against data_loader.py.
df = loader.merge_data(clean_tweets.copy())

# Fail fast if the merge did not produce the columns the feature-engineering
# step groups and aggregates on.
required_cols = {'date', 'tweet_length', 'Close', 'target_value'}
missing_cols = required_cols - set(df.columns)
assert not missing_cols, f"Merged data is missing columns: {sorted(missing_cols)}"

print(f"✅ Data Ready! Final Dataset shape: {df.shape}")
df.head(3)

Cleaning 43352 tweets...
Loading VIX from: ../data/raw/vix_data.csv
Merging datasets...
Merge Complete. Shape: (33669, 13)
✅ Data Ready! Final Dataset shape: (33669, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets['date'] = pd.to_datetime(pd.to_datetime(df_tweets['date']).dt.date)


Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags,cleaned_text,tweet_length,Date,Close,target_value
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04,510,917,,,be sure to tune in and watch donald trump on l...,116,2009-05-04,34.53,33.36
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-04,34,267,,,donald trump will be appearing on the view tom...,130,2009-05-04,34.53,33.36
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08,13,19,,,donald trump reads top ten financial tips on l...,86,2009-05-08,32.05,32.87


**Feature Engineering (Preparing X and y)**
For the "Baseline" model, we will use only simple numerical features (average tweet length, daily tweet count, and today's VIX close) instead of sentiment scores or complex text embeddings, to prove the concept first.

In [4]:
# Collapse the tweet-level frame to ONE row per day.
# Named aggregation produces flat, self-describing column names directly, so no
# manual flattening of a two-level header is needed. groupby sorts by its key,
# so rows come out ordered by 'date'.
daily_data = (
    df.groupby('date')
      .agg(
          avg_len=('tweet_length', 'mean'),        # average tweet length that day
          tweet_count=('tweet_length', 'count'),   # number of tweets that day
          vix_today=('Close', 'first'),            # VIX close today
          vix_tomorrow=('target_value', 'first'),  # VIX close tomorrow (target)
      )
      .reset_index()
      # A day without both VIX values cannot be labelled: NaN > x evaluates to
      # False, which would silently record the day as a "decrease".
      .dropna(subset=['vix_today', 'vix_tomorrow'])
      .reset_index(drop=True)
)

# Define Features (X) and Target (y)
X = daily_data[['avg_len', 'tweet_count', 'vix_today']].to_numpy()
# 1 if volatility rises tomorrow, 0 otherwise (an unchanged VIX counts as 0)
y = (daily_data['vix_tomorrow'] > daily_data['vix_today']).astype(int)

# Chronological split (first 80% train, last 20% test).
# No shuffle because this is a time series: the test set must lie in the future.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Standardize features to zero mean / unit variance (StandardScaler is NOT 0-1
# scaling). Fit on the training split only so test statistics do not leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features Prepared.")

Features Prepared.


**Model 1 - The Baseline (Simple Dense Network)**

In [5]:
def build_baseline_model(input_dim=None):
    """Build and compile the baseline binary classifier.

    Architecture: Dense(16, relu) -> Dense(8, relu) -> Dense(1, sigmoid),
    compiled with the Adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    Args:
        input_dim: Number of input features. When omitted, falls back to the
            column count of the notebook-level ``X_train`` so existing
            zero-argument calls keep working.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = Sequential([
        # Declare the input with an explicit Input layer; passing `input_shape=`
        # to the first Dense layer triggers a warning in newer Keras versions.
        Input(shape=(input_dim,)),
        Dense(16, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary output (Up/Down)
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train the baseline network silently; validation_split reserves 20% of the
# training rows for per-epoch validation.
model_1 = build_baseline_model()
history_1 = model_1.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    validation_split=0.2,
    verbose=0,
)

# Index 1 of the evaluate() result is the value reported as accuracy
test_metrics_1 = model_1.evaluate(X_test_scaled, y_test)
print(f"Model 1 Accuracy: {test_metrics_1[1]:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4919 - loss: 0.8979 
Model 1 Accuracy: 0.4919


**Model 2 - The "Deep" Model (More Layers)**

In [6]:
def build_deep_model(input_dim=None):
    """Build and compile the deeper binary classifier.

    Architecture: Dense(64, relu) -> Dense(32, relu) -> Dense(16, relu)
    -> Dense(1, sigmoid), compiled with the Adam optimizer, binary
    cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features. When omitted, falls back to the
            column count of the notebook-level ``X_train`` so existing
            zero-argument calls keep working.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = Sequential([
        # Declare the input with an explicit Input layer; passing `input_shape=`
        # to the first Dense layer triggers a warning in newer Keras versions.
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary output (Up/Down)
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train the deeper network silently; validation_split reserves 20% of the
# training rows for per-epoch validation.
model_2 = build_deep_model()
history_2 = model_2.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    validation_split=0.2,
    verbose=0,
)

# Index 1 of the evaluate() result is the value reported as accuracy
test_metrics_2 = model_2.evaluate(X_test_scaled, y_test)
print(f"Model 2 Accuracy: {test_metrics_2[1]:.4f}")

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4959 - loss: 2.0511 
Model 2 Accuracy: 0.4959


**Model 3 - Regularized (To Prevent Overfitting)**

In [7]:
def build_regularized_model(input_dim=None):
    """Build and compile the dropout-regularized binary classifier.

    Architecture: Dense(64, relu) -> Dropout(0.3) -> Dense(32, relu)
    -> Dropout(0.2) -> Dense(1, sigmoid), compiled with the Adam optimizer,
    binary cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features. When omitted, falls back to the
            column count of the notebook-level ``X_train`` so existing
            zero-argument calls keep working.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = Sequential([
        # Declare the input with an explicit Input layer; passing `input_shape=`
        # to the first Dense layer triggers a warning in newer Keras versions.
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dropout(0.3),  # Drop 30% of activations during training to curb memorization
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')  # Binary output (Up/Down)
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train the regularized network silently; validation_split reserves 20% of the
# training rows for per-epoch validation.
model_3 = build_regularized_model()
history_3 = model_3.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    validation_split=0.2,
    verbose=0,
)

# Index 1 of the evaluate() result is the value reported as accuracy
test_metrics_3 = model_3.evaluate(X_test_scaled, y_test)
print(f"Model 3 Accuracy: {test_metrics_3[1]:.4f}")

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4858 - loss: 0.7150 
Model 3 Accuracy: 0.4858


**Plot comparison**

In [None]:
# Overlay each model's per-epoch validation accuracy on a single set of axes
fig, ax = plt.subplots(figsize=(10, 6))
for label, run in (
    ('Baseline', history_1),
    ('Deep Model', history_2),
    ('Regularized', history_3),
):
    ax.plot(run.history['val_accuracy'], label=label)
ax.set_title('Model Comparison: Validation Accuracy over Epochs')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()