In [None]:
!pip install optuna torch tqdm scikit-learn


In [None]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


In [None]:
# --- Data preparation: load, validate, split and standardise -----------------
df = pd.read_csv("/content/drive/MyDrive/FYP/cleaned_final.csv")

# Fail fast when the CSV lacks a column the model needs; the first absent
# column (in `required_columns` order) is the one reported.
required_columns = ["Sentiment_Score", "reddit vader sentiment", "Options % Spike"]
absent = [name for name in required_columns if name not in df.columns]
if absent:
    c = absent[0]
    raise ValueError(f"Missing {c} in CSV")

# Keep only the rows where every required column has a value.
df = df.dropna(subset=required_columns)

# Inputs: news sentiment only. Add further column names to this list to
# widen the model input.
feature_cols = ["Sentiment_Score"]
X = df[feature_cols].to_numpy().astype(np.float32)

# Two regression targets per row, in this column order:
#   y[:, 0] = 'reddit vader sentiment'
#   y[:, 1] = 'Options % Spike'
y = df[["reddit vader sentiment", "Options % Spike"]].to_numpy().astype(np.float32)

# 80 % train; the held-out 20 % is then halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Standardise the inputs using statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# The targets are left unscaled. To scale them as well, fit a second scaler
# (e.g. sklearn's MinMaxScaler) on y_train and apply it to y_val / y_test.


In [None]:
class MultiOutputDataset(Dataset):
    """Pairs each input row with its target row for use with a DataLoader.

    `X` and `y` are stored as given; both must support `len()` / integer
    indexing, and item `i` of the dataset is `(X[i], y[i])`.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The dataset size is taken from the inputs alone.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        targets = self.y[idx]
        return features, targets

def _to_float_tensor(array):
    """Copy an array-like into a float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Tensor copies of every split.
X_train_t, y_train_t = _to_float_tensor(X_train), _to_float_tensor(y_train)
X_val_t, y_val_t = _to_float_tensor(X_val), _to_float_tensor(y_val)
X_test_t, y_test_t = _to_float_tensor(X_test), _to_float_tensor(y_test)

# One dataset per split.
train_dataset = MultiOutputDataset(X_train_t, y_train_t)
val_dataset = MultiOutputDataset(X_val_t, y_val_t)
test_dataset = MultiOutputDataset(X_test_t, y_test_t)

batch_size = 256

# Only the training batches are shuffled; validation and test keep row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (val_dataset, test_dataset)
)


In [None]:
class MultiOutputModel(nn.Module):
    """Feed-forward regressor that emits two values per input row.

    The network is `num_layers` hidden blocks of
    Linear -> BatchNorm1d -> ReLU -> Dropout, each `hidden_dim` wide,
    followed by a Linear head with 2 outputs.

    Args:
        input_dim: number of input features.
        hidden_dim: width of every hidden block.
        num_layers: number of hidden blocks (0 gives a single Linear layer).
        dropout: dropout probability used in every hidden block.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        # Width entering each stage: input_dim first, hidden_dim afterwards.
        widths = [input_dim] + [hidden_dim] * num_layers
        stack = []
        for fan_in in widths[:-1]:
            stack += [
                nn.Linear(fan_in, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        # Output head: two regression targets.
        stack.append(nn.Linear(widths[-1], 2))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        return self.net(x)


In [None]:
# Plain mean-squared-error loss (default reduction), so both model outputs
# contribute to a single averaged value.
# NOTE(review): the tuning and training cells visible below build their own
# nn.MSELoss() as `criterion`; `loss_fn` itself is not referenced by them.
loss_fn = nn.MSELoss()
# Alternative: a custom weighted approach,
# e.g. a weighted MSE or the sum of a separate MSE for each output.


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

def create_model(trial, input_dim):
    """Sample an architecture from `trial` and build the matching network.

    Search space (sampled in this order):
        hidden_dim: 32..256 in steps of 32
        num_layers: 1..4
        dropout:    0.0..0.5 in steps of 0.1

    Returns the freshly constructed MultiOutputModel; the caller is
    responsible for moving it to a device.
    """
    return MultiOutputModel(
        input_dim=input_dim,
        hidden_dim=trial.suggest_int("hidden_dim", 32, 256, step=32),
        num_layers=trial.suggest_int("num_layers", 1, 4),
        dropout=trial.suggest_float("dropout", 0.0, 0.5, step=0.1),
    )

def objective(trial):
    """Optuna objective: briefly train a sampled model, return validation MSE.

    The architecture comes from `create_model`; learning rate and weight
    decay are sampled here. After every epoch the sample-weighted mean
    validation loss is reported to the trial so the study's pruner can stop
    the trial early (raising `optuna.TrialPruned`). The value returned is the
    validation loss of the last epoch.
    """
    net = create_model(trial, input_dim=X_train.shape[1]).to(device)

    # Optimiser settings are sampled after the architecture parameters.
    learning_rate = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)

    opt = optim.AdamW(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    mse_loss = nn.MSELoss()

    # Short budget per trial; raise for a more faithful (but slower) search.
    EPOCHS = 15

    for epoch in range(EPOCHS):
        # ---- training pass ----
        net.train()
        train_loss = 0.0
        for features, targets in train_loader:
            features = features.to(device)
            targets = targets.to(device)
            opt.zero_grad()
            batch_loss = mse_loss(net(features), targets)
            batch_loss.backward()
            opt.step()
            # Sample-weighted running total; accumulated but not reported.
            train_loss += batch_loss.item() * features.size(0)

        # ---- validation pass ----
        net.eval()
        val_total = 0.0
        with torch.no_grad():
            for features, targets in val_loader:
                features = features.to(device)
                targets = targets.to(device)
                val_total += mse_loss(net(features), targets).item() * features.size(0)

        average_val_loss = val_total / len(val_dataset)

        # Hand the intermediate value to Optuna and honour a prune request.
        trial.report(average_val_loss, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return average_val_loss

# ----------------------------------------------------------------------
# Hyper-parameter search: minimise the validation loss returned by `objective`.
N_TRIALS = 30  # raise for a more thorough search
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=N_TRIALS)

finished_trials = len(study.trials)
print("Number of finished trials:", finished_trials)
print("Best trial:", study.best_trial.params)


In [None]:
best_params = study.best_trial.params
print("Best hyperparameters found by Optuna:", best_params)

# Fresh network with the winning architecture.
best_model = MultiOutputModel(
    input_dim=X_train.shape[1],
    hidden_dim=best_params["hidden_dim"],
    num_layers=best_params["num_layers"],
    dropout=best_params["dropout"],
).to(device)

criterion = nn.MSELoss()
optimizer = optim.AdamW(
    best_model.parameters(),
    lr=best_params["lr"],
    weight_decay=best_params["weight_decay"],
)

# The final fit uses train + validation rows together (both already scaled).
X_trainval = np.concatenate((X_train, X_val))
y_trainval = np.concatenate((y_train, y_val))
trainval_dataset = MultiOutputDataset(
    torch.tensor(X_trainval, dtype=torch.float32),
    torch.tensor(y_trainval, dtype=torch.float32),
)
trainval_loader = DataLoader(trainval_dataset, batch_size=batch_size, shuffle=True)

# ---- final training ----
EPOCHS_FINAL = 200
for epoch in range(1, EPOCHS_FINAL + 1):
    best_model.train()
    train_loss = 0.0
    for bx, by in trainval_loader:
        bx = bx.to(device)
        by = by.to(device)
        optimizer.zero_grad()
        loss = criterion(best_model(bx), by)
        loss.backward()
        optimizer.step()
        # Weight each batch by its size so the epoch figure is a per-row mean.
        train_loss += loss.item() * bx.size(0)
    print(f"Epoch {epoch}: TrainLoss={(train_loss/len(trainval_dataset)):.4f}")

# ---- held-out test evaluation ----
best_model.eval()
test_loss = 0.0
preds_list, truth_list = [], []
with torch.no_grad():
    for tx, ty in test_loader:
        outputs = best_model(tx.to(device))
        preds_list.append(outputs.cpu().numpy())
        truth_list.append(ty.numpy())

# Both arrays end up as [num_samples, 2].
preds_arr = np.concatenate(preds_list)
truth_arr = np.concatenate(truth_list)

# Joint MSE over both outputs.
mse = np.mean(np.square(preds_arr - truth_arr))
print(f"Final Test MSE: {mse:.4f}")

# Pearson correlation between prediction and truth, per output column.
from scipy.stats import pearsonr
corr_reddit, corr_options = (
    pearsonr(preds_arr[:, column], truth_arr[:, column])[0] for column in (0, 1)
)
print(f"Correlation for Reddit Sentiment: {corr_reddit:.4f}")
print(f"Correlation for Options Spike:    {corr_options:.4f}")


In [None]:
import pandas as pd
import yfinance as yf

def fetch_trading_data(ticker, start_date, end_date):
    """
    Download price history for one ticker and reduce it to closing prices.

    Args:
        ticker (str): The stock ticker symbol.
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.

    Returns:
        pd.DataFrame: Columns 'Date', 'ticker', 'close_today' and
        'close_prev_day' (the close of the preceding row). Any exception
        raised along the way is printed and an empty frame with those four
        columns is returned instead.
    """
    output_columns = ['Date', 'ticker', 'close_today', 'close_prev_day']
    try:
        print(f"Fetching trading data for ticker: {ticker}, Range: {start_date} to {end_date}")

        # Download, then turn the date index into an ordinary 'Date' column.
        prices = yf.download(ticker, start=start_date, end=end_date).reset_index()

        # Tuple (MultiIndex) column labels are reduced to their first element.
        prices.columns = [
            label[0] if isinstance(label, tuple) else label
            for label in prices.columns
        ]

        prices['Date'] = pd.to_datetime(prices['Date'])

        # Previous row's close, i.e. the prior trading day in the download.
        prices['close_prev_day'] = prices['Close'].shift(1)
        prices = prices.rename(columns={'Close': 'close_today'})

        # The ticker column lets callers merge on (date, ticker).
        prices['ticker'] = ticker

        return prices[output_columns]
    except Exception as e:
        print(f"Error fetching trading data for {ticker}: {e}")
        return pd.DataFrame(columns=output_columns)

def process_ticker_data(ticker, cleaned_final, start_date, end_date):
    """
    Fetch trading data for a single ticker and left-merge it onto the dataset.

    Relies on the `fetch_trading_data` variant that returns the columns
    'Date', 'ticker', 'close_today' and 'close_prev_day'.

    Args:
        ticker (str): The stock ticker symbol.
        cleaned_final (pd.DataFrame): Original dataset with 'ticker' and 'Formatted_Date' columns.
        start_date (str): Start date for fetching data.
        end_date (str): End date for fetching data.

    Returns:
        pd.DataFrame: A new frame with every row of `cleaned_final` plus
        'close_today' and 'close_prev_day' (NaN where no price row matches).

    Raises:
        KeyError: If 'ticker' or 'Formatted_Date' is missing from `cleaned_final`.
    """
    # Validate both merge keys up front so a bad input fails before any
    # network request is made.
    for required in ('ticker', 'Formatted_Date'):
        if required not in cleaned_final.columns:
            raise KeyError(f"'{required}' column is missing in the input DataFrame.")

    # Fetch trading data for the ticker
    stock_data = fetch_trading_data(ticker, start_date, end_date)

    # Debugging: show the structure of both DataFrames.
    # DataFrame.info() prints by itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    print(f"Structure of cleaned_final before merge for ticker {ticker}:")
    cleaned_final.info()
    print(cleaned_final.head())

    print(f"Structure of stock_data for ticker {ticker}:")
    stock_data.info()
    print(stock_data.head())

    # Left merge keeps every original row, matched on (date, ticker).
    merged = pd.merge(
        cleaned_final,
        stock_data,
        left_on=['Formatted_Date', 'ticker'],
        right_on=['Date', 'ticker'],
        how='left'
    )

    # 'Date' duplicates 'Formatted_Date' after the merge.
    return merged.drop(columns=['Date'])

# Example usage
start_date = "2010-01-01"
end_date = "2025-01-01"

# Replace this with your actual cleaned_final DataFrame
# Ensure 'Formatted_Date' is in datetime format and 'ticker' is present
cleaned_final = pd.DataFrame({
    'Formatted_Date': pd.to_datetime(['2023-01-03', '2023-01-04', '2023-01-05']),
    'ticker': ['AAPL', 'AAPL', 'AAPL']
})

# Process a specific ticker
ticker = "AAPL"
try:
    cleaned_final = process_ticker_data(ticker, cleaned_final, start_date, end_date)
except KeyError as e:
    print(f"Error: {e}")

# Debugging: inspect the final merged DataFrame.
# DataFrame.info() prints its report itself and returns None, so calling it
# inside print() would add a stray "None" line to the output.
print("Final cleaned_final after merge:")
cleaned_final.info()
print(cleaned_final.head())


In [None]:

# Ensure the Formatted_Date column is in datetime format
cleaned_final['Formatted_Date'] = pd.to_datetime(cleaned_final['Formatted_Date'], format='mixed')

# Get the unique tickers and the download window.
unique_tickers = cleaned_final['ticker'].unique()

# The window is padded on both sides:
#   * start: one week earlier, so the earliest dataset date still has a
#     preceding trading day (across weekends/holidays) to take the previous
#     close from;
#   * end: one day later, because yfinance treats `end` as exclusive and would
#     otherwise drop the latest dataset date.
# Extra price rows are harmless: the later left merge only keeps dataset dates.
start_date = cleaned_final['Formatted_Date'].min() - pd.Timedelta(days=7)
end_date = cleaned_final['Formatted_Date'].max() + pd.Timedelta(days=1)

# Define a function to fetch trading data
def fetch_trading_data(ticker, start_date, end_date):
    """Download closing prices for one ticker.

    Args:
        ticker: The stock ticker symbol.
        start_date: First date to request.
        end_date: End of the request window, passed straight to yfinance.

    Returns:
        pd.DataFrame with exactly the columns 'Date' (datetime) and 'Close'.
        An empty frame with those columns is returned when nothing was
        downloaded or when any exception occurs (the error is printed).
    """
    try:
        print(f"Fetching trading data for ticker: {ticker}, Range: {start_date} to {end_date}")
        stock_data = yf.download(ticker, start=start_date, end=end_date)
        if not stock_data.empty:
            # yfinance may return two-level columns (field, ticker) even for a
            # single ticker; keep only the field level so 'Close' is a plain
            # column, as the other fetch helper in this notebook does.
            if isinstance(stock_data.columns, pd.MultiIndex):
                stock_data.columns = stock_data.columns.get_level_values(0)
            stock_data = stock_data.reset_index()
            stock_data['Date'] = pd.to_datetime(stock_data['Date'])
            # Return a copy so callers can add columns without pandas
            # chained-assignment warnings.
            return stock_data[['Date', 'Close']].copy()
        else:
            print(f"No trading data found for {ticker} in the specified range.")
            return pd.DataFrame(columns=['Date', 'Close'])
    except Exception as e:
        # Best effort by design: one failing ticker must not stop the loop.
        print(f"Error fetching trading data for {ticker}: {e}")
        return pd.DataFrame(columns=['Date', 'Close'])

# Fetch trading data for all tickers, collecting one frame per ticker and
# concatenating once at the end (repeated concat inside the loop is quadratic).
ticker_frames = []
for ticker in unique_tickers:
    ticker_data = fetch_trading_data(ticker, start_date, end_date)
    if not ticker_data.empty:
        ticker_frames.append(ticker_data.assign(ticker=ticker))

if ticker_frames:
    all_trading_data = pd.concat(ticker_frames, ignore_index=True)
else:
    # Nothing could be downloaded: keep the expected schema so the steps
    # below still run and simply produce empty price columns.
    all_trading_data = pd.DataFrame(columns=['Date', 'Close', 'ticker'])

# Rename for clarity, order by ticker then date, and renumber the rows so the
# index matches the sorted order.
all_trading_data = (
    all_trading_data
    .rename(columns={'Date': 'Formatted_Date', 'Close': 'close_today'})
    .sort_values(by=['ticker', 'Formatted_Date'])
    .reset_index(drop=True)
)

# Previous trading day's close within each ticker. The shifted Series keeps
# the frame's own index, so the assignment lines up row by row; resetting the
# index of the Series alone (as was done before) misaligned the values
# whenever sorting had reordered the rows.
all_trading_data['close_prev_day'] = (
    all_trading_data.groupby('ticker')['close_today'].shift(1)
)

# Merge the trading data back into the cleaned_final DataFrame
merged_data = pd.merge(
    cleaned_final,
    all_trading_data,
    on=['ticker', 'Formatted_Date'],
    how='left'
)

# Save the merged dataset
merged_data.to_csv("cleaned_final_with_closing_prices.csv", index=False)
print("Data successfully processed and saved to 'cleaned_final_with_closing_prices.csv'.")

In [None]:
import pandas as pd
import yfinance as yf

# Load the existing dataset
cleaned_final = pd.read_csv("/content/drive/MyDrive/FYP/cleaned_final.csv")

# Ensure the date column is properly formatted
cleaned_final['Formatted_Date'] = pd.to_datetime(cleaned_final['Formatted_Date'], format='mixed')

# Filter for Apple (AAPL) only
aapl_data = cleaned_final[cleaned_final['ticker'] == 'AAPL'].copy()

# Date range covered by the AAPL rows
start_date = aapl_data['Formatted_Date'].min()
end_date = aapl_data['Formatted_Date'].max()

# Download data from Yahoo Finance with a padded window:
#   * one week before the first date, so that date has a previous close;
#   * one day after the last date, because yfinance's `end` is exclusive.
aapl_stock_data = yf.download(
    "AAPL",
    start=start_date - pd.Timedelta(days=7),
    end=end_date + pd.Timedelta(days=1),
)
aapl_stock_data.reset_index(inplace=True)

# Flatten MultiIndex columns: ('Close', 'AAPL') -> 'Close_AAPL', ('Date', '')
# -> 'Date'. Plain string labels are kept unchanged (indexing them with [0]
# would reduce e.g. 'Close' to 'C').
aapl_stock_data.columns = [
    (f"{col[0]}_{col[1]}" if col[1] else col[0]) if isinstance(col, tuple) else col
    for col in aapl_stock_data.columns
]

# Debugging: Print column names after flattening
print("Flattened columns in aapl_stock_data:", aapl_stock_data.columns)

# Rename the close column to 'close_today'; both the flattened MultiIndex
# name and the plain single-level name are accepted.
aapl_stock_data.rename(
    columns={'Close_AAPL': 'close_today', 'Close': 'close_today'},
    inplace=True,
)

# Add previous day's close
aapl_stock_data['close_prev_day'] = aapl_stock_data['close_today'].shift(1)

# Debugging: Check the structure of aapl_stock_data after processing
print(aapl_stock_data.head())

# Merge the new data back into aapl_data (left merge keeps every AAPL row)
aapl_data = pd.merge(
    aapl_data,
    aapl_stock_data[['Date', 'close_today', 'close_prev_day']],
    left_on='Formatted_Date',
    right_on='Date',
    how='left'
)

# The extra 'Date' column from the merge is intentionally kept in the output.
#aapl_data.drop(columns=['Date'], inplace=True)

# Save the updated data to a new CSV
output_path = "aapl_with_closing_prices.csv"  # Replace with desired output path
aapl_data.to_csv(output_path, index=False)

print(f"Updated AAPL data saved to {output_path}")


In [None]:
import pandas as pd
import yfinance as yf

# Load the existing dataset
cleaned_final = pd.read_csv("/content/drive/MyDrive/FYP/cleaned_final.csv")

# Ensure the date column is properly formatted
cleaned_final['Formatted_Date'] = pd.to_datetime(cleaned_final['Formatted_Date'], format='mixed')

# Collect processed data from each ticker
processed_data = []

# Get the unique tickers
tickers = cleaned_final['ticker'].unique()

for ticker in tickers:
    print(f"Processing {ticker}...")

    # 1. Filter rows for the current ticker
    ticker_data = cleaned_final[cleaned_final['ticker'] == ticker].copy()

    # 2. Figure out the date range for that ticker
    start_date = ticker_data['Formatted_Date'].min()
    end_date = ticker_data['Formatted_Date'].max()

    # 3. Download historical data from Yahoo Finance. The window is padded:
    #    a week earlier so the first date has a previous close, and a day
    #    later because yfinance's `end` is exclusive.
    stock_data = yf.download(
        ticker,
        start=start_date - pd.Timedelta(days=7),
        end=end_date + pd.Timedelta(days=1),
    )
    stock_data.reset_index(inplace=True)

    # 4. Flatten MultiIndex columns: ('Close', 'AAPL') -> 'Close_AAPL'.
    #    Plain string labels are kept unchanged (indexing them with [0] would
    #    reduce e.g. 'Close' to 'C').
    stock_data.columns = [
        (f"{col[0]}_{col[1]}" if col[1] else col[0]) if isinstance(col, tuple) else col
        for col in stock_data.columns
    ]
    print(f"Flattened columns in {ticker} stock data:", stock_data.columns)

    # 5. Rename the close column to 'close_today'. Accept both the flattened
    #    MultiIndex name ('Close_<ticker>') and the single-level name ('Close').
    old_close_col = f"Close_{ticker}"
    if old_close_col in stock_data.columns:
        stock_data.rename(columns={old_close_col: 'close_today'}, inplace=True)
    elif 'Close' in stock_data.columns:
        stock_data.rename(columns={'Close': 'close_today'}, inplace=True)
    else:
        print(f"Warning: '{old_close_col}' not found. Check whether yfinance data matches the expected column naming.")

    # 6. Add the previous day's close. When no usable price data came back,
    #    substitute an empty, correctly typed frame so the merge below still
    #    works and this ticker's rows simply get NaN prices.
    if 'close_today' in stock_data.columns and 'Date' in stock_data.columns:
        stock_data['close_prev_day'] = stock_data['close_today'].shift(1)
    else:
        stock_data = pd.DataFrame({
            'Date': pd.Series(dtype='datetime64[ns]'),
            'close_today': pd.Series(dtype='float64'),
            'close_prev_day': pd.Series(dtype='float64'),
        })

    # 7. Merge this back into your ticker_data on the matching dates
    ticker_data = pd.merge(
        ticker_data,
        stock_data[['Date', 'close_today', 'close_prev_day']],
        left_on='Formatted_Date',
        right_on='Date',
        how='left'
    )

    # 8. The extra 'Date' column is intentionally kept, matching the
    #    single-ticker cell:
    # ticker_data.drop(columns=['Date'], inplace=True)

    # 9. Accumulate the result
    processed_data.append(ticker_data)

# 10. Concatenate all ticker slices into one final DataFrame
final_data = pd.concat(processed_data, ignore_index=True)

# 11. Save the merged dataset to a single CSV
output_path = "/content/drive/MyDrive/FYP/all_tickers_with_closing_prices.csv"
final_data.to_csv(output_path, index=False)

print(f"Done! Combined data with close/prev_close columns is in: {output_path}")


In [None]:
# Discard every row that has a missing value in any column, then overwrite
# the combined CSV with the reduced frame.
output_path = "/content/drive/MyDrive/FYP/all_tickers_with_closing_prices.csv"
final_data = final_data.dropna(axis=0, how="any")
final_data.to_csv(output_path, index=False)

In [None]:
# Keep only rows that have both closing prices. Working on a filtered copy
# leaves `final_data` untouched, so this cell can be re-run safely.
data = final_data.dropna(subset=['close_today', 'close_prev_day']).copy()

# Momentum = day-over-day percentage change of the closing price.
data['momentum'] = ((data['close_today'] - data['close_prev_day']) / data['close_prev_day']) * 100

# Save where the modelling cells load it from. Writing to the bare relative
# name "data_with_momentum.csv" put the file in the working directory, while
# the later cells read it from the Drive folder below.
output_path = "/content/drive/MyDrive/FYP/data_with_momentum.csv"
data.to_csv(output_path, index=False)

print(f"Momentum calculated and saved to {output_path}")


Redoing the model with both momentum and news sentiment as input features, aiming for a better correlation

In [None]:
!pip install optuna torch tqdm scikit-learn
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


In [None]:
# Read the momentum-augmented dataset.
df = pd.read_csv("/content/drive/MyDrive/FYP/data_with_momentum.csv")

# Stop at the first required column that the file does not provide.
required_cols = ["Sentiment_Score", "reddit vader sentiment", "Options % Spike", "momentum"]
available = set(df.columns)
for col in required_cols:
    if col in available:
        continue
    raise ValueError(f"Missing column: {col}")

# Rows with a gap in any required column are discarded.
df = df.dropna(subset=required_cols)

# Inputs: news sentiment plus price momentum.
feature_cols = ["Sentiment_Score", "momentum"]
X = df.loc[:, feature_cols].to_numpy().astype(np.float32)

# Targets, in column order: [reddit sentiment, options spike].
y = df.loc[:, ["reddit vader sentiment", "Options % Spike"]].to_numpy().astype(np.float32)

# 80 % train; the remaining 20 % is halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Standardise the inputs with statistics learned from the training split.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# The targets stay unscaled here; an output scaler (e.g. MinMaxScaler) could
# be fitted on y_train if scaled targets are wanted.


In [None]:
class MultiOutputDataset(Dataset):
    """Index-aligned (inputs, targets) pairs for a torch DataLoader."""

    def __init__(self, X, y):
        # Stored as passed in; no copy or conversion happens here.
        self.X = X
        self.y = y

    def __len__(self):
        """Number of samples, measured on the inputs."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the `idx`-th input together with the `idx`-th target."""
        return (self.X[idx], self.y[idx])

# float32 tensor copies of every split, in (inputs, targets) pairs.
X_train_t, y_train_t, X_val_t, y_val_t, X_test_t, y_test_t = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train, y_train, X_val, y_val, X_test, y_test)
)

# One dataset per split.
train_dataset, val_dataset, test_dataset = (
    MultiOutputDataset(inputs, targets)
    for inputs, targets in (
        (X_train_t, y_train_t),
        (X_val_t, y_val_t),
        (X_test_t, y_test_t),
    )
)

batch_size = 256

# Shuffle only while training; evaluation loaders keep the row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
class MultiOutputModel(nn.Module):
    """MLP with `num_layers` hidden blocks and a 2-value regression head.

    Every hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout with
    `hidden_dim` units; the first block takes `input_dim` features.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        modules = []
        for depth in range(num_layers):
            # Only the first block sees the raw input width.
            fan_in = input_dim if depth == 0 else hidden_dim
            modules.extend(
                (
                    nn.Linear(fan_in, hidden_dim),
                    nn.BatchNorm1d(hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                )
            )

        # Head: 2 outputs for the 2-target regression. Without hidden blocks
        # it is fed directly from the input.
        head_in = hidden_dim if num_layers > 0 else input_dim
        modules.append(nn.Linear(head_in, 2))

        self.net = nn.Sequential(*modules)

    def forward(self, x):
        return self.net(x)


In [None]:
# Prefer CUDA when it is available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def create_model(trial, input_dim):
    """Build a MultiOutputModel from architecture values sampled by `trial`.

    Sampled, in order: hidden_dim (32..256, step 32), num_layers (1..4) and
    dropout (0.0..0.5, step 0.1). The model is returned already moved to
    `device`.
    """
    sampled = {
        "hidden_dim": trial.suggest_int("hidden_dim", 32, 256, step=32),
        "num_layers": trial.suggest_int("num_layers", 1, 4),
        "dropout": trial.suggest_float("dropout", 0.0, 0.5, step=0.1),
    }
    return MultiOutputModel(input_dim=input_dim, **sampled).to(device)

def objective(trial):
    """Optuna objective: short training run scored by validation MSE.

    Builds a model via `create_model`, samples learning rate and weight decay,
    trains for a fixed number of epochs and reports the sample-weighted mean
    validation loss after each one. Raises `optuna.TrialPruned` when the
    study's pruner asks to stop; otherwise returns the last epoch's value.
    """
    model = create_model(trial, input_dim=len(feature_cols))

    # Optimiser hyper-parameters (sampled after the architecture).
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    wd = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    criterion = nn.MSELoss()

    def _validation_loss():
        """Mean per-row validation loss in eval mode, without gradients."""
        model.eval()
        total = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                total += criterion(model(xb), yb).item() * xb.size(0)
        return total / len(val_dataset)

    EPOCHS = 15  # per-trial budget; can be adjusted
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            step_loss = criterion(model(xb), yb)
            step_loss.backward()
            optimizer.step()
            # Running training total; accumulated but not reported.
            train_loss += step_loss.item() * xb.size(0)

        avg_val_loss = _validation_loss()
        trial.report(avg_val_loss, epoch)

        # Let the pruner end unpromising trials early.
        if trial.should_prune():
            raise optuna.TrialPruned()

    return avg_val_loss

# Search for the hyper-parameters that minimise the validation loss.
N_TRIALS = 30  # increase for a more exhaustive search
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=N_TRIALS)


In [None]:
best_params = study.best_trial.params
print("Best hyperparameters found by Optuna:", best_params)

# Fresh network using the best architecture found by the study.
architecture = {key: best_params[key] for key in ("hidden_dim", "num_layers", "dropout")}
best_model = MultiOutputModel(input_dim=len(feature_cols), **architecture).to(device)

criterion = nn.MSELoss()
optimizer = optim.AdamW(
    best_model.parameters(),
    lr=best_params["lr"],
    weight_decay=best_params["weight_decay"],
)

# Final fit on train + validation rows together (both already scaled).
X_trainval = np.concatenate((X_train, X_val))
y_trainval = np.concatenate((y_train, y_val))

trainval_dataset = MultiOutputDataset(
    torch.tensor(X_trainval, dtype=torch.float32),
    torch.tensor(y_trainval, dtype=torch.float32),
)
trainval_loader = DataLoader(trainval_dataset, batch_size=batch_size, shuffle=True)

EPOCHS_FINAL = 20
for epoch in range(1, EPOCHS_FINAL + 1):
    best_model.train()
    total_loss = 0.0
    for bx, by in trainval_loader:
        bx = bx.to(device)
        by = by.to(device)
        optimizer.zero_grad()
        loss = criterion(best_model(bx), by)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-row mean.
        total_loss += loss.item() * bx.size(0)

    avg_loss = total_loss / len(trainval_dataset)
    print(f"Epoch {epoch}/{EPOCHS_FINAL} - Train Loss: {avg_loss:.4f}")

print("Final training completed.")


In [None]:
# Collect predictions and ground truth for the whole test split.
best_model.eval()
preds_list, truth_list = [], []
with torch.no_grad():
    for tx, ty in test_loader:
        outputs = best_model(tx.to(device))
        preds_list.append(outputs.cpu().numpy())
        truth_list.append(ty.numpy())

# Both arrays have shape [N, 2].
preds_arr = np.concatenate(preds_list)
truth_arr = np.concatenate(truth_list)

# Single MSE over both output columns together.
mse = np.mean(np.square(preds_arr - truth_arr))
print(f"Final Test MSE (joint): {mse:.4f}")

# Pearson correlation between prediction and truth, per output column:
# column 0 is reddit sentiment, column 1 is options spike.
from scipy.stats import pearsonr

corr_reddit, corr_options = (
    pearsonr(preds_arr[:, column], truth_arr[:, column])[0] for column in (0, 1)
)

print(f"Correlation (Reddit sentiment): {corr_reddit:.4f}")
print(f"Correlation (Options spike):    {corr_options:.4f}")


Models split by sector 

In [None]:
# Ticker -> sector lookup used to split the dataset into per-sector subsets.
# Keys are ticker symbols; values are the sector labels written to df['sector'].
# NOTE(review): the map is kept exactly as provided. A few assignments (e.g.
# NEM under Energy, DHR under Materials, DELL under Industrials) may not match
# the standard GICS classification — confirm against the intended sector source.
ticker_to_sector = {
    # Communication Services
    'DIS': 'Communication Services', 'NFLX': 'Communication Services', 'CMCSA': 'Communication Services',
    'TMUS': 'Communication Services', 'GOOGL': 'Communication Services', 'CHTR': 'Communication Services',

    # Consumer Discretionary
    'AMZN': 'Consumer Discretionary', 'TSLA': 'Consumer Discretionary', 'NKE': 'Consumer Discretionary',
    'HD': 'Consumer Discretionary', 'AZO': 'Consumer Discretionary', 'YUM': 'Consumer Discretionary',

    # Consumer Staples
    'WMT': 'Consumer Staples', 'KO': 'Consumer Staples', 'PEP': 'Consumer Staples', 'MDLZ': 'Consumer Staples',
    'PG': 'Consumer Staples', 'KHC': 'Consumer Staples', 'MO': 'Consumer Staples',

    # Energy
    'XOM': 'Energy', 'CVX': 'Energy', 'COP': 'Energy', 'NEM': 'Energy',

    # Financials
    'BAC': 'Financials', 'JPM': 'Financials', 'C': 'Financials', 'MA': 'Financials', 'V': 'Financials',
    'WFC': 'Financials', 'CB': 'Financials',

    # Health Care
    'JNJ': 'Health Care', 'PFE': 'Health Care', 'ABBV': 'Health Care', 'MRK': 'Health Care',
    'LLY': 'Health Care', 'GILD': 'Health Care', 'MDT': 'Health Care', 'UNH': 'Health Care',

    # Industrials
    'GE': 'Industrials', 'BA': 'Industrials', 'UPS': 'Industrials', 'MMM': 'Industrials',
    'DELL': 'Industrials',

    # Information Technology
    'AAPL': 'Information Technology', 'MSFT': 'Information Technology', 'NVDA': 'Information Technology',
    'QCOM': 'Information Technology', 'ADBE': 'Information Technology', 'INTC': 'Information Technology',
    'CSCO': 'Information Technology', 'TXN': 'Information Technology', 'IBM': 'Information Technology',
    'ORCL': 'Information Technology', 'PYPL': 'Information Technology', 'AVGO': 'Information Technology',
    'AMD': 'Information Technology', 'ADP': 'Information Technology', 'INTU': 'Information Technology',

    # Materials
    'LIN': 'Materials', 'DHR': 'Materials',

    # Real Estate
    'PSA': 'Real Estate',

    # Utilities
    'DUK': 'Utilities', 'ED': 'Utilities', 'NEE': 'Utilities', 'PPL': 'Utilities',
}

# Reload the momentum-augmented dataset for the per-sector experiments.
df = pd.read_csv("/content/drive/MyDrive/FYP/data_with_momentum.csv")

# Add 'sector' column; tickers that are not keys of the map get NaN here.
df['sector'] = df['ticker'].map(ticker_to_sector)

# Drop rows lacking any modelling column. Because 'sector' is in the list,
# rows whose ticker is not in the map are removed as well.
required_cols = ['Sentiment_Score','reddit vader sentiment','Options % Spike','momentum','sector']
df.dropna(subset=required_cols, inplace=True)


In [None]:
class MultiOutputDataset(Dataset):
    """Map-style dataset whose item `i` is the pair `(X[i], y[i])`.

    `X` holds the inputs and `y` the matching targets; both are kept exactly
    as passed in.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # Length follows the inputs.
        return len(self.X)

    def __getitem__(self, idx):
        sample, target = self.X[idx], self.y[idx]
        return sample, target


In [None]:
class MultiOutputModel(nn.Module):
    """Two-output MLP regressor with configurable depth and width.

    Defaults: 2 input features, 2 hidden blocks of 64 units, dropout 0.2.
    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout; the last
    layer is a Linear head with 2 outputs.
    """

    @staticmethod
    def _hidden_block(fan_in, fan_out, dropout):
        """Return the four modules of one hidden block, in forward order."""
        return [
            nn.Linear(fan_in, fan_out),
            nn.BatchNorm1d(fan_out),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]

    def __init__(self, input_dim=2, hidden_dim=64, num_layers=2, dropout=0.2):
        super().__init__()
        modules = []
        fan_in = input_dim
        for _ in range(num_layers):
            modules.extend(self._hidden_block(fan_in, hidden_dim, dropout))
            fan_in = hidden_dim
        # Regression head with 2 outputs.
        modules.append(nn.Linear(fan_in, 2))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        return self.net(x)


In [None]:
def train_model_for_sector(df_sector, feature_cols, device, epochs=20, batch_size=256):
    """
    Trains a multi-output model on the data for one sector.

    The sector's rows are split 80/20 into train/validation (fixed seed 42),
    the inputs are standardized with statistics from the training split only,
    and a MultiOutputModel is fitted with MSE loss and AdamW. Train and
    validation MSE are printed after every epoch.

    Args:
        df_sector: DataFrame containing the columns in ``feature_cols`` plus
            the two target columns "reddit vader sentiment" and
            "Options % Spike".
        feature_cols: list of input column names; its length sets the model's
            input dimension.
        device: torch device the model and every batch are moved to.
        epochs: number of passes over the training split.
        batch_size: mini-batch size for both the train and validation loaders.

    Returns:
        (model, scaler): the trained model and the StandardScaler fitted on
        the training split. The validation split is only used for the printed
        metrics and is not returned.
    """

    # Extract input features X and multi-output Y
    X = df_sector[feature_cols].values.astype(np.float32)
    y = df_sector[["reddit vader sentiment","Options % Spike"]].values.astype(np.float32)

    # Train/Val split (80/20). For more robust approach, do 70/15/15 etc.
    X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=42)

    # Scale input features (statistics come from the training split only)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val   = scaler.transform(X_val)

    # Convert to torch
    X_train_t = torch.tensor(X_train,dtype=torch.float32)
    y_train_t = torch.tensor(y_train,dtype=torch.float32)
    X_val_t   = torch.tensor(X_val,  dtype=torch.float32)
    y_val_t   = torch.tensor(y_val,  dtype=torch.float32)

    # Datasets
    train_ds = MultiOutputDataset(X_train_t, y_train_t)
    val_ds   = MultiOutputDataset(X_val_t,   y_val_t)

    # BatchNorm1d cannot compute batch statistics from a single sample while
    # in training mode and raises. If the split size leaves exactly one row
    # for the final batch, drop that batch; because the loader reshuffles each
    # epoch, a different row is left out every time.
    drop_tail = len(train_ds) > 1 and len(train_ds) % batch_size == 1
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=drop_tail)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)

    # Build model (hard-coded hyperparams or define your own logic)
    input_dim= len(feature_cols)
    model = MultiOutputModel(input_dim=input_dim, hidden_dim=128, num_layers=3, dropout=0.3).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)

    # Training loop
    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        seen = 0  # rows actually used this epoch (differs from len(train_ds) when a tail batch is dropped)
        for bx, by in train_loader:
            bx, by = bx.to(device), by.to(device)
            optimizer.zero_grad()
            preds = model(bx)
            loss = criterion(preds, by)
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean; weight it by batch size so the
            # epoch figure is a per-row average.
            total_loss += loss.item() * bx.size(0)
            seen += bx.size(0)
        train_mse = total_loss / seen

        # Validation (eval mode: dropout off, BatchNorm uses running statistics)
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for vx, vy in val_loader:
                vx, vy = vx.to(device), vy.to(device)
                vpreds = model(vx)
                vloss  = criterion(vpreds, vy)
                val_loss += vloss.item() * vx.size(0)
        val_mse = val_loss / len(val_ds)

        print(f"[Epoch {epoch}] TrainMSE={train_mse:.4f}  ValMSE={val_mse:.4f}")

    # Return the fitted model together with the scaler needed to prepare new inputs
    return model, scaler


In [None]:
# Run configuration for the per-sector training sweep.
EPOCHS = 20
BATCH_SIZE = 256

# Model inputs (including momentum)
feature_cols = ["Sentiment_Score", "momentum"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sectors that actually occur in the DataFrame (NaN excluded)
unique_sectors = df['sector'].dropna().unique()

# sector name -> {'model': trained model, 'scaler': fitted scaler}
sector_models = {}

for sec in unique_sectors:
    print(f"\nTraining sector '{sec}'")
    df_sector = df.loc[df['sector'] == sec].copy()

    # Sectors with fewer than 100 rows are not trained.
    if not len(df_sector) >= 100:
        print(f"  Not enough data in sector {sec}, skipping...")
        continue

    model, scaler = train_model_for_sector(
        df_sector, feature_cols, device, epochs=EPOCHS, batch_size=BATCH_SIZE
    )
    sector_models[sec] = dict(model=model, scaler=scaler)

print("\nAll sectors processed. Models stored in sector_models.")


In [None]:
def evaluate_sector_model(df_sector_test, feature_cols, model, scaler, device, batch_size=256):
    """ Evaluate an already-trained sector model on a sector's test set.

    Args:
        df_sector_test: DataFrame with the columns in ``feature_cols`` plus the
            target columns "reddit vader sentiment" and "Options % Spike".
        feature_cols: list of input column names, in the order the scaler and
            model were fitted with.
        model: trained torch module producing two outputs per row.
        scaler: fitted scaler; only its ``transform`` method is used.
        device: torch device each input batch is moved to before the forward
            pass (the model itself is not moved here).
        batch_size: number of rows per forward pass (default 256).

    Returns:
        None if ``df_sector_test`` is empty. Otherwise a tuple
        ``(mse, corr_reddit, corr_options)``: the mean squared error over both
        outputs and the Pearson correlation of each output with its target.
        With fewer than two rows a correlation is undefined, so both
        correlations are reported as 0.
    """
    if len(df_sector_test) == 0:
        return None

    from scipy.stats import pearsonr

    X_test = df_sector_test[feature_cols].values.astype(np.float32)
    truth_arr = df_sector_test[["reddit vader sentiment","Options % Spike"]].values.astype(np.float32)

    # Scale X with the already-fitted scaler (no refitting on test data)
    X_test_t = torch.tensor(scaler.transform(X_test), dtype=torch.float32)

    model.eval()
    preds = []
    with torch.no_grad():
        for start in range(0, len(X_test_t), batch_size):
            bx = X_test_t[start:start+batch_size].to(device)
            preds.append(model(bx).cpu().numpy())  # shape (N,2)
    preds_arr = np.concatenate(preds, axis=0)

    # Compute MSE (averaged over rows and both outputs)
    mse = np.mean((preds_arr - truth_arr)**2)

    # Correlations for each dimension; pearsonr needs at least two points
    if len(preds_arr) > 1:
        corr_reddit = pearsonr(preds_arr[:,0], truth_arr[:,0])[0]
        corr_options= pearsonr(preds_arr[:,1], truth_arr[:,1])[0]
    else:
        corr_reddit = corr_options = 0

    return mse, corr_reddit, corr_options

# Example usage, if you had a separate df_test for each sector
# for sec, items in sector_models.items():
#     df_sect_test = df_test[df_test["sector"]==sec]
#     res = evaluate_sector_model(df_sect_test, feature_cols, items['model'], items['scaler'], device)
#     if res is not None:
#         print(sec, res)


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from scipy.stats import pearsonr

################################################################################
# 1) Ticker-to-Sector Mapping
################################################################################
# Hand-maintained lookup from ticker symbol to sector label. It is applied with
# Series.map further down, so any ticker missing from this dict ends up with a
# NaN sector and is dropped with the other incomplete rows.
# NOTE(review): a few assignments look unusual and are worth confirming against
# the sector taxonomy intended here, e.g. 'NEM' under Energy, 'DHR' under
# Materials and 'DELL' under Industrials.
ticker_to_sector = {
    # Communication Services
    'DIS': 'Communication Services', 'NFLX': 'Communication Services', 'CMCSA': 'Communication Services',
    'TMUS': 'Communication Services', 'GOOGL': 'Communication Services', 'CHTR': 'Communication Services',

    # Consumer Discretionary
    'AMZN': 'Consumer Discretionary', 'TSLA': 'Consumer Discretionary', 'NKE': 'Consumer Discretionary',
    'HD': 'Consumer Discretionary', 'AZO': 'Consumer Discretionary', 'YUM': 'Consumer Discretionary',

    # Consumer Staples
    'WMT': 'Consumer Staples', 'KO': 'Consumer Staples', 'PEP': 'Consumer Staples', 'MDLZ': 'Consumer Staples',
    'PG': 'Consumer Staples', 'KHC': 'Consumer Staples', 'MO': 'Consumer Staples',

    # Energy
    'XOM': 'Energy', 'CVX': 'Energy', 'COP': 'Energy', 'NEM': 'Energy',

    # Financials
    'BAC': 'Financials', 'JPM': 'Financials', 'C': 'Financials', 'MA': 'Financials', 'V': 'Financials',
    'WFC': 'Financials', 'CB': 'Financials',

    # Health Care
    'JNJ': 'Health Care', 'PFE': 'Health Care', 'ABBV': 'Health Care', 'MRK': 'Health Care',
    'LLY': 'Health Care', 'GILD': 'Health Care', 'MDT': 'Health Care', 'UNH': 'Health Care',

    # Industrials
    'GE': 'Industrials', 'BA': 'Industrials', 'UPS': 'Industrials', 'MMM': 'Industrials',
    'DELL': 'Industrials',

    # Information Technology
    'AAPL': 'Information Technology', 'MSFT': 'Information Technology', 'NVDA': 'Information Technology',
    'QCOM': 'Information Technology', 'ADBE': 'Information Technology', 'INTC': 'Information Technology',
    'CSCO': 'Information Technology', 'TXN': 'Information Technology', 'IBM': 'Information Technology',
    'ORCL': 'Information Technology', 'PYPL': 'Information Technology', 'AVGO': 'Information Technology',
    'AMD': 'Information Technology', 'ADP': 'Information Technology', 'INTU': 'Information Technology',

    # Materials
    'LIN': 'Materials', 'DHR': 'Materials',

    # Real Estate
    'PSA': 'Real Estate',

    # Utilities
    'DUK': 'Utilities', 'ED': 'Utilities', 'NEE': 'Utilities', 'PPL': 'Utilities',
}

################################################################################
# 2) Load the CSV and Map Tickers to Sectors
################################################################################
df = pd.read_csv("/content/drive/MyDrive/FYP/data_with_momentum.csv")

# Columns every usable row needs. 'sector' is derived below from 'ticker';
# all the others must already be present in the CSV.
required_cols = ['ticker','Sentiment_Score','reddit vader sentiment','Options % Spike','momentum','sector']

# Ensure required columns exist. This runs BEFORE df['ticker'] is touched, so a
# CSV without a ticker column fails with the descriptive ValueError rather
# than a bare KeyError from the mapping step.
for col in required_cols:
    if col != 'sector' and col not in df.columns:
        raise ValueError(f"Missing required column: {col}")

# Create sector column; tickers with no entry in the mapping become NaN
df['sector'] = df['ticker'].map(ticker_to_sector)

# Drop rows with missing values in these columns. 'sector' is part of
# required_cols, so rows whose ticker had no mapped sector are removed here too.
df = df.dropna(subset=required_cols)

################################################################################
# 3) Dataset & Multi-Output Model Definitions
################################################################################
class MultiOutputDataset(Dataset):
    """Index-aligned (inputs, targets) dataset over two parallel containers."""

    def __init__(self, X, y):
        # Keep references to the containers exactly as passed in.
        self.X, self.y = X, y

    def __len__(self):
        # Length is defined by the input container.
        size = len(self.X)
        return size

    def __getitem__(self, idx):
        pair = (self.X[idx], self.y[idx])
        return pair

class MultiOutputModel(nn.Module):
    """MLP regressor emitting two values per row: [RedditSent, OptionsSpike].

    ``num_layers`` hidden blocks (Linear, BatchNorm1d, ReLU, Dropout), each
    ``hidden_dim`` wide, feed a final 2-unit Linear layer.
    """

    def __init__(self, input_dim=2, hidden_dim=64, num_layers=2, dropout=0.2):
        super().__init__()
        modules = []
        for depth in range(num_layers):
            # Only the first block sees the raw input width.
            fan_in = input_dim if depth == 0 else hidden_dim
            modules.extend((
                nn.Linear(fan_in, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ))
        # final layer has 2 outputs: [RedditSent, OptionsSpike]
        head_in = hidden_dim if num_layers > 0 else input_dim
        modules.append(nn.Linear(head_in, 2))
        self.net = nn.Sequential(*modules)

    def forward(self,x):
        result = self.net(x)
        return result

################################################################################
# 4) Train/Eval Function for One Sector
################################################################################
def train_and_evaluate_sector(df_sector, feature_cols, device, epochs=20, batch_size=256):
    """
    Splits the sector's data into train/val (80/20), trains a multi-output model,
    evaluates on val set, and returns final stats + the trained model + scaler.

    Args:
        df_sector: DataFrame containing the columns in ``feature_cols`` plus
            the target columns "reddit vader sentiment" and "Options % Spike".
        feature_cols: list of input column names; its length sets the model's
            input dimension.
        device: torch device the model and batches are moved to.
        epochs: number of passes over the training split. With ``epochs < 1``
            no training happens and the freshly initialised model is evaluated.
        batch_size: mini-batch size for both loaders.

    Returns:
        dict with keys "model", "scaler" (StandardScaler fitted on the training
        split), "val_mse", "val_corr_reddit" and "val_corr_options". The
        metrics come from the validation pass that follows the last epoch.
    """

    # Prepare inputs (X) and outputs (y)
    X = df_sector[feature_cols].values.astype(np.float32)
    y = df_sector[["reddit vader sentiment","Options % Spike"]].values.astype(np.float32)

    # Train/Val split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale inputs (statistics from the training split only)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val   = scaler.transform(X_val)

    # Convert to torch tensors
    X_train_t = torch.tensor(X_train, dtype=torch.float32)
    y_train_t = torch.tensor(y_train, dtype=torch.float32)
    X_val_t   = torch.tensor(X_val,   dtype=torch.float32)
    y_val_t   = torch.tensor(y_val,   dtype=torch.float32)

    train_ds = MultiOutputDataset(X_train_t, y_train_t)
    val_ds   = MultiOutputDataset(X_val_t,   y_val_t)

    # BatchNorm1d raises in training mode when a batch holds a single sample.
    # If the split size leaves exactly one row for the final batch, drop it;
    # shuffling means a different row is left out each epoch.
    drop_tail = len(train_ds) > 1 and len(train_ds) % batch_size == 1
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=drop_tail)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)

    # Build model
    input_dim = len(feature_cols)
    model = MultiOutputModel(input_dim=input_dim, hidden_dim=128, num_layers=3, dropout=0.3).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)

    def _validate():
        """One pass over the validation loader -> (mse, predictions, truths)."""
        model.eval()
        loss_sum = 0.0
        pred_chunks, truth_chunks = [], []
        with torch.no_grad():
            for vx, vy in val_loader:
                vx = vx.to(device)
                vy = vy.to(device)
                vpreds = model(vx)
                # batch-mean loss re-weighted by batch size -> per-row average
                loss_sum += criterion(vpreds, vy).item() * vx.size(0)
                pred_chunks.append(vpreds.cpu().numpy())
                truth_chunks.append(vy.cpu().numpy())
        return (
            loss_sum / len(val_ds),
            np.concatenate(pred_chunks, axis=0),   # shape [N,2]
            np.concatenate(truth_chunks, axis=0),  # shape [N,2]
        )

    # Train Loop
    preds_arr = truth_arr = None
    for epoch in range(1, epochs+1):
        model.train()
        train_loss_sum = 0.0
        seen = 0  # rows actually used this epoch
        for bx, by in train_loader:
            bx, by = bx.to(device), by.to(device)
            optimizer.zero_grad()
            preds = model(bx)
            loss = criterion(preds, by)
            loss.backward()
            optimizer.step()
            train_loss_sum += loss.item() * bx.size(0)
            seen += bx.size(0)
        train_mse = train_loss_sum / seen

        # Validation pass; the arrays from the last epoch feed the final stats
        val_mse, preds_arr, truth_arr = _validate()

        print(f"Epoch {epoch}/{epochs}: Train MSE={train_mse:.4f}, Val MSE={val_mse:.4f}")

    if preds_arr is None:
        # No epoch ran (epochs < 1): still produce validation predictions so
        # the final metrics below are defined.
        _, preds_arr, truth_arr = _validate()

    # Final evaluation on val set
    # MSE
    final_mse = np.mean((preds_arr - truth_arr)**2)
    # Pearson correlations (undefined for fewer than two rows -> reported as 0)
    corr_reddit   = pearsonr(preds_arr[:,0], truth_arr[:,0])[0] if len(preds_arr)>1 else 0
    corr_options  = pearsonr(preds_arr[:,1], truth_arr[:,1])[0] if len(preds_arr)>1 else 0

    print(f"[Final Validation Results] MSE={final_mse:.4f}, Reddit Corr={corr_reddit:.4f}, Options Corr={corr_options:.4f}")

    # Return
    return {
        "model": model,
        "scaler": scaler,
        "val_mse": final_mse,
        "val_corr_reddit": corr_reddit,
        "val_corr_options": corr_options,
    }

################################################################################
# 5) Sector-Wise Loop
################################################################################
def main():
    """Train and validate one model per sector, then print a per-sector summary.

    Works on the module-level ``df``. Sectors with fewer than 50 rows are
    reported and skipped.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # Run configuration
    feature_cols = ["Sentiment_Score", "momentum"]  # features (including momentum)
    EPOCHS, BATCH_SIZE = 20, 256

    # sector name -> result dict returned by train_and_evaluate_sector
    sector_models = {}

    for sector_name in df["sector"].unique():
        df_sector = df.loc[df["sector"] == sector_name]
        print("\n==============================")
        print(f"Training sector: {sector_name}")
        print(f"Rows in sector: {len(df_sector)}")

        if len(df_sector) >= 50:
            sector_models[sector_name] = train_and_evaluate_sector(
                df_sector=df_sector,
                feature_cols=feature_cols,
                device=device,
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
            )
        else:
            print("Not enough data; skipping.")

    print("\nAll sectors processed. Here is a summary:\n")
    divider = "-"*40
    for sec, res in sector_models.items():
        mse, corr_r, corr_o = res['val_mse'], res['val_corr_reddit'], res['val_corr_options']
        print(f"S E C T O R: {sec}")
        print(f"  Val MSE:          {mse:.4f}")
        print(f"  Corr (Reddit):    {corr_r:.4f}")
        print(f"  Corr (Options):   {corr_o:.4f}")
        print(divider)

if __name__ == "__main__":
    main()


Classification Model

In [None]:
# Install required libraries if not already installed
!pip install pandas numpy scikit-learn xgboost matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset and keep only rows without any missing value
df = pd.read_csv("/content/drive/MyDrive/FYP/data_with_momentum.csv").dropna()

# RSI as used in this notebook: weighted sum of Options % Spike (0.8) and
# Reddit VADER sentiment (0.2)
rsi_score = (0.8 * df["Options % Spike"]) + (0.2 * df["reddit vader sentiment"])
df["RSI"] = rsi_score

# Binary target: 1 where RSI is strictly positive, otherwise 0
df["RSI_label"] = (df["RSI"] > 0).astype(int)

# Share of each class
class_share = df["RSI_label"].value_counts(normalize=True)
print(class_share)


In [None]:
# Define input features and target
feature_cols = ["Sentiment_Score", "momentum"]
X = df[feature_cols]
y = df["RSI_label"]

# Train-test split (80-20 stratified) on the unscaled features. Splitting
# first keeps the test rows out of the scaler's mean/variance estimates.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features: fit on the training rows only, then apply the same
# transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling, kept under its previous
# name for any code that still refers to it.
X_scaled = scaler.transform(X)


In [None]:
# Fit the three baseline classifiers on the same training split
lr = LogisticRegression().fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
xgb = XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42).fit(X_train, y_train)

# Hard class predictions on the held-out split
y_pred_lr, y_pred_rf, y_pred_xgb = (clf.predict(X_test) for clf in (lr, rf, xgb))

# Accuracy of each model
acc_lr, acc_rf, acc_xgb = (
    accuracy_score(y_test, y_hat) for y_hat in (y_pred_lr, y_pred_rf, y_pred_xgb)
)

for label, acc in (
    ("Logistic Regression", acc_lr),
    ("Random Forest", acc_rf),
    ("XGBoost", acc_xgb),
):
    print(f"{label} Accuracy: {acc:.4f}")


In [None]:
def evaluate_model(model_name, y_true, y_pred):
    """Print a classification report and draw the confusion matrix for one model.

    Args:
        model_name: label used in the printed header and the plot title.
        y_true: true class labels.
        y_pred: predicted class labels.
    """
    class_labels = ["Negative RSI", "Positive RSI"]

    print(f"\n--- {model_name} Model Evaluation ---")
    print(classification_report(y_true, y_pred))

    # Confusion matrix heatmap: rows = actual class, columns = predicted class
    plt.figure(figsize=(5, 4))
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"{model_name} Confusion Matrix")
    plt.show()

# Evaluate all models
for _name, _pred in (
    ("Logistic Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
):
    evaluate_model(_name, y_test, _pred)


In [None]:
from sklearn.metrics import roc_curve

# One ROC curve per classifier on the test split, with its AUC in the legend.
plt.figure(figsize=(8, 6))
for model, name in zip([lr, rf, xgb], ["Logistic Regression", "Random Forest", "XGBoost"]):
    # Positive-class probabilities, computed once and shared by the curve and
    # the AUC (the hard predictions are not needed for either).
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc_score(y_test, y_score):.2f})")

plt.plot([0, 1], [0, 1], linestyle="--", color="gray")  # Random baseline
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC-AUC Curve")
plt.legend()
plt.show()


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define MLP Model: three ReLU hidden layers (128 -> 64 -> 32) with dropout
# after the first two, and a single sigmoid unit producing P(label == 1).
# The input width is taken from the training matrix.
mlp_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile Model
mlp_model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# Train Model
# NOTE(review): the test split is passed as validation_data, so the per-epoch
# validation metrics are computed on the same rows as the final accuracy below.
mlp_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Evaluate: threshold the predicted probabilities at 0.5 to get class labels
y_pred_mlp = (mlp_model.predict(X_test) > 0.5).astype("int32")
acc_mlp = accuracy_score(y_test, y_pred_mlp)
print(f"MLP Accuracy: {acc_mlp:.4f}")


In [None]:
# Install required libraries if not already installed
%pip install tpot --upgrade

# Import necessary modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tpot import TPOTClassifier

# Load dataset (Replace with actual dataset path)
df = pd.read_csv("/content/drive/MyDrive/FYP/data_with_momentum.csv")

# Ensure necessary columns exist
required_cols = ["momentum", "Sentiment_Score", "Options % Spike", "reddit vader sentiment"]
assert all(col in df.columns for col in required_cols), "Missing required columns!"

# Drop rows missing any feature or label ingredient. Without this, a NaN in
# either RSI input makes `RSI_value > 0` evaluate to False, so the row would be
# silently labelled 0, and NaN features would reach the classifier.
df = df.dropna(subset=required_cols)

# Scale down 'Options % Spike' before RSI computation
df["Scaled_Options_Spike"] = df["Options % Spike"] / 10  # Adjust scaling factor if needed

# Compute RSI using a weighted formula
df["RSI_value"] = 0.7 * df["Scaled_Options_Spike"] + 0.3 * df["reddit vader sentiment"]

# Convert RSI into a binary classification (1 if RSI > 0, else 0)
df["RSI_label"] = np.where(df["RSI_value"] > 0, 1, 0)

# Define input features for classification model (momentum and sentiment score)
feature_cols = ["momentum", "Sentiment_Score"]

# Split dataset into train and test sets BEFORE normalizing, so the scaler's
# mean/variance are estimated from training rows only.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["RSI_label"]
)
# Work on copies so the assignments below do not write into slices of df.
train_df = train_df.copy()
test_df = test_df.copy()

# Normalize input features: fit on train, apply the same transform to test
scaler = StandardScaler()
train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])

# Define feature matrices and target vectors
X_train = train_df[feature_cols]
y_train = train_df["RSI_label"]
X_test = test_df[feature_cols]
y_test = test_df["RSI_label"]

# Initialize and Train the TPOT Classifier
tpot = TPOTClassifier(
    generations=10,
    population_size=50,
    verbosity=2,
    n_jobs=-1,
    random_state=42
)
tpot.fit(X_train, y_train)

# Evaluate Best Model Found by TPOT
y_pred_tpot = tpot.predict(X_test)
acc_tpot = accuracy_score(y_test, y_pred_tpot)
print(f"AutoML TPOT Accuracy: {acc_tpot:.4f}")

# Display Classification Report
print("Classification Report for TPOT Model:")
print(classification_report(y_test, y_pred_tpot))

# Save Best Model Pipeline
tpot.export("best_model_pipeline.py")


In [None]:

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/FYP/data_with_momentum.csv")

# Drop missing values
df = df.dropna()

# Define RSI using weighted sum of Options % Spike and Reddit Vader Sentiment
df["RSI"] = (0.8 * df["Options % Spike"]) + (0.2 * df["reddit vader sentiment"])

# Convert RSI into a binary classification target
df["RSI_label"] = np.where(df["RSI"] > 0, 1, 0)

# Print class distribution
print(df["RSI_label"].value_counts(normalize=True))

# Define input features and target
feature_cols = ["Sentiment_Score", "momentum"]
X = df[feature_cols]
y = df["RSI_label"]

# Train-test split (80-20 stratified) on the unscaled features, so the test
# rows do not contribute to the scaler's mean/variance.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features: fit on train only. The results are wrapped back into
# DataFrames so the column names stay available for interpretability.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=feature_cols, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=feature_cols, index=X_test_raw.index)

# Full matrix under the train-fitted scaling, kept under the previous names
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_cols)

# Initialize models
lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42)

# Train models
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

# Predictions
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_xgb = xgb.predict(X_test)

# Evaluate accuracy
acc_lr = accuracy_score(y_test, y_pred_lr)
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"Logistic Regression Accuracy: {acc_lr:.4f}")
print(f"Random Forest Accuracy: {acc_rf:.4f}")
print(f"XGBoost Accuracy: {acc_xgb:.4f}")

# Function to plot feature importance
def plot_feature_importance(model, feature_names, model_name):
    """Bar-plot a fitted model's ``feature_importances_``, largest first.

    Does nothing when ``model`` has no ``feature_importances_`` attribute.

    Args:
        model: fitted estimator.
        feature_names: names aligned with the entries of the importance vector.
        model_name: label used in the plot title.
    """
    if not hasattr(model, "feature_importances_"):
        return

    scores = model.feature_importances_
    order = np.argsort(scores)[::-1]  # indices from most to least important
    ranked_names = np.array(feature_names)[order]

    plt.figure(figsize=(6, 4))
    sns.barplot(x=scores[order], y=ranked_names, palette="Blues_r")
    plt.xlabel("Feature Importance Score")
    plt.ylabel("Features")
    plt.title(f"{model_name} Feature Importance")
    plt.show()

# Feature Importance for Random Forest: bar chart of rf.feature_importances_,
# one bar per entry of feature_cols, ordered from most to least important.
plot_feature_importance(rf, feature_cols, "Random Forest")

# Feature Importance for XGBoost
def plot_xgb_importance(model, feature_names):
    """Bar-plot the booster's 'weight' importance for each feature, largest first.

    Args:
        model: fitted XGBoost sklearn-style estimator (must expose
            ``get_booster()``).
        feature_names: feature names in training column order.

    The booster's score dict is keyed by the training column names when the
    model was fitted on a DataFrame, and by positional names "f0", "f1", ...
    when it was fitted on a bare array. Both spellings are looked up so named
    columns are not reported as 0. A feature missing under both keys gets
    importance 0.
    """
    importance_dict = model.get_booster().get_score(importance_type='weight')
    scores = [
        importance_dict.get(name, importance_dict.get(f"f{i}", 0))
        for i, name in enumerate(feature_names)
    ]
    importance_df = pd.DataFrame({"Feature": feature_names, "Importance": scores})
    importance_df = importance_df.sort_values(by="Importance", ascending=False)

    plt.figure(figsize=(6, 4))
    sns.barplot(x=importance_df["Importance"], y=importance_df["Feature"], palette="Oranges_r")
    plt.xlabel("Feature Importance Score")
    plt.ylabel("Features")
    plt.title("XGBoost Feature Importance")
    plt.show()

# Plot the fitted XGBoost model's importance_type='weight' scores per feature.
plot_xgb_importance(xgb, feature_cols)

# SHAP Feature Importance (For Explainability)
def plot_shap(model, X_sample, model_name):
    """Draw a bar-type SHAP summary plot for ``model`` evaluated on ``X_sample``.

    Args:
        model: fitted model handed to ``shap.Explainer``.
        X_sample: rows to explain; also passed to the summary plot as the
            feature matrix.
        model_name: label used in the plot title.
    """
    # Imported here because this cell does not import shap itself; the local
    # import lets the function work regardless of which cells ran before it.
    import shap

    explainer = shap.Explainer(model)
    shap_values = explainer(X_sample)

    plt.figure(figsize=(6, 4))
    # show=False keeps the figure open so the title can be added before display
    shap.summary_plot(shap_values, X_sample, plot_type="bar", show=False)
    plt.title(f"{model_name} SHAP Feature Importance")
    plt.show()

# SHAP for XGBoost (More Detailed Feature Importance)
plot_shap(xgb, X_train, "XGBoost")

# Model Evaluation Function
def evaluate_model(model_name, y_true, y_pred):
    """Report classification metrics and plot the confusion matrix for one model.

    Args:
        model_name: label shown in the printed header and the plot title.
        y_true: true class labels.
        y_pred: predicted class labels.
    """
    header = f"\n--- {model_name} Model Evaluation ---"
    print(header)
    report = classification_report(y_true, y_pred)
    print(report)

    # Confusion Matrix (rows = actual, columns = predicted)
    tick_labels = ["Negative RSI", "Positive RSI"]
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"{model_name} Confusion Matrix")
    plt.show()

# Evaluate all models
_predictions_by_model = {
    "Logistic Regression": y_pred_lr,
    "Random Forest": y_pred_rf,
    "XGBoost": y_pred_xgb,
}
for _model_label, _model_pred in _predictions_by_model.items():
    evaluate_model(_model_label, y_test, _model_pred)


In [None]:
import pandas as pd
import numpy as np
import shap
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/FYP/data_with_momentum.csv")
df = df.dropna()  # Drop missing values

# Define RSI using weighted sum of Options % Spike and Reddit Vader Sentiment
df["RSI"] = (0.8 * df["Options % Spike"]) + (0.2 * df["reddit vader sentiment"])

# Convert RSI into a binary classification target
df["RSI_label"] = np.where(df["RSI"] > 0, 1, 0)

# Define input features and target
feature_cols = ["Sentiment_Score", "momentum"]
X = df[feature_cols]
y = df["RSI_label"]

# Train-test split (80-20 stratified) on the unscaled features, so the test
# rows do not contribute to the scaler's mean/variance.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features: fit on train only. The results are wrapped back into
# DataFrames so SHAP can pick up the column names.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=feature_cols, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=feature_cols, index=X_test_raw.index)

# Full matrix under the train-fitted scaling, kept under the previous names
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_cols)

# Initialize models
lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42)

# Train models
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

# Predictions
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_xgb = xgb.predict(X_test)

# Evaluate accuracy
acc_lr = accuracy_score(y_test, y_pred_lr)
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"Logistic Regression Accuracy: {acc_lr:.4f}")
print(f"Random Forest Accuracy: {acc_rf:.4f}")
print(f"XGBoost Accuracy: {acc_xgb:.4f}")

# Train MLP Model: three ReLU hidden layers (128 -> 64 -> 32) with dropout
# after the first two, and a single sigmoid unit producing P(label == 1).
mlp_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

mlp_model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# NOTE(review): the test split doubles as validation_data here, so the
# validation metrics are computed on the same rows as the accuracy below.
# verbose=0 silences the per-epoch log.
mlp_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=0)

# Evaluate MLP: threshold the predicted probabilities at 0.5 to get class labels
y_pred_mlp = (mlp_model.predict(X_test) > 0.5).astype("int32")
acc_mlp = accuracy_score(y_test, y_pred_mlp)
print(f"MLP Accuracy: {acc_mlp:.4f}")

# --- SHAP Analysis for All Models ---

# Function to plot SHAP values
def plot_shap_summary(shap_values, X_sample, model_name):
    """Render a bar-type SHAP summary plot titled after ``model_name``.

    Args:
        shap_values: SHAP values to summarise.
        X_sample: feature matrix the SHAP values were computed for.
        model_name: label used in the plot title.
    """
    heading = f"{model_name} SHAP Feature Importance"
    plt.figure(figsize=(6, 4))
    # show=False defers display so the title can be attached first
    shap.summary_plot(shap_values, X_sample, plot_type="bar", show=False)
    plt.title(heading)
    plt.show()

# SHAP for Logistic Regression (X_train is passed as the explainer's second argument)
explainer_lr = shap.Explainer(lr, X_train)
shap_values_lr = explainer_lr(X_test)
plot_shap_summary(shap_values_lr, X_test, "Logistic Regression")

# SHAP for Random Forest
# NOTE(review): for a classifier the explanation may carry one slice per class;
# confirm the bar summary plot accepts it as-is with the installed shap version.
explainer_rf = shap.Explainer(rf)
shap_values_rf = explainer_rf(X_test)
plot_shap_summary(shap_values_rf, X_test, "Random Forest")

# SHAP for XGBoost
explainer_xgb = shap.Explainer(xgb)
shap_values_xgb = explainer_xgb(X_test)
plot_shap_summary(shap_values_xgb, X_test, "XGBoost")

# SHAP for MLP (Deep Learning Model)
# Only the first 50 train rows (background) and first 50 test rows (explained)
# are used. X_train / X_test are DataFrames at this point, so DataFrame slices
# are what DeepExplainer receives.
# NOTE(review): confirm DeepExplainer accepts DataFrames in this environment.
explainer_mlp = shap.DeepExplainer(mlp_model, X_train[:50])  # Use a small batch to approximate SHAP values
shap_values_mlp = explainer_mlp.shap_values(X_test[:50])

plt.figure(figsize=(6, 4))
# NOTE(review): indexing [0] assumes shap_values returned a list with one entry
# per model output; confirm for the installed shap version.
shap.summary_plot(shap_values_mlp[0], X_test[:50], plot_type="bar", show=False)
plt.title("MLP SHAP Feature Importance")
plt.show()


In [None]:
# --- SHAP Analysis for All Models ---
# Column labels of the test frame. plot_shap_summary reads this as a global and
# forwards it to shap.summary_plot, because the data is handed over as NumPy
# arrays, which carry no column names.
feature_names = X_test.columns  # Store feature names

# Function to plot SHAP values safely
def plot_shap_summary(shap_values, X_sample, model_name):
    """Render a bar-type SHAP summary plot, labelling features explicitly.

    Feature labels come from the module-level ``feature_names``.

    Args:
        shap_values: SHAP values to summarise.
        X_sample: feature matrix the SHAP values were computed for.
        model_name: label used in the plot title.
    """
    heading = f"{model_name} SHAP Feature Importance"
    plt.figure(figsize=(6, 4))
    # show=False defers display so the title can be attached first
    shap.summary_plot(
        shap_values,
        X_sample,
        feature_names=feature_names,
        plot_type="bar",
        show=False,
    )
    plt.title(heading)
    plt.show()

# In this cell every matrix handed to an explainer call or to the plot helper
# is converted with .to_numpy(); the explainers for LR and RF are still
# constructed with the X_train DataFrame as their second argument.

# SHAP for Logistic Regression
explainer_lr = shap.Explainer(lr, X_train)
shap_values_lr = explainer_lr(X_test.to_numpy())  # Convert DataFrame to NumPy
plot_shap_summary(shap_values_lr, X_test.to_numpy(), "Logistic Regression")

# SHAP for Random Forest (Fixed)
# NOTE(review): for a classifier the explanation may carry one slice per class;
# confirm the bar summary plot accepts it with the installed shap version.
explainer_rf = shap.Explainer(rf, X_train)
shap_values_rf = explainer_rf(X_test.to_numpy())  # Convert DataFrame to NumPy
plot_shap_summary(shap_values_rf, X_test.to_numpy(), "Random Forest")

# SHAP for XGBoost
explainer_xgb = shap.Explainer(xgb)
shap_values_xgb = explainer_xgb(X_test.to_numpy())
plot_shap_summary(shap_values_xgb, X_test.to_numpy(), "XGBoost")

# SHAP for MLP (Deep Learning Model)
# First 50 train rows are the background; first 50 test rows are explained.
explainer_mlp = shap.DeepExplainer(mlp_model, X_train[:50].to_numpy())  # Use a small batch
shap_values_mlp = explainer_mlp.shap_values(X_test[:50].to_numpy())

plt.figure(figsize=(6, 4))
# NOTE(review): indexing [0] assumes shap_values returned a list with one entry
# per model output; confirm for the installed shap version.
shap.summary_plot(shap_values_mlp[0], X_test[:50].to_numpy(), feature_names=feature_names, plot_type="bar", show=False)
plt.title("MLP SHAP Feature Importance")
plt.show()


In [None]:
import shap

# Initialize SHAP explainers for all models
explainer_lr = shap.Explainer(lr, X_train)
# The Random Forest is explained through rf.predict, i.e. its hard class
# labels rather than probabilities, with the first 100 training rows as
# background data.
explainer_rf = shap.KernelExplainer(rf.predict, X_train[:100])  # Use a subset for efficiency
explainer_xgb = shap.Explainer(xgb)
explainer_mlp = shap.DeepExplainer(mlp_model, X_train[:50].to_numpy())  # MLP requires a small training batch

# Compute SHAP values
# LR and XGBoost are explained on the full test set; RF and the MLP only on the
# first 50 test rows.
shap_values_lr = explainer_lr(X_test.to_numpy())
shap_values_rf = explainer_rf.shap_values(X_test[:50].to_numpy())  # Random Forest (subset)
shap_values_xgb = explainer_xgb(X_test.to_numpy())
shap_values_mlp = explainer_mlp.shap_values(X_test[:50].to_numpy())  # MLP

# Function to plot SHAP values safely
def plot_shap_summary(shap_values, X_sample, model_name):
    """Draw a bar-type SHAP summary plot with explicit feature labels.

    Feature labels are read from the module-level ``feature_names``.

    Args:
        shap_values: SHAP values to summarise.
        X_sample: feature matrix matching ``shap_values``.
        model_name: label used in the plot title.
    """
    plot_title = f"{model_name} SHAP Feature Importance"
    plt.figure(figsize=(6, 4))
    # Display is deferred (show=False) so the title lands on the same figure.
    shap.summary_plot(
        shap_values, X_sample,
        feature_names=feature_names, plot_type="bar", show=False,
    )
    plt.title(plot_title)
    plt.show()

# Plot SHAP values for all models
# Each call passes the same rows the SHAP values were computed on: the full
# test set for LR and XGBoost, the first 50 test rows for RF and the MLP.
plot_shap_summary(shap_values_lr, X_test.to_numpy(), "Logistic Regression")
plot_shap_summary(shap_values_rf, X_test[:50].to_numpy(), "Random Forest")
plot_shap_summary(shap_values_xgb, X_test.to_numpy(), "XGBoost")
# NOTE(review): the MLP values are passed whole here, whereas other cells index
# them with [0]; which is right depends on what shap_values returns in the
# installed shap version.
plot_shap_summary(shap_values_mlp, X_test[:50].to_numpy(), "MLP Neural Network")  # No [0] index


In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Convert X_test to a NumPy array for the explainer call and the plot
# NOTE(review): whether SHAP strictly requires NumPy input here is unverified.
X_test_np = np.array(X_test)

# Wrap the Keras model in the generic shap.Explainer (note: this is not
# DeepExplainer), passing X_train as its second argument.
# NOTE(review): which explanation algorithm shap selects for this model/data
# combination is not visible here; confirm it is the intended one.
explainer_mlp = shap.Explainer(mlp_model, X_train)  # Use training data to initialize
shap_values_mlp = explainer_mlp(X_test_np)

# Plot summary of SHAP values (no plot_type is given, so shap's default
# summary style is used rather than the bar style of the other cells)
plt.figure(figsize=(6, 4))
shap.summary_plot(shap_values_mlp, X_test_np, feature_names=feature_cols, show=False)
plt.title("MLP SHAP Feature Importance")
plt.show()
