<a href="https://colab.research.google.com/github/Kgan3039/stock-prediction-bert/blob/main/BertModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets scikit-learn torch


In [None]:
import os
# Disable Weights & Biases logging by setting WANDB_MODE before any training code runs
os.environ["WANDB_MODE"] = "disabled"
# List all files in the current working directory (quick check that the CSV inputs are present)
print(os.listdir())


In [None]:
import torch

# Show the working directory and whether a CUDA device is visible.
# NOTE: `os` is not imported in this cell; it comes from the earlier setup cell.
print(os.getcwd())
print(torch.cuda.is_available())

# Use the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
import pandas as pd
from datasets import Dataset

# ---------------------------------------------------------------------------
# Training data
# ---------------------------------------------------------------------------
# The first row of the CSV is treated as a header (header=0); its names are
# replaced with the column names used throughout the notebook.
train_df = pd.read_csv('stock_data1.csv', header=0)
train_df.columns = ['text', 'label']

# Drop any row that still contains the column names if it exists.
# .copy() detaches the filtered frame so the assignments below operate on an
# independent DataFrame instead of a slice of the original.
train_df = train_df[train_df['label'] != 'Sentiment'].copy()

# Coerce labels to numbers; anything unparsable becomes NaN and is dropped,
# together with rows that have no text (they cannot be tokenized).
train_df['label'] = pd.to_numeric(train_df['label'], errors='coerce')
train_df = train_df.dropna(subset=['text', 'label']).astype({'label': 'int'})

# Remap labels from {-1, 1} to {0, 1} and discard anything else.
# NOTE(review): assumes the raw training labels are -1/1; a raw 0 would be
# kept as class 0 — confirm against stock_data1.csv.
train_df['label'] = train_df['label'].replace({-1: 0, 1: 1})
train_df = train_df[train_df['label'].isin([0, 1])]

# ---------------------------------------------------------------------------
# Validation data
# ---------------------------------------------------------------------------
# The file has no header row; column 2 holds the sentiment name and column 3
# the tweet text. Columns 0 and 1 are not needed.
val_df = pd.read_csv('twitter_validation.csv', header=None)
val_df = val_df.rename(columns={val_df.columns[2]: 'label', val_df.columns[3]: 'text'})
val_df = val_df[['text', 'label']].copy()

# Convert string labels to numeric codes. Values missing from the mapping
# become NaN and are dropped below.
label_mapping = {
    'Neutral': 0,
    'Positive': 1,
    'Negative': -1,
    'Irrelevant': 2
}
val_df['label'] = val_df['label'].map(label_mapping)
val_df = val_df.dropna(subset=['text', 'label']).astype({'label': 'int'})

# Keep only Positive/Negative rows BEFORE remapping to {0, 1}. Filtering after
# the remap would let 'Neutral' (code 0) through and score it as the negative
# class, because -1 is remapped onto 0 as well.
val_df = val_df[val_df['label'].isin([-1, 1])].copy()
val_df['label'] = val_df['label'].replace({-1: 0, 1: 1})

# Check the size of the filtered datasets
print("Filtered train_df size:", len(train_df))
print("Filtered val_df size:", len(val_df))

# Fail early, after all filtering has been applied, if nothing is left
if len(train_df) == 0 or len(val_df) == 0:
    raise ValueError("Filtered dataset is still empty. Please double-check your dataset and filtering criteria.")

# Verify the label distribution
print("Train label distribution after remapping:\n", train_df['label'].value_counts())
print("Validation label distribution after remapping:\n", val_df['label'].value_counts())

# Convert to HuggingFace datasets. preserve_index=False keeps the (now
# non-contiguous) pandas index from being stored as an extra column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Print an example to verify dataset conversion
if len(train_dataset) > 0:
    print("First item in train_dataset:", train_dataset[0])
else:
    raise ValueError("train_dataset is empty after conversion.")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the tokenizer that matches the checkpoint used for the classifier below
model_ckpt = "bert-base-uncased"  # Checkpoint name shared by tokenizer and model loading
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Batch tokenization helper used with Dataset.map
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch.

    Every entry is truncated and padded to exactly 128 tokens (increase
    max_length if longer inputs are needed). Uses the module-level
    `tokenizer`.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded

# Run the tokenizer over both splits in batches
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Show which label values are present in each split
print("Unique labels in train_dataset:", set(train_dataset['label']))
print("Unique labels in val_dataset:", set(val_dataset['label']))

# Binary classification
num_labels = 2

# Load the pre-trained checkpoint with a sequence-classification head and move
# it to the selected device; report the error before propagating it.
try:
    model = AutoModelForSequenceClassification.from_pretrained(
        model_ckpt,
        num_labels=num_labels,
    ).to(device)
except Exception as e:
    print("Error during model loading:", e)
    raise

# Freeze everything except parameters whose name contains "layer.10",
# "layer.11" or "classifier".
for name, param in model.named_parameters():
    param.requires_grad = any(
        tag in name for tag in ("layer.10", "layer.11", "classifier")
    )



In [None]:
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall for an evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class for each
            row is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    acc = accuracy_score(labels, predictions)
    # zero_division=0: when a class is never predicted (or never present) the
    # per-class score is undefined; report it as 0 explicitly instead of
    # emitting an undefined-metric warning on every evaluation.
    f1 = f1_score(labels, predictions, average='weighted', zero_division=0)
    precision = precision_score(labels, predictions, average='weighted', zero_division=0)
    recall = recall_score(labels, predictions, average='weighted', zero_division=0)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [None]:
import inspect

# Derive the number of classes from the label values present in the training set
num_labels = len(set(train_dataset['label']))

# Model definition
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device)

# Freezing base model parameters to fine-tune classifier only
for param in model.base_model.parameters():
    param.requires_grad = False

# The per-epoch evaluation switch is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick the name
# the installed version accepts so this cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    learning_rate=5e-6,  # Reduce learning rate
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
    **{_eval_strategy_kwarg: "epoch"},
)


In [None]:
from torch.nn import CrossEntropyLoss
from transformers import Trainer
import torch

# Subclassing Trainer to override compute_loss with weighted cross-entropy
class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the logits.

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer``.
        class_weights: one weight per class, in class-index order. Defaults to
            ``(0.5, 1.5)``, the weights this notebook uses for its two classes.
            Must have as many entries as the model has output classes.
    """

    def __init__(self, *args, class_weights=(0.5, 1.5), **kwargs):
        super().__init__(*args, **kwargs)
        # Build the weight tensor once instead of on every training step.
        self._class_weights = torch.as_tensor(class_weights, dtype=torch.float)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass on.
            inputs: mapping of model inputs; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)``.
            **kwargs: accepted and ignored, so extra keyword arguments from
                the caller do not break this override.

        Raises:
            ValueError: if ``inputs`` has no ``"labels"`` entry.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError("WeightedTrainer.compute_loss requires 'labels' in the inputs.")

        # Forward pass without the labels: the loss is computed here, so the
        # model does not need to compute its own (unweighted) loss as well.
        # A filtered copy is used so the caller's mapping is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # Apply weighted cross-entropy loss; match the logits' device and dtype
        class_weights = self._class_weights.to(device=logits.device, dtype=logits.dtype)
        loss_fn = CrossEntropyLoss(weight=class_weights)
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

import inspect

# Load a fresh model before initializing the Trainer
num_labels = len(set(train_dataset['label']))
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)
model = model.to(device)

# Ensure base model layers are frozen (optional): only the classification head is trained
for param in model.base_model.parameters():
    param.requires_grad = False

# Newer transformers releases take the tokenizer via `processing_class` and
# deprecate `tokenizer`; use whichever keyword the installed Trainer accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Initialize the custom Trainer with the overridden compute_loss function
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Print label information and start training
print("Number of labels:", num_labels)
print("Label set in train_dataset:", set(train_dataset['label']))

try:
    trainer.train()
except RuntimeError as e:
    print("RuntimeError during training:", e)
    print("Consider using CUDA_LAUNCH_BLOCKING=1 for better debug messages.")
    # Re-raise: continuing would let the following cells evaluate and save a
    # model that was never (fully) trained.
    raise


In [None]:
# Report, for every model parameter, whether it will receive gradient updates
for name, param in model.named_parameters():
    print(f"Layer: {name} | Requires Grad: {param.requires_grad}")

In [None]:
# Evaluate the model on the dataset given to the trainer as eval_dataset (val_dataset)
trainer.evaluate()


In [None]:
# Persist the fine-tuned weights and the tokenizer.
# NOTE: they are written to two separate directories, so both paths are needed to reload them.
model.save_pretrained('./finetuned-bert-model')
tokenizer.save_pretrained('./finetuned-bert-tokenizer')


In [None]:
# Container for per-tweet sentiment predictions. Downstream code expects
# entries carrying 'date', 'retweets' and 'sentiment'.
# NOTE(review): nothing visible in this notebook fills the list; while it is
# empty, the required-columns check in calculate_weighted_sentiment will raise.
sentiment_predictions = []

In [None]:
import yfinance as yf
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
ticker = "TSLA"
start_date = "2023-01-01"
end_date = "2023-05-01"

# auto_adjust=False is passed explicitly: newer yfinance releases default to
# auto-adjusted prices, in which case no 'Adj Close' column is returned and
# the selection below fails.
stock_data = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)

# Newer yfinance releases can return two-level columns (field, ticker) even
# for a single ticker; keep only the field level so 'Adj Close' is a plain column.
if isinstance(stock_data.columns, pd.MultiIndex):
    stock_data.columns = stock_data.columns.get_level_values(0)

# Keep the date (moved from the index into a 'Date' column) and the adjusted close
stock_data = stock_data[['Adj Close']].reset_index()
stock_data = stock_data.rename(columns={'Adj Close': 'adj_close'})


In [None]:
def calculate_weighted_sentiment(df):
    """Aggregate per-tweet sentiment into a retweet-weighted daily score.

    Each row's sentiment is multiplied by its retweet count divided by the
    largest retweet count in ``df``; the products are summed per date.

    Args:
        df: DataFrame with 'date', 'retweets' and 'sentiment' columns. It is
            not modified.

    Returns:
        DataFrame with one row per date and the columns 'date' and
        'weighted_sentiment'.
    """
    retweets = df['retweets']
    peak = retweets.max()
    if pd.isna(peak) or peak == 0:
        # No retweets at all: every weight is 0 rather than the NaN that 0/0 would give.
        normalized_retweets = retweets * 0.0
    else:
        normalized_retweets = retweets / peak

    # Work on a new frame so the caller's DataFrame gains no helper columns.
    scored = df[['date']].assign(weighted_sentiment=normalized_retweets * df['sentiment'])
    daily_sentiment = scored.groupby('date')['weighted_sentiment'].sum().reset_index()
    return daily_sentiment

In [None]:
# `sentiment_predictions` may still be a plain list of dictionaries, e.g.
# sentiment_predictions = [{'date': '2023-01-01', 'retweets': 10, 'sentiment': 1}, ...]
# Wrap it in a DataFrame in that case; any other type is left as it is.
sentiment_predictions = (
    pd.DataFrame(sentiment_predictions)
    if isinstance(sentiment_predictions, list)
    else sentiment_predictions
)

# Preview the first rows to confirm the conversion
print(sentiment_predictions.head())

# Function to calculate weighted sentiment
def calculate_weighted_sentiment(df):
    """Aggregate per-tweet sentiment into a retweet-weighted daily score.

    Each row's sentiment is multiplied by its retweet count divided by the
    largest retweet count in ``df``; the products are summed per date.

    Args:
        df: DataFrame with 'date', 'retweets' and 'sentiment' columns. It is
            not modified.

    Returns:
        DataFrame with one row per date and the columns 'date' and
        'weighted_sentiment'.

    Raises:
        ValueError: if ``df`` is not a DataFrame or lacks a required column.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input data must be a pandas DataFrame")

    # Ensure required columns are present
    if any(column not in df.columns for column in ('retweets', 'sentiment', 'date')):
        raise ValueError("Data must include 'date', 'retweets', and 'sentiment' columns.")

    # Normalize retweet counts by the largest count
    retweets = df['retweets']
    peak = retweets.max()
    if pd.isna(peak) or peak == 0:
        # No retweets at all: every weight is 0 rather than the NaN that 0/0 would give.
        normalized_retweets = retweets * 0.0
    else:
        normalized_retweets = retweets / peak

    # Weighted sentiment score = normalized retweets * sentiment, computed on a
    # new frame so the caller's DataFrame gains no helper columns.
    scored = df[['date']].assign(weighted_sentiment=normalized_retweets * df['sentiment'])

    # Aggregate by date to get the daily weighted sentiment
    daily_sentiment = scored.groupby('date')['weighted_sentiment'].sum().reset_index()
    return daily_sentiment

daily_sentiment = calculate_weighted_sentiment(sentiment_predictions)

# The sentiment dates may be strings (e.g. '2023-01-01') while stock_data['Date']
# comes from a datetime index; pd.merge refuses to join datetime and object
# keys, so convert the sentiment dates to midnight-normalized datetimes first.
# NOTE(review): assumes stock_data['Date'] is timezone-naive — confirm.
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date']).dt.normalize()

# Keep only the trading days for which a sentiment score exists
merged_data = pd.merge(stock_data, daily_sentiment, left_on='Date', right_on='date', how='inner')
merged_data = merged_data[['Date', 'adj_close', 'weighted_sentiment']]

In [None]:
# Add the previous `lookback_window` values of price and sentiment as features
lookback_window = 5
for i in range(1, lookback_window + 1):
    merged_data[f'adj_close_lag_{i}'] = merged_data['adj_close'].shift(i)
    merged_data[f'weighted_sentiment_lag_{i}'] = merged_data['weighted_sentiment'].shift(i)

# The first rows lack a full lag history; drop them
merged_data = merged_data.dropna().copy()

# Define the target: 1 if the next row's price is higher, 0 otherwise
next_change = merged_data['adj_close'].diff().shift(-1)
merged_data['price_direction'] = (next_change > 0).astype(int)

# The last row has no following price, so its direction is unknown. Drop it
# instead of keeping it with a made-up label of 0.
merged_data = merged_data[next_change.notna()]

features = [col for col in merged_data.columns if 'lag' in col]
X = merged_data[features]
y = merged_data['price_direction']

In [None]:
# Chronological split: train on the earliest 80% of rows and test on the most
# recent 20%. The features are lagged prices, so a shuffled split would put
# rows adjacent to the test rows (whose lags contain the prices that determine
# the test targets) into the training set and inflate the score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Fixed seed keeps the forest reproducible
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Score the direction classifier on the held-out rows and report both metrics.
# accuracy_score / f1_score are imported in the earlier metrics cell.
accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
print(f"Stock Prediction Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f}")