# 1. SETUP & IMPORTS
Preprocess the text data appropriately for each model (webscraped tweets are messy – there are html elements, hyperlinks and unicode – make informed decisions what to keep and what to remove).

In [1]:
import os
import gzip
import pandas as pd
import numpy as np
import re
import string
from bs4 import BeautifulSoup
import nltk
nltk.download()
#!pip install emoji
from tqdm import tqdm
from transformers import pipeline
from sklearn.utils import resample
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
import joblib
import random
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from transformers import pipeline
from sklearn.utils import resample

# 2. Process Data

In [5]:
# load gzip CSV
def load_gzip_csv(path):
    with gzip.open(path, 'rt',encoding="ISO-8859-1") as f:
        return pd.read_csv(f)

In [6]:
from google.colab import files
uploaded = files.upload()
#btc_tweets_test.parquet.gzip(application/x-gzip) - 72285 bytes, last modified: 07/07/2025 - 100% done
#btc_tweets_train.parquet.gzip(application/x-gzip) - 192913 bytes, last modified: 07/07/2025 - 100% done

Saving btc_tweets_test.parquet.gzip to btc_tweets_test.parquet (4).gzip
Saving btc_tweets_train.parquet.gzip to btc_tweets_train.parquet (4).gzip


In [7]:
# Load data

def load_gzip_csv_robust(path):
    with gzip.open(path, 'rt', encoding="ISO-8859-1") as f:
        return pd.read_csv(f, sep=',', on_bad_lines='skip')

# Load parquet file
train_df = pd.read_parquet("btc_tweets_train.parquet.gzip")
test_df = pd.read_parquet("btc_tweets_test.parquet.gzip")

In [8]:
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1500 entries, 1641579121972236290 to 1641861677149822976
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   hashtags          1500 non-null   object
 1   content           1500 non-null   object
 2   username          1500 non-null   object
 3   user_displayname  1500 non-null   object
 4   sentiment         1500 non-null   bool  
dtypes: bool(1), object(4)
memory usage: 60.1+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 1641861708246552576 to 1641953216999968769
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   hashtags          500 non-null    object
 1   content           500 non-null    object
 2   username          500 non-null    object
 3   user_displayname  500 non-null    object
 4   sentiment         500 non-null    bool  
dtypes: bool(1), object(4)
memory usage

In [9]:
train_df.head()

Unnamed: 0_level_0,hashtags,content,username,user_displayname,sentiment
tweet ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1641579121972236290,"[Bitcoin, Bitcoin, BTC, Bitcoin, BTC, SHIB, HO...","$Bitcoin TO $100,000 SOONER THAN YOU THINK‼️💯🙏...",BezosCrypto,SHIB Bezos,True
1641579176171016194,"[Bitcoin, bitcoinordinals, crypto]",Alright I have my rares. Who else is grabbing ...,spartantc81,SpartanTC,True
1641579486071390208,"[BTC, SHIB, HOGE, SAITAMA, BNB, DOGE, ETH, Bab...","Bitcoin (BTC) Targets Over $100,000 as This Im...",BezosCrypto,SHIB Bezos,True
1641579537103302656,[BTC],📢 Xverse Web-based pool is live:\n\n•Update @x...,godfred_xcuz,Algorithm.btc,True
1641579588399804418,[Bitcoin],"Yesterday, a Bitcoin projection was displayed ...",goddess81oo,she is lucky,True


In [10]:
test_df.head()

Unnamed: 0_level_0,hashtags,content,username,user_displayname,sentiment
tweet ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1641861708246552576,"[crypto, btc]",#crypto $crypto #btc \nI am Chinese crypto alp...,huahuayjy,花花研究院 | Crypto Alpha🇨🇳,True
1641861783898972167,"[Bitcoin, Bitcoin]",#Bitcoin would have to fall another 80% to rea...,luke_broyles,Luke Broyles,False
1641862152532418562,"[Giveaway, BTC, SolanaGiveaways, Giveaway, Air...",#Giveaway $1000 Matic in 3Days\n\n🏆To win\n1️⃣...,cryptomarsdo,Crypto Mars,True
1641862338369183753,"[EOS, USDT, BTC, crypto, Bitcoin, etherium, Bi...",Up or Down?\n\n!!! $EOS #EOS !!!\n\nVS\n\n$USD...,andreyukrnet,Andrey Ukraine,True
1641862430434131968,"[BTC, ETH, BSC, GroveToken]",Mid Day Mix-up is LIVE! Never know who might s...,JustAman04,Justin Anderson,True


In [11]:
class TweetPreprocessor:
    """
    Cleans up tweets for NLP tasks.
    - Removes HTML, URLs, unicode artifacts.
    - Handles emojis: Optionally keeps, removes, or converts them to text.
    - Removes user mentions.
    - Optionally removes hashtags.
    - Converts text to lowercase and strips punctuation.
    """

    URL_PATTERN = re.compile(r"http\S+|www\S+")
    MENTION_PATTERN = re.compile(r"@\w+")
    HASHTAG_PATTERN = re.compile(r"#\w+")
    MULTISPACE_PATTERN = re.compile(r"\s+")

    def __init__(
        self,
        lowercase: bool = True,
        remove_urls: bool = True,
        remove_html: bool = True,
        remove_unicode: bool = True,
        remove_mentions: bool = True,
        remove_hashtags: bool = False,
        emoji_handling: str = 'keep',  # 'keep', 'remove', or 'convert'
    ):
        self.lowercase = lowercase
        self.remove_urls = remove_urls
        self.remove_html = remove_html
        self.remove_unicode = remove_unicode
        self.remove_mentions = remove_mentions
        self.remove_hashtags = remove_hashtags
        assert emoji_handling in ['keep', 'remove', 'convert'], "emoji_handling must be 'keep', 'remove', or 'convert'"
        self.emoji_handling = emoji_handling

    def _remove_html(self, text):
        return BeautifulSoup(text, "html.parser").get_text()

    def _remove_unicode(self, text):
        # Remove unicode escape sequences (e.g., \u1234, \x12)
        text = re.sub(r"\\u[\dA-Fa-f]{4}", " ", text)
        text = re.sub(r"\\x[\dA-Fa-f]{2}", " ", text)
        # Remove non-printable and non-ascii except emojis (for most cases)
        return ''.join(c for c in text if c.isprintable())

    def _handle_emoji(self, text):
        if self.emoji_handling == 'remove':
            return emoji.replace_emoji(text, replace='')
        elif self.emoji_handling == 'convert':
            return emoji.demojize(text, delimiters=(" ", " "))
        else:  # keep
            return text

    def clean(self, text: str) -> str:
        t = text

        if self.remove_html:
            t = self._remove_html(t)
        if self.remove_urls:
            t = self.URL_PATTERN.sub(" ", t)
        if self.remove_unicode:
            t = self._remove_unicode(t)
        if self.remove_mentions:
            t = self.MENTION_PATTERN.sub(" ", t)
        if self.remove_hashtags:
            t = self.HASHTAG_PATTERN.sub(" ", t)
        t = self._handle_emoji(t)
        if self.lowercase:
            t = t.lower()
        t = t.translate(str.maketrans('', '', string.punctuation))
        t = self.MULTISPACE_PATTERN.sub(" ", t)
        return t.strip()

In [12]:
def clean_hashtags(hashtags):
    # Remove duplicates, lowercase, strip punctuation
    if isinstance(hashtags, list):
        clean_set = set()
        for tag in hashtags:
            clean = tag.replace('#', '').strip().lower()
            clean = re.sub(r'[^\w\s]', '', clean)
            if clean:
                clean_set.add(clean)
        return ' '.join(clean_set)  # join to make a string
    return ''

In [13]:
# Preprocess tweets
preproc = TweetPreprocessor()
train_df['clean_tweet'] = train_df['content'].apply(preproc.clean)
test_df['clean_tweet'] = test_df['content'].apply(preproc.clean)

#process Hashtags
train_df['clean_hashtags'] = train_df['hashtags'].apply(clean_hashtags)
test_df['clean_hashtags'] = test_df['hashtags'].apply(clean_hashtags)


train_df['final_text'] = train_df['clean_tweet'] + ' ' + train_df['clean_hashtags']
test_df['final_text'] = test_df['clean_tweet'] + ' ' + test_df['clean_hashtags']


train_df.head()
test_df.head()

Unnamed: 0_level_0,hashtags,content,username,user_displayname,sentiment,clean_tweet,clean_hashtags,final_text
tweet ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1641861708246552576,"[crypto, btc]",#crypto $crypto #btc \nI am Chinese crypto alp...,huahuayjy,花花研究院 | Crypto Alpha🇨🇳,True,crypto crypto btc i am chinese crypto alpha al...,,crypto crypto btc i am chinese crypto alpha al...
1641861783898972167,"[Bitcoin, Bitcoin]",#Bitcoin would have to fall another 80% to rea...,luke_broyles,Luke Broyles,False,bitcoin would have to fall another 80 to reach...,,bitcoin would have to fall another 80 to reach...
1641862152532418562,"[Giveaway, BTC, SolanaGiveaways, Giveaway, Air...",#Giveaway $1000 Matic in 3Days\n\n🏆To win\n1️⃣...,cryptomarsdo,Crypto Mars,True,giveaway 1000 matic in 3days🏆to win1️⃣ follow ...,,giveaway 1000 matic in 3days🏆to win1️⃣ follow ...
1641862338369183753,"[EOS, USDT, BTC, crypto, Bitcoin, etherium, Bi...",Up or Down?\n\n!!! $EOS #EOS !!!\n\nVS\n\n$USD...,andreyukrnet,Andrey Ukraine,True,up or down eos eos vsusdt usdt andbtc btc cryp...,,up or down eos eos vsusdt usdt andbtc btc cryp...
1641862430434131968,"[BTC, ETH, BSC, GroveToken]",Mid Day Mix-up is LIVE! Never know who might s...,JustAman04,Justin Anderson,True,mid day mixup is live never know who might sto...,,mid day mixup is live never know who might sto...


# 3. Apply sentiment dictionary VADER

In [14]:
class VaderSentimentPredictor:
    """Uses NLTK's VADER to score sentiment."""

    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()

    def predict(self, texts):
        """Return binary sentiment: 1 if compound>=0 else 0."""
        return [int(self.analyzer.polarity_scores(t)['compound'] >= 0) for t in texts]

vader_model = VaderSentimentPredictor()
vader_preds = vader_model.predict(test_df['final_text'])

In [15]:
# Add VADER predictions to the test_df
test_df['vader_sentiment_pred'] = vader_preds

print(accuracy_score(test_df['sentiment'], test_df['vader_sentiment_pred']))
print(classification_report(test_df['sentiment'], test_df['vader_sentiment_pred']))

0.83
              precision    recall  f1-score   support

       False       0.57      0.45      0.50        96
        True       0.88      0.92      0.90       404

    accuracy                           0.83       500
   macro avg       0.72      0.68      0.70       500
weighted avg       0.82      0.83      0.82       500



**1. Overall Accuracy: 0.83**:  model correctly predicted 83% of the sentiments in your test set of 500 tweets.

**2. For Each Class:**

**False** (Negative sentiment, 96 samples)
- **Precision: 0.57**: Of all tweets predicted as negative, 57% were actually negative.
- **Recall: 0.45**: Of all true negative tweets, 45% were correctly identified as negative.
- **F1-score: 0.50**: Harmonic mean of precision and recall (balanced metric).

**True** (Positive sentiment, 404 samples)
- **Precision: 0.88**: Of all tweets predicted as positive, 88% were actually positive.
- **Recall: 0.92**: Of all true positive tweets, 92% were correctly identified as positive.
- **F1-score: 0.90**: Very high, showing strong performance for the positive class.

**3. Averages:**

**Macro avg**:  Average of the metrics for both classes (treats both equally, regardless of class size).
  Precision: 0.72     -Recall: 0.68    -F1-score: 0.70 **Weighted avg**: Average weighted by the number of samples in each class. - Precision: 0.82.  Recall: 0.83.  F1-score: 0.82


**4. Support:**  ***96*** negative (False), **404** positive (True) in your test set—so your data is imbalanced (many more positives).

**5. Interpretation:**

- **Model is much better at predicting positive sentiment (“True”) than negative (“False”).**
- **Negative tweets are often misclassified as positive.**
- **Good overall accuracy, but performance on minority class (negative) is much lower.**
    - This is common when data is imbalanced.
- **Consider looking at ROC-AUC, confusion matrix, or balancing your data if negative sentiment is important for your application.**


# 4. Fit a TF-IDF vectorizer and Logistic Regression(balenced).

In [16]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=3000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=3000, class_weight='balanced', solver='saga'))
])

param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__max_features': [2000, 3000, 5000],
    'clf__C': [0.1, 1, 10],
    'clf__penalty': ['l1', 'l2']
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(train_df['final_text'], train_df['sentiment'])

print("Best Parameters:", grid_search.best_params_)
y_pred = grid_search.predict(test_df['final_text'])

Best Parameters: {'clf__C': 10, 'clf__penalty': 'l2', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}


In [17]:
print("Accuracy:", accuracy_score(test_df['sentiment'], y_pred))
print(classification_report(test_df['sentiment'], y_pred))

# Save the best model
joblib.dump(grid_search.best_estimator_, 'best_model.pkl')

# Load the model for future use
best_model = joblib.load('best_model.pkl')

Accuracy: 0.822
              precision    recall  f1-score   support

       False       0.53      0.56      0.55        96
        True       0.89      0.88      0.89       404

    accuracy                           0.82       500
   macro avg       0.71      0.72      0.72       500
weighted avg       0.83      0.82      0.82       500



# 5. Train an LSTM including embeddings ( a validation split for training progress monitoring and early stopping).

In [64]:
# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [19]:
# Vocabulary and encoding
def build_vocab(texts, min_freq=1):
    tokens = [w for text in texts for w in text.split()]
    counter = Counter(tokens)
    vocab = {w: i+2 for i, (w, c) in enumerate(counter.items()) if c >= min_freq}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

def encode(text, vocab):
    return [vocab.get(w, vocab['<UNK>']) for w in text.split()]

def pad_sequence(seq, maxlen):
    seq = seq[:maxlen]
    return seq + [0] * (maxlen - len(seq))

In [20]:
# Custom PyTorch Dataset
class TextDataset(Dataset):
    def __init__(self, df, vocab, maxlen=50):
        self.encoded = np.stack([pad_sequence(encode(t, vocab), maxlen) for t in df['final_text']])
        self.labels = df['sentiment'].values.astype(np.float32)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return (
            torch.tensor(self.encoded[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float32)
        )

In [21]:
# Define the LSTM Classifier
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128, num_layers=1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
    def forward(self, x):
        x = self.embed(x)
        _, (h, _) = self.lstm(x)
        out = h[-1]
        return self.fc(out).squeeze(1)  # logits (no sigmoid)

In [22]:
# Combine text for vocab building
maxlen = 50

all_texts = pd.concat([train_df['final_text'], test_df['final_text']])
vocab = build_vocab(all_texts)
vocab_size = len(vocab)

train_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.2,
    stratify=train_df['sentiment'],
    random_state=42
)
train_ds = TextDataset(train_df.iloc[train_idx], vocab, maxlen)
val_ds = TextDataset(train_df.iloc[val_idx], vocab, maxlen)
test_ds = TextDataset(test_df, vocab, maxlen)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32)
test_loader = DataLoader(test_ds, batch_size=32)

In [23]:
# Training loop with early stopping
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNClassifier(vocab_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()  # use logits for stability

best_val_loss = float('inf')
patience, patience_counter = 3, 0
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    train_losses = []
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    train_loss = np.mean(train_losses)

    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits, y)
            val_losses.append(loss.item())
    val_loss = np.mean(val_losses)
    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f} val_loss={val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_rnn.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping!")
            break


Epoch 1: train_loss=0.5551 val_loss=0.4932
Epoch 2: train_loss=0.4732 val_loss=0.4724
Epoch 3: train_loss=0.4577 val_loss=0.4635
Epoch 4: train_loss=0.4453 val_loss=0.4637
Epoch 5: train_loss=0.4176 val_loss=0.4562
Epoch 6: train_loss=0.3588 val_loss=0.5669
Epoch 7: train_loss=0.3153 val_loss=0.4528
Epoch 8: train_loss=0.2683 val_loss=0.5022
Epoch 9: train_loss=0.2236 val_loss=0.6225
Epoch 10: train_loss=0.1862 val_loss=0.5364
Early stopping!


In [24]:
# Load best model and evaluate on test set
model.load_state_dict(torch.load('best_rnn.pt'))

model.eval()
test_preds = []
with torch.no_grad():
    for x, _ in test_loader:
        x = x.to(device)
        logits = model(x)
        probs = torch.sigmoid(logits)
        test_preds.extend(probs.cpu().numpy())

# Convert probabilities to binary predictions (threshold = 0.5)
test_pred_labels = [1 if p >= 0.5 else 0 for p in test_preds]

In [25]:
true_labels = test_df['sentiment'].values

# Evaluate
print("Confusion Matrix:")
print(confusion_matrix(true_labels, test_pred_labels))

print("\nClassification Report:")
print(classification_report(true_labels, test_pred_labels, target_names=['True', 'False']))

accuracy = accuracy_score(true_labels, test_pred_labels)
print(f"\nAccuracy: {accuracy:.4f}")

Confusion Matrix:
[[ 27  69]
 [ 64 340]]

Classification Report:
              precision    recall  f1-score   support

        True       0.30      0.28      0.29        96
       False       0.83      0.84      0.84       404

    accuracy                           0.73       500
   macro avg       0.56      0.56      0.56       500
weighted avg       0.73      0.73      0.73       500


Accuracy: 0.7340


# 6. Train LSTM using pre-trained GloVe embeddings.

In [26]:
from google.colab import files
uploaded = files.upload() # importing glove.6B.100d.txt: https://nlp.stanford.edu/projects/glove/

Saving glove.6B.100d.txt to glove.6B.100d (1).txt


In [27]:
# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(42)

In [28]:
# Load GloVe embeddings
def load_glove(path, embedding_dim=100):
    embeddings = {}
    with open(path, encoding="utf8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word = parts[0]
            vec = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vec
    return embeddings

glove_path = "glove.6B.100d.txt"
embedding_dim = 100
glove = load_glove(glove_path, embedding_dim)


In [29]:
# Build vocab and embedding matrix
def build_vocab(texts, min_freq=1):
    tokens = [w for text in texts for w in text.split()]
    counter = Counter(tokens)
    vocab = {w: i+2 for i, (w, c) in enumerate(counter.items()) if c >= min_freq}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

all_texts = pd.concat([train_df['final_text'], test_df['final_text']])
vocab = build_vocab(all_texts)
vocab_size = len(vocab)

embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)
for word, idx in vocab.items():
    if word in glove:
        embedding_matrix[idx] = glove[word]
    elif word == '<PAD>':
        embedding_matrix[idx] = np.zeros(embedding_dim)
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

In [None]:
# Encoding and Dataset
def encode(text, vocab):
    return [vocab.get(w, vocab['<UNK>']) for w in text.split()]

def pad_sequence(seq, maxlen):
    seq = seq[:maxlen]
    return seq + [0] * (maxlen - len(seq))

class TextDataset(Dataset):
    def __init__(self, df, vocab, maxlen=50):
        self.encoded = np.stack([pad_sequence(encode(t, vocab), maxlen) for t in df['final_text']])
        self.labels = df['sentiment'].values.astype(np.float32)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return (
            torch.tensor(self.encoded[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float32)
        )


In [30]:
# Data split and DataLoader
maxlen = 50
train_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.2,
    stratify=train_df['sentiment'],
    random_state=42
)
train_ds = TextDataset(train_df.iloc[train_idx], vocab, maxlen)
val_ds = TextDataset(train_df.iloc[val_idx], vocab, maxlen)
test_ds = TextDataset(test_df, vocab, maxlen)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32)
test_loader = DataLoader(test_ds, batch_size=32)



In [31]:
# RNN Classifier with pre-trained embeddings (fine tuned)
class RNNClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=128, num_layers=1):
        super().__init__()
        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.embed.weight.data.copy_(torch.tensor(embedding_matrix))
        self.embed.weight.requires_grad = True
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
    def forward(self, x):
        x = self.embed(x)
        _, (h, _) = self.lstm(x)
        out = h[-1]
        return self.fc(out).squeeze(1)

In [32]:
# Training loop with early stopping
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNClassifier(embedding_matrix).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

best_val_loss = float('inf')
patience, patience_counter = 3, 0
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    train_losses = []
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    train_loss = np.mean(train_losses)

    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits, y)
            val_losses.append(loss.item())
    val_loss = np.mean(val_losses)
    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f} val_loss={val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_rnn_glove.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping!")
            break



Epoch 1: train_loss=0.5573 val_loss=0.4823
Epoch 2: train_loss=0.4755 val_loss=0.4731
Epoch 3: train_loss=0.4643 val_loss=0.4636
Epoch 4: train_loss=0.4484 val_loss=0.4602
Epoch 5: train_loss=0.4712 val_loss=0.4761
Epoch 6: train_loss=0.4704 val_loss=0.4747
Epoch 7: train_loss=0.4498 val_loss=0.5154
Early stopping!


In [33]:
# Evaluation on test set
model.load_state_dict(torch.load('best_rnn_glove.pt'))
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        logits = model(x)
        probs = torch.sigmoid(logits).cpu().numpy()
        preds = (probs > 0.5).astype(int)
        all_preds.extend(preds)
        all_labels.extend(y.numpy())

print("Test Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, target_names=['False', 'True']))

Test Accuracy: 0.818
              precision    recall  f1-score   support

       False       0.65      0.11      0.19        96
        True       0.82      0.99      0.90       404

    accuracy                           0.82       500
   macro avg       0.74      0.55      0.55       500
weighted avg       0.79      0.82      0.76       500



# 7. Pre-trained Transformer (Zero-shot Inference)

In [34]:
# Choose a model for tweets
sentiment_pipe = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Apply to a few samples
print(sentiment_pipe("Bitcoin is mooning!"))
print(sentiment_pipe("The market looks terrible."))

# Apply to dataset
test_df['transformer_pred'] = test_df['final_text'].apply(lambda x: sentiment_pipe(x)[0]['label'])
test_df['transformer_score'] = test_df['final_text'].apply(lambda x: sentiment_pipe(x)[0]['score'])

# Convert model labels
test_df['transformer_pred_num'] = test_df['transformer_pred'].map({'POSITIVE': 1, 'NEGATIVE': 0})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.9930391907691956}]
[{'label': 'NEGATIVE', 'score': 0.9997046589851379}]


# 8. Fine-tune DistilBERT(with Hugging Face Transformers)

In [37]:
# Map sentiment to integer labels
train, val = train_test_split(train_df, test_size=0.2, stratify=train_df['sentiment'], random_state=42) # Split train_df into train and validation, stratified by label
train["labels"] = train["sentiment"].astype(int)
val["labels"] = val["sentiment"].astype(int)
test_df["labels"] = test_df["sentiment"].astype(int)
# Oversample the "False" class (label==0) in training data- otherweise predict all als true
df_majority = train[train['labels'] == 1]
df_minority = train[train['labels'] == 0]
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # match majority
                                 random_state=42)
train_balanced = pd.concat([df_majority, df_minority_upsampled])
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

train_dataset = Dataset.from_pandas(train_balanced[['final_text', 'labels']])
val_dataset = Dataset.from_pandas(val[['final_text', 'labels']])
test_dataset = Dataset.from_pandas(test_df[['final_text', 'labels']])



In [40]:
# Tokenize
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    return tokenizer(batch['final_text'], truncation=True, padding='max_length', max_length=64)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Training
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="steps",
    logging_steps=100,
    save_strategy="no",
    learning_rate=2e-5,
    report_to="none"
)


Map:   0%|          | 0/1952 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# 7. Training
trainer.train()

Step,Training Loss,Validation Loss
100,0.5596,0.445214


TrainOutput(global_step=122, training_loss=0.5391808885042785, metrics={'train_runtime': 683.3048, 'train_samples_per_second': 2.857, 'train_steps_per_second': 0.179, 'total_flos': 32322045272064.0, 'train_loss': 0.5391808885042785, 'epoch': 1.0})

In [42]:
# Get predictions on test set
preds_output = trainer.predict(test_dataset)
test_preds = np.argmax(preds_output.predictions, axis=1)



              precision    recall  f1-score   support

           0       0.41      0.79      0.54        96
           1       0.94      0.73      0.82       404

    accuracy                           0.74       500
   macro avg       0.67      0.76      0.68       500
weighted avg       0.83      0.74      0.77       500

Confusion Matrix:
[[ 76  20]
 [110 294]]


# 9. Summary

In [57]:
summary = []

# VADER
vader_acc = accuracy_score(test_df['sentiment'], test_df['vader_sentiment_pred'])
vader_prec, vader_rec, vader_f1, _ = precision_recall_fscore_support(
    test_df['sentiment'], test_df['vader_sentiment_pred'], average='binary')
summary.append({
    'Approach': 'VADER',
    'Accuracy': vader_acc,
    'Precision': vader_prec,
    'Recall': vader_rec,
    'F1': vader_f1,
})

In [58]:
# TF-IDF + Logistic Regression
tfidf_acc = accuracy_score(test_df['labels'], test_preds)
tfidf_prec, tfidf_rec, tfidf_f1, _ = precision_recall_fscore_support(
    test_df['labels'], test_preds, average='binary')
summary.append({
    'Approach': 'TF-IDF + LogReg',
    'Accuracy': tfidf_acc,
    'Precision': tfidf_prec,
    'Recall': tfidf_rec,
    'F1': tfidf_f1,
})

In [59]:
# LSTM (own embeddings)
lstm_acc = accuracy_score(true_labels, test_pred_labels)
lstm_prec, lstm_rec, lstm_f1, _ = precision_recall_fscore_support(
    true_labels, test_pred_labels, average='binary')
summary.append({
    'Approach': 'LSTM (embeddings)',
    'Accuracy': lstm_acc,
    'Precision': lstm_prec,
    'Recall': lstm_rec,
    'F1': lstm_f1,
})

In [60]:
# LSTM (pre-trained GloVe)
lstm_glove_acc = accuracy_score(all_labels, all_preds)
lstm_glove_prec, lstm_glove_rec, lstm_glove_f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='binary')
summary.append({
    'Approach': 'LSTM (GloVe)',
    'Accuracy': lstm_glove_acc,
    'Precision': lstm_glove_prec,
    'Recall': lstm_glove_rec,
    'F1': lstm_glove_f1,
})

In [61]:
# Pre-trained Transformer
transformer_acc = accuracy_score(test_df['labels'], test_df['transformer_pred_num'])
transformer_prec, transformer_rec, transformer_f1, _ = precision_recall_fscore_support(
    test_df['labels'], test_df['transformer_pred_num'], average='binary')
summary.append({
    'Approach': 'Transformer (pipeline)',
    'Accuracy': transformer_acc,
    'Precision': transformer_prec,
    'Recall': transformer_rec,
    'F1': transformer_f1,
})

In [62]:
# Fine-tune DistilBERT
distilbert_acc = accuracy_score(test_df['labels'], test_preds)
distilbert_prec, distilbert_rec, distilbert_f1, _ = precision_recall_fscore_support(
    test_df['labels'], test_preds, average='binary')
summary.append({
    'Approach': 'DistilBERT (fine-tune)',
    'Accuracy': distilbert_acc,
    'Precision': distilbert_prec,
    'Recall': distilbert_rec,
    'F1': distilbert_f1,
})

In [63]:
# Create summary table
summary_df = pd.DataFrame(summary)
print(summary_df)

                 Approach  Accuracy  Precision    Recall        F1
0                   VADER     0.830   0.875294  0.920792  0.897467
1         TF-IDF + LogReg     0.740   0.936306  0.727723  0.818942
2       LSTM (embeddings)     0.734   0.831296  0.841584  0.836408
3            LSTM (GloVe)     0.818   0.824017  0.985149  0.897407
4  Transformer (pipeline)     0.440   0.955882  0.321782  0.481481
5  DistilBERT (fine-tune)     0.740   0.936306  0.727723  0.818942
