In [2]:
import pandas as pd

# Load datasets
train_df = pd.read_csv("train-balanced-sarcasm.csv", low_memory=False)
test_df = pd.read_csv("test-balanced.csv", low_memory=False)

# Check shape
print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)


Train set shape: (1010826, 10)
Test set shape: (32332, 3)


In [3]:
print("Train columns:", train_df.columns.tolist())
print("Nulls in train:\n", train_df.isnull().sum())


Train columns: ['label', 'comment', 'author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment']
Nulls in train:
 label              0
comment           55
author             0
subreddit          0
score              0
ups                0
downs              0
date               0
created_utc        0
parent_comment     0
dtype: int64


In [4]:
train_df

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...
...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,..."
1010822,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...
1010823,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...
1010824,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...


In [5]:
# Drop rows with null comments
train_df = train_df.dropna(subset=["comment"]).reset_index(drop=True)
print("Cleaned train shape:", train_df.shape)


Cleaned train shape: (1010771, 10)


In [6]:
print(train_df['label'].value_counts())


label
0    505403
1    505368
Name: count, dtype: int64


In [11]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    text = str(text).lower()                                      # Lowercase
    text = re.sub(r'<.*?>', '', text)                             # Remove HTML tags
    text = re.sub(r"http\S+|www\S+", '', text)                    # Remove URLs
    text = re.sub(r"\d+", '', text)                               # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)                                  # Tokenize
    tokens = [stemmer.stem(w) for w in tokens if w not in stop_words and len(w) > 1]  # Remove stopwords and stem
    return ' '.join(tokens)

# Apply it to the dataset
train_df['clean_comment'] = train_df['comment'].apply(preprocess)


In [12]:
from collections import Counter
from spellchecker import SpellChecker

spell = SpellChecker()

def find_misspelled_words(text):
    words = text.split()
    misspelled = spell.unknown(words)
    return list(misspelled)

# Step 1: Add misspelled words column
train_df['misspelled_words'] = train_df['clean_comment'].apply(find_misspelled_words)

# Step 2: Count of misspelled words per row
train_df['misspelled_count'] = train_df['misspelled_words'].apply(len)

# Step 3: Filter rows where misspelled words exist
misspelled_rows = train_df[train_df['misspelled_count'] > 0]

# Step 4: Combine all misspelled words into one list and count frequency
all_misspelled = [word for sublist in misspelled_rows['misspelled_words'] for word in sublist]
misspelled_freq = Counter(all_misspelled)

# Step 5: Display top 20 most frequent misspellings
print("🔍 Top 20 misspelled words and their counts:\n")
for word, count in misspelled_freq.most_common(20):
    print(f"{word}: {count}")

# Optional: Preview rows with misspelled words
print("\n Sample rows with misspellings:")
print(misspelled_rows[['comment', 'clean_comment', 'misspelled_words', 'misspelled_count']].head(10))


🔍 Top 20 misspelled words and their counts:

dont: 42482
peopl: 34981
im: 32715
realli: 18879
didnt: 14846
doesnt: 13293
tri: 12393
isnt: 11580
someth: 10518
probabl: 10359
someon: 10141
pretti: 10063
everyon: 9730
theyr: 9558
everi: 9396
mayb: 9278
alway: 8774
lol: 8200
anyth: 7597
clearli: 7321

 Sample rows with misspellings:
                                              comment  \
0                                          NC and NH.   
2   They were underdogs earlier today, but since G...   
3   This meme isn't funny none of the "new york ni...   
5   I don't pay attention to her, but as long as s...   
6       Trick or treating in general is just weird...   
7                     Blade Mastery+Masamune or GTFO!   
8   You don't have to, you have a good build, buy ...   
9                   I would love to see him at lolla.   
10  I think a significant amount would be against ...   
14  Ayy bb wassup, it makes a bit more sense in co...   

                                        c

In [13]:
import emoji


# Function to extract emojis from text
def extract_emojis(text):
    emoji_list = [char for char in text if emoji.is_emoji(char)]
    return emoji_list

# Apply on original 'comment' column
train_df['emojis'] = train_df['comment'].apply(extract_emojis)

# Count number of emojis
train_df['emoji_count'] = train_df['emojis'].apply(len)

# Filter rows with emojis
emojified = train_df[train_df['emoji_count'] > 0]

# Print results
if emojified.empty:
    print("No emojis found in the dataset.")
else:
    print("Sample rows with emojis:\n")
    print(emojified[['comment', 'emojis', 'emoji_count']].head(10))


No emojis found in the dataset.


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Just to count unique words
cv = CountVectorizer()
cv.fit(train_df['clean_comment'])

vocab_size = len(cv.vocabulary_)
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 152296


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=20000)
X_tfidf = vectorizer.fit_transform(train_df['clean_comment'])

print("TF-IDF matrix shape:", X_tfidf.shape)


TF-IDF matrix shape: (1010771, 20000)


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Define TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=20000)

# Split data into train/test (if not already done)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    train_df['clean_comment'], 
    train_df['label'],     # replace 'label' with your target column name
    test_size=0.2, 
    random_state=42,
    stratify=train_df['label']
)

# Fit on training and transform both train and test
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Optional: Check dimensions
print(f"TF-IDF Train Shape: {X_train_tfidf.shape}")
print(f"TF-IDF Test Shape: {X_test_tfidf.shape}")


TF-IDF Train Shape: (808616, 20000)
TF-IDF Test Shape: (202155, 20000)


In [17]:
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score

# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=20000)
X = tfidf.fit_transform(train_df['clean_comment'])

# Label column (change if yours is different)
y = train_df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression
lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)

# Predict
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Probabilities (for log loss)
y_train_prob = lr.predict_proba(X_train)
y_test_prob = lr.predict_proba(X_test)

# Average Train Metrics
train_loss = log_loss(y_train, y_train_prob)
train_acc = accuracy_score(y_train, y_train_pred)
train_prec = precision_score(y_train, y_train_pred, average='weighted')
train_rec = recall_score(y_train, y_train_pred, average='weighted')
train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Average Test Metrics
test_loss = log_loss(y_test, y_test_prob)
test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred, average='weighted')
test_rec = recall_score(y_test, y_test_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

# Final output format
print(f"Average Train Metrics:\nLoss: {train_loss:.4f} | Acc: {train_acc:.4f} | Prec: {train_prec:.4f} | Rec: {train_rec:.4f} | F1: {train_f1:.4f}")
print(f"\nAverage Test Metrics:\nLoss: {test_loss:.4f} | Acc: {test_acc:.4f} | Prec: {test_prec:.4f} | Rec: {test_rec:.4f} | F1: {test_f1:.4f}")


Average Train Metrics:
Loss: 0.5897 | Acc: 0.6848 | Prec: 0.6863 | Rec: 0.6848 | F1: 0.6841

Average Test Metrics:
Loss: 0.6010 | Acc: 0.6732 | Prec: 0.6748 | Rec: 0.6732 | F1: 0.6725


In [18]:
from sklearn.svm import LinearSVC

# Train SVM
svm_clf = LinearSVC()
svm_clf.fit(X_train_tfidf, y_train)

# Predict
y_train_pred = svm_clf.predict(X_train_tfidf)
y_test_pred = svm_clf.predict(X_test_tfidf)

# Probabilities (for log loss)
# LinearSVC doesn't support predict_proba, so log loss can't be directly calculated.
# Use decision_function instead and clip it to get pseudo-probabilities
import numpy as np
def safe_log_loss(y_true, y_scores):
    probs = 1 / (1 + np.exp(-y_scores))  # Sigmoid
    probs = np.vstack([1 - probs, probs]).T
    return log_loss(y_true, probs)

# Metrics
def compute_metrics(y_true, y_pred, y_scores):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    loss = safe_log_loss(y_true, y_scores)
    return loss, acc, prec, rec, f1

train_loss, train_acc, train_prec, train_rec, train_f1 = compute_metrics(y_train, y_train_pred, svm_clf.decision_function(X_train_tfidf))
test_loss, test_acc, test_prec, test_rec, test_f1 = compute_metrics(y_test, y_test_pred, svm_clf.decision_function(X_test_tfidf))

# Display results
print(f"Average Train Metrics:\nLoss: {train_loss:.4f} | Acc: {train_acc:.4f} | Prec: {train_prec:.4f} | Rec: {train_rec:.4f} | F1: {train_f1:.4f}")
print(f"\nAverage Test Metrics:\nLoss: {test_loss:.4f} | Acc: {test_acc:.4f} | Prec: {test_prec:.4f} | Rec: {test_rec:.4f} | F1: {test_f1:.4f}")


Average Train Metrics:
Loss: 0.6843 | Acc: 0.5512 | Prec: 0.5498 | Rec: 0.5649 | F1: 0.5572

Average Test Metrics:
Loss: 0.6961 | Acc: 0.4979 | Prec: 0.4982 | Rec: 0.5129 | F1: 0.5054


In [19]:
from sklearn.naive_bayes import MultinomialNB


# Train the model
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

# Predict
y_train_pred = nb.predict(X_train_tfidf)
y_test_pred = nb.predict(X_test_tfidf)

# Probabilities for log loss
y_train_proba = nb.predict_proba(X_train_tfidf)
y_test_proba = nb.predict_proba(X_test_tfidf)

# Calculate metrics
train_loss = log_loss(y_train, y_train_proba)
test_loss = log_loss(y_test, y_test_proba)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

train_prec = precision_score(y_train, y_train_pred, average='weighted')
test_prec = precision_score(y_test, y_test_pred, average='weighted')

train_rec = recall_score(y_train, y_train_pred, average='weighted')
test_rec = recall_score(y_test, y_test_pred, average='weighted')

train_f1 = f1_score(y_train, y_train_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

# Print formatted results
print(f"\nAverage Train Metrics:\nLoss: {train_loss:.4f} | Acc: {train_acc:.4f} | Prec: {train_prec:.4f} | Rec: {train_rec:.4f} | F1: {train_f1:.4f}")
print(f"\nAverage Test Metrics:\nLoss: {test_loss:.4f} | Acc: {test_acc:.4f} | Prec: {test_prec:.4f} | Rec: {test_rec:.4f} | F1: {test_f1:.4f}")



Average Train Metrics:
Loss: 0.6838 | Acc: 0.5504 | Prec: 0.5504 | Rec: 0.5504 | F1: 0.5503

Average Test Metrics:
Loss: 0.6967 | Acc: 0.4990 | Prec: 0.4990 | Rec: 0.4990 | F1: 0.4989


In [20]:
from sklearn.ensemble import RandomForestClassifier

# Initialize RF with reasonable defaults (can tune later)
rf = RandomForestClassifier(n_estimators=30, max_depth=20, random_state=42, n_jobs=-1)

# Train
rf.fit(X_train_tfidf, y_train)

# Predict
y_train_pred = rf.predict(X_train_tfidf)
y_test_pred = rf.predict(X_test_tfidf)

# Probabilities for log loss
y_train_proba = rf.predict_proba(X_train_tfidf)
y_test_proba = rf.predict_proba(X_test_tfidf)

# Calculate metrics
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score

train_loss = log_loss(y_train, y_train_proba)
test_loss = log_loss(y_test, y_test_proba)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

train_prec = precision_score(y_train, y_train_pred, average='weighted')
test_prec = precision_score(y_test, y_test_pred, average='weighted')

train_rec = recall_score(y_train, y_train_pred, average='weighted')
test_rec = recall_score(y_test, y_test_pred, average='weighted')

train_f1 = f1_score(y_train, y_train_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

# Display
print(f"\nAverage Train Metrics:\nLoss: {train_loss:.4f} | Acc: {train_acc:.4f} | Prec: {train_prec:.4f} | Rec: {train_rec:.4f} | F1: {train_f1:.4f}")
print(f"\nAverage Test Metrics:\nLoss: {test_loss:.4f} | Acc: {test_acc:.4f} | Prec: {test_prec:.4f} | Rec: {test_rec:.4f} | F1: {test_f1:.4f}")



Average Train Metrics:
Loss: 0.6927 | Acc: 0.5113 | Prec: 0.5440 | Rec: 0.5113 | F1: 0.3996

Average Test Metrics:
Loss: 0.6932 | Acc: 0.5001 | Prec: 0.5013 | Rec: 0.5001 | F1: 0.3854


In [21]:
from xgboost import XGBClassifier


# Initialize XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', verbosity=0, random_state=42)

# Fit the model
xgb_model.fit(X_train_tfidf, y_train)

# Predict
train_preds = xgb_model.predict(X_train_tfidf)
test_preds = xgb_model.predict(X_test_tfidf)

# Probabilities for log loss
train_probs = xgb_model.predict_proba(X_train_tfidf)
test_probs = xgb_model.predict_proba(X_test_tfidf)

# Evaluation metrics
def get_metrics(y_true, y_pred, y_prob):
    loss = log_loss(y_true, y_prob)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return loss, acc, prec, rec, f1

train_loss, train_acc, train_prec, train_rec, train_f1 = get_metrics(y_train, train_preds, train_probs)
test_loss, test_acc, test_prec, test_rec, test_f1 = get_metrics(y_test, test_preds, test_probs)

# Print formatted output
print("Average Train Metrics:")
print(f"Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | Prec: {train_prec:.4f} | Rec: {train_rec:.4f} | F1: {train_f1:.4f}\n")

print("Average Test Metrics:")
print(f"Loss: {test_loss:.4f} | Acc: {test_acc:.4f} | Prec: {test_prec:.4f} | Rec: {test_rec:.4f} | F1: {test_f1:.4f}")


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


Average Train Metrics:
Loss: 0.6900 | Acc: 0.5164 | Prec: 0.5867 | Rec: 0.1104 | F1: 0.1858

Average Test Metrics:
Loss: 0.6936 | Acc: 0.4999 | Prec: 0.5010 | Rec: 0.0931 | F1: 0.1571


In [22]:
from gensim.models import KeyedVectors

# Load Google News Word2Vec - 3 million words, 300 dimensions
w2v_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(w2v_path, binary=True)


In [23]:
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import torch

# 1. Tokenize the text
def tokenize(text):
    return text.split()

tokenized_texts = [tokenize(text) for text in train_df['clean_comment']]

# 2. Build vocabulary (limit to top 20,000)
vocab = build_vocab_from_iterator(tokenized_texts, specials=["<pad>", "<unk>"], max_tokens=20000)
vocab.set_default_index(vocab["<unk>"])  # For OOV tokens

# 3. Convert tokens to indices
indexed_texts = [torch.tensor(vocab(tokens)) for tokens in tokenized_texts]

# 4. Pad sequences
max_len = 100
padded_seqs = pad_sequence(indexed_texts, batch_first=True, padding_value=vocab["<pad>"])
if padded_seqs.size(1) > max_len:
    padded_seqs = padded_seqs[:, :max_len]
else:
    pad_width = max_len - padded_seqs.size(1)
    padded_seqs = torch.nn.functional.pad(padded_seqs, (0, pad_width), value=vocab["<pad>"])

# 5. Labels
labels = torch.tensor(train_df['label'].values)


In [24]:
# torchtext vocab object already built
word_index = vocab.get_stoi()  # returns: {'the': 0, 'a': 1, ...}


In [25]:
import numpy as np

embedding_dim = 300
num_words = min(20000, len(word_index))

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= 20000:
        continue
    if word in word2vec:
        embedding_matrix[i] = word2vec[word]


In [26]:

from torch.utils.data import TensorDataset, DataLoader

# already created padded_seqs and labels in previous step
# make sure they are torch tensors
X_tensor = padded_seqs.long()
y_tensor = labels.long()

# Dataset and DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)


In [27]:
import pickle

with open('padded_sequences.pkl', 'wb') as f:
    pickle.dump(padded_seqs, f)

with open('labels.npy', 'wb') as f:
    np.save(f, labels)

with open('embedding_matrix.npy', 'wb') as f:
    np.save(f, embedding_matrix)

In [28]:
import pickle

with open('word_index.pkl', 'wb') as f:
    pickle.dump(word_index, f)

