In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/navin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_df = pd.read_csv('datasets/train.csv')
test_df = pd.read_csv('datasets/test.csv')

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)            # URLs
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)            # Mentions
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)            # Hashtags
    text = re.sub(r"[^\w\s]", "", text)                   # Punctuation
    text = re.sub(r"\d+", "", text)                       # Numbers
    text = " ".join([word for word in text.split() if word not in stop_words])  # Stopwords
    return text

train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)

In [4]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X = vectorizer.fit_transform(train_df['clean_text'])
X_test = vectorizer.transform(test_df['clean_text'])
y = train_df['target']

In [5]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [7]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier()
}

best_model = None
best_f1 = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    f1 = f1_score(y_val, preds)
    print(f"{name} → Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_model = model
    print("Best Model:", best_model.__class__.__name__, "with F1 Score:", best_f1)


Logistic Regression → Accuracy: 0.8109 | F1 Score: 0.7555
Best Model: LogisticRegression with F1 Score: 0.7555178268251274


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost → Accuracy: 0.7873 | F1 Score: 0.7226
Best Model: LogisticRegression with F1 Score: 0.7555178268251274
[LightGBM] [Info] Number of positive: 2617, number of negative: 3473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7495
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 589
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.429721 -> initscore=-0.282990
[LightGBM] [Info] Start training from score -0.282990
LightGBM → Accuracy: 0.7866 | F1 Score: 0.7325
Best Model: LogisticRegression with F1 Score: 0.7555178268251274


