# TF-IDF Models — split first, then oversample

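This notebook mirrors the original ("OG") notebook, but applies `RandomOverSampler` only to the training set, after the train/test split, so that duplicated minority-class samples cannot leak into the test set.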


In [1]:
import pandas as pd

import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Load the labelled text dataset straight from the project's GitHub repository.
DATA_URL = (
    "https://raw.githubusercontent.com/anoop-vs/nlp-climate-change/refs/heads/main/main_data.csv"
)
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0.1,Unnamed: 0,content,label
0,0,Researchers use deep learning to simulate chlo...,Positive
1,1,Why is our @Conservatives government so evil?\...,Negative
2,2,"Texas Oilfield Waste Company Contributed $53,7...",Negative
3,3,Epic California snowpack is now the deepest it...,Negative
4,4,If #climatechange is real and not a hoax why d...,Negative


In [3]:
# Fetch the NLTK resources used by the preprocessing step: the stop-word list
# and the WordNet data for WordNetLemmatizer (omw-1.4 is presumably a WordNet
# companion corpus required by this NLTK version — confirm if trimming).
NLTK_RESOURCES = ("stopwords", "wordnet", "omw-1.4")
download_status = [nltk.download(resource) for resource in NLTK_RESOURCES]
download_status[-1]  # cell output: status of the final download, as before

[nltk_data] Downloading package stopwords to /Users/nafis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nafis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nafis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Lazily-filled cache of the NLTK helpers. Loading the stop-word list and
# constructing the stemmer/lemmatizer is identical for every document, so it is
# done once on first use instead of once per row.
_PREPROCESS_TOOLS = {}


def _preprocess_tools():
    """Return the cached (stop_words, stemmer, lemmatizer) triple, building it on first call."""
    if not _PREPROCESS_TOOLS:
        _PREPROCESS_TOOLS["stop_words"] = frozenset(stopwords.words("english"))
        _PREPROCESS_TOOLS["stemmer"] = PorterStemmer()
        _PREPROCESS_TOOLS["lemmatizer"] = WordNetLemmatizer()
    return (
        _PREPROCESS_TOOLS["stop_words"],
        _PREPROCESS_TOOLS["stemmer"],
        _PREPROCESS_TOOLS["lemmatizer"],
    )


def preprocess(text):
    """Normalise one raw document into a space-separated string of tokens.

    Steps, in order: replace every non-ASCII-letter character with a space,
    lowercase, split on whitespace, drop English stop words, Porter-stem each
    remaining word, then WordNet-lemmatize the stem.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. the NaN pandas uses for an
        empty CSV cell, or None) is treated as an empty document.

    Returns
    -------
    str
        The processed tokens joined by single spaces; "" when nothing remains.
    """
    # re.sub raises TypeError on non-strings, which would abort the whole
    # Series.apply on a single missing cell.
    if not isinstance(text, str):
        return ""

    stop_words, stemmer, lemmatizer = _preprocess_tools()
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    # Stemming and lemmatizing are per-token operations, so doing both in one
    # pass gives the same result as two separate passes over the list.
    return " ".join(
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in stop_words
    )


# Clean every document (overwriting the raw text column), then hold out 20% of
# the rows for evaluation with a fixed seed so the split is repeatable.
df["content"] = df["content"].map(preprocess)
X, y = df["content"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# The vectorizer is fitted on the training text only; the test text is merely
# transformed with that fitted vectorizer.
tfidf = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf.fit_transform(X_train),
    tfidf.transform(X_test),
)

In [6]:
# Oversample the training matrix only; the test matrix is left untouched.
oversampler = RandomOverSampler(random_state=42)
(X_train_res, y_train_res) = oversampler.fit_resample(
    X_train_tfidf,
    y_train,
)
(X_train_res.shape, X_test_tfidf.shape)

((4416, 12100), (882, 12100))

## Models


In [7]:
# Logistic Regression
# Trained on the oversampled training matrix, scored on the untouched test split.
logreg = LogisticRegression(solver="liblinear", C=1000).fit(X_train_res, y_train_res)
y_pred = logreg.predict(X_test_tfidf)
print(
    classification_report(y_test, y_pred)
)

              precision    recall  f1-score   support

    Negative       0.79      0.77      0.78       154
     Neutral       0.86      0.87      0.86       378
    Positive       0.86      0.86      0.86       350

    accuracy                           0.85       882
   macro avg       0.84      0.83      0.83       882
weighted avg       0.85      0.85      0.85       882



In [8]:
# Random Forest
# A random forest is stochastic (bootstrap samples + random feature subsets),
# so it is seeded with the same value as the split and the oversampler to make
# the reported metrics reproducible under Restart & Run All.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_res, y_train_res)
y_pred = rfc.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.82      0.63      0.71       154
     Neutral       0.78      0.84      0.81       378
    Positive       0.81      0.83      0.82       350

    accuracy                           0.80       882
   macro avg       0.81      0.77      0.78       882
weighted avg       0.80      0.80      0.80       882



In [9]:
# Linear SVM
# Fit on the oversampled training matrix, then report per-class metrics on the
# held-out test split.
svm = SVC(kernel="linear").fit(X_train_res, y_train_res)
y_pred = svm.predict(X_test_tfidf)
print(
    classification_report(y_test, y_pred)
)

              precision    recall  f1-score   support

    Negative       0.69      0.70      0.70       154
     Neutral       0.79      0.79      0.79       378
    Positive       0.80      0.79      0.79       350

    accuracy                           0.77       882
   macro avg       0.76      0.76      0.76       882
weighted avg       0.77      0.77      0.77       882



In [10]:
# Decision Tree (OG-style)
# max_features="auto" was removed from DecisionTreeClassifier in scikit-learn 1.3
# and now raises; for classifiers it was an alias of "sqrt", so "sqrt" keeps the
# original behaviour. Because only a random subset of features is considered at
# each split, the tree is seeded to make the reported metrics reproducible.
dt = DecisionTreeClassifier(
    max_depth=None,
    max_features="sqrt",
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)
dt.fit(X_train_res, y_train_res)
y_pred = dt.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.63      0.49      0.55       154
     Neutral       0.69      0.68      0.69       378
    Positive       0.65      0.72      0.68       350

    accuracy                           0.66       882
   macro avg       0.65      0.63      0.64       882
weighted avg       0.66      0.66      0.66       882





In [11]:
# Multinomial Naive Bayes
# Fit with default settings on the oversampled training matrix and evaluate on
# the held-out test split.
nb = MultinomialNB().fit(X_train_res, y_train_res)
y_pred = nb.predict(X_test_tfidf)
print(
    classification_report(y_test, y_pred)
)

              precision    recall  f1-score   support

    Negative       0.48      0.71      0.58       154
     Neutral       0.82      0.69      0.75       378
    Positive       0.78      0.75      0.77       350

    accuracy                           0.72       882
   macro avg       0.69      0.72      0.70       882
weighted avg       0.74      0.72      0.72       882

