# Word2Vec Models — split first, then oversample

This notebook mirrors OG but oversamples only the training set and averages per-document Word2Vec embeddings.


In [9]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from gensim.models import Word2Vec

In [10]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/anoop-vs/nlp-climate-change/refs/heads/main/main_data.csv"
)
df.head()

Unnamed: 0.1,Unnamed: 0,content,label
0,0,Researchers use deep learning to simulate chlo...,Positive
1,1,Why is our @Conservatives government so evil?\...,Negative
2,2,"Texas Oilfield Waste Company Contributed $53,7...",Negative
3,3,Epic California snowpack is now the deepest it...,Negative
4,4,If #climatechange is real and not a hoax why d...,Negative


In [11]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /Users/nafis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nafis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nafis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
def preprocess(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    words = text.split()
    sw = set(stopwords.words("english"))
    words = [w for w in words if w not in sw]
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    return " ".join(words)


df["content"] = df["content"].apply(preprocess)
X = df["content"]
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# Tokenize
X_train_tokens = X_train.apply(str.split).tolist()
X_test_tokens = X_test.apply(str.split).tolist()

In [14]:
# Oversample training set only (use averaged vectors)
# Train a W2V model on the training tokens to avoid leakage; then build per-doc vectors, then oversample.
w2v_model = Word2Vec(X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

# Helper to compute an averaged embedding for a tokenized document, skipping OOV tokens
vec_size = w2v_model.vector_size
key_index = w2v_model.wv.key_to_index


def doc_avg_vector(doc):
    emb = [w2v_model.wv[w] for w in doc if w in key_index]
    return np.mean(emb, axis=0) if len(emb) else np.zeros(vec_size)


X_train_w2v = np.array([doc_avg_vector(doc) for doc in X_train_tokens])
X_test_w2v = np.array([doc_avg_vector(doc) for doc in X_test_tokens])

oversampler = RandomOverSampler(random_state=42)
X_train_res, y_train_res = oversampler.fit_resample(X_train_w2v, y_train)

## Models


In [15]:
# Logistic Regression
logreg = LogisticRegression(C=0.1)
logreg.fit(X_train_res, y_train_res)
y_pred = logreg.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.25      0.35      0.29       154
     Neutral       0.50      0.29      0.37       378
    Positive       0.44      0.56      0.49       350

    accuracy                           0.41       882
   macro avg       0.40      0.40      0.38       882
weighted avg       0.43      0.41      0.40       882



In [16]:
# Random Forest
rfc = RandomForestClassifier()
rfc.fit(X_train_res, y_train_res)
y_pred = rfc.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.36      0.23      0.28       154
     Neutral       0.50      0.48      0.49       378
    Positive       0.48      0.58      0.52       350

    accuracy                           0.47       882
   macro avg       0.45      0.43      0.43       882
weighted avg       0.47      0.47      0.47       882



In [17]:
# Linear SVM
svm = SVC(kernel="linear")
svm.fit(X_train_res, y_train_res)
y_pred = svm.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.35      0.25      0.29       154
     Neutral       0.50      0.41      0.45       378
    Positive       0.45      0.60      0.52       350

    accuracy                           0.46       882
   macro avg       0.44      0.42      0.42       882
weighted avg       0.46      0.46      0.45       882



In [18]:
# Decision Tree (OG-style)
dt = DecisionTreeClassifier(
    max_depth=None, max_features="auto", min_samples_leaf=1, min_samples_split=2
)
dt.fit(X_train_res, y_train_res)
y_pred = dt.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.23      0.19      0.21       154
     Neutral       0.46      0.47      0.46       378
    Positive       0.43      0.45      0.44       350

    accuracy                           0.41       882
   macro avg       0.37      0.37      0.37       882
weighted avg       0.41      0.41      0.41       882



