# Word2Vec Models — split first (Kaggle)

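The data is split into train and test sets first. Word2Vec is then trained on the training tokens only, each document is represented by the average of its word vectors, and random oversampling is applied to the training set only. Data is loaded from Kaggle via kagglehub.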


In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from gensim.models import Word2Vec

In [3]:
# Install and import kagglehub
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    # Only a missing package is worth a pip install; any other failure should
    # surface instead of being hidden behind an installation attempt.
    import subprocess
    import sys

    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "kagglehub[pandas-datasets]"]
    )
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

# Load Kaggle dataset
file_path = "twitter_sentiment_data.csv"
# Recent kagglehub releases expose the loader as `dataset_load` and emit a
# deprecation warning for `load_dataset`; prefer the new name when it exists
# and fall back to the old one on older installs.
load_kaggle_dataset = getattr(kagglehub, "dataset_load", None) or kagglehub.load_dataset
df = load_kaggle_dataset(
    KaggleDatasetAdapter.PANDAS,
    "edqian/twitter-climate-change-sentiment-dataset",
    file_path,
)

# Select required columns by exact name
df = df[["message", "sentiment"]]

# Drop sentiment '2' (supports both numeric 2 and string '2')
if df["sentiment"].dtype.kind in {"i", "u", "f"}:
    df = df[df["sentiment"] != 2]
else:
    df = df[df["sentiment"].astype(str) != "2"]

# The filtered frame is a slice of the loaded data. Take an explicit copy so
# the later in-place column assignment works on an independent frame (this
# avoids SettingWithCopyWarning on pandas versions that emit it).
df = df.copy()

df.head()

  df = kagglehub.load_dataset(


Unnamed: 0,message,sentiment
0,@tiniebeany climate change is an interesting h...,-1
1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,1
2,Fabulous! Leonardo #DiCaprio's film on #climat...,1
3,RT @Mick_Fanning: Just watched this amazing do...,1
5,Unamshow awache kujinga na iko global warming ...,0


In [4]:
# Fetch the NLTK resources that the preprocessing step relies on.
# Each call is a no-op when the package is already up to date locally.
nltk.download("stopwords")  # word list read via stopwords.words("english")
nltk.download("wordnet")  # WordNet data backing WordNetLemmatizer
nltk.download("omw-1.4")  # presumably needed alongside WordNet by some NLTK versions - TODO confirm

[nltk_data] Downloading package stopwords to /Users/nafis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nafis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nafis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
# Preprocess and split
import functools

# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")


@functools.lru_cache(maxsize=1)
def _text_tools():
    """Build the stopword set, stemmer and lemmatizer once and reuse them.

    Creating these per document is pure overhead: the stopword list is
    re-read and two helper objects are re-created for every message. The
    cache keeps a single shared set of tools, built lazily on first use.
    """
    return (
        frozenset(stopwords.words("english")),
        PorterStemmer(),
        WordNetLemmatizer(),
    )


def preprocess(text):
    """Normalize one message into a space-separated string of tokens.

    Steps, in order: coerce to ``str``, replace every non-letter character
    with a space, lowercase, split on whitespace, drop English stopwords,
    Porter-stem each remaining token and then lemmatize the stemmed token.

    Args:
        text: The raw message; any value is accepted and passed through
            ``str()`` first.

    Returns:
        The processed tokens joined by single spaces (an empty string when
        no token survives).
    """
    stop_words, stemmer, lemmatizer = _text_tools()
    tokens = _NON_LETTERS.sub(" ", str(text)).lower().split()
    return " ".join(
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in stop_words
    )


# Clean every message in place, then hold out 20% of the rows as the test set.
df["message"] = df["message"].apply(preprocess)
X, y = df["message"], df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is repeatable
)

In [6]:
# Tokenize: split each cleaned message on whitespace into a list of words
X_train_tokens = [message.split() for message in X_train]
X_test_tokens = [message.split() for message in X_test]

In [7]:
# Fit Word2Vec on the training tokens only, so no test vocabulary leaks into
# the embeddings; the averaged document vectors are built from this model.
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
key_index = w2v_model.wv.key_to_index  # word -> index mapping, used for membership tests
vec_size = w2v_model.vector_size  # dimensionality of each word vector


def doc_avg_vector(doc):
    """Return the mean Word2Vec vector of the in-vocabulary words of *doc*.

    Words missing from ``key_index`` are skipped. When no word of the
    document is known (including an empty document) a zero vector of length
    ``vec_size`` is returned.

    Args:
        doc: An iterable of token strings.

    Returns:
        A 1-D array of length ``vec_size``.
    """
    emb = [w2v_model.wv[w] for w in doc if w in key_index]
    if not emb:
        # gensim keeps its word vectors in float32. Use the same dtype for the
        # fallback so that one empty document does not upcast the whole
        # stacked feature matrix to float64.
        return np.zeros(vec_size, dtype=np.float32)
    return np.mean(emb, axis=0)


# One averaged vector per document, stacked into a 2-D feature matrix per split.
X_train_w2v = np.array(list(map(doc_avg_vector, X_train_tokens)))
X_test_w2v = np.array(list(map(doc_avg_vector, X_test_tokens)))

# Random oversampling is fitted on the training vectors only; the test
# vectors are left untouched.
oversampler = RandomOverSampler(random_state=42)
X_train_res, y_train_res = oversampler.fit_resample(X_train_w2v, y_train)

## Models


In [8]:
# Logistic Regression: fit on the oversampled training vectors, report on the test vectors
logreg = LogisticRegression(max_iter=500, C=0.1).fit(X_train_res, y_train_res)
y_pred = logreg.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.26      0.42      0.32       824
           0       0.45      0.38      0.41      1538
           1       0.79      0.74      0.77      4572

    accuracy                           0.62      6934
   macro avg       0.50      0.51      0.50      6934
weighted avg       0.65      0.62      0.63      6934



In [None]:
# Random Forest: 100 trees with a fixed seed, fit on the oversampled training vectors
rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_res, y_train_res
)
y_pred = rfc.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.48      0.16      0.24       824
           0       0.55      0.38      0.45      1538
           1       0.75      0.92      0.83      4572

    accuracy                           0.71      6934
   macro avg       0.59      0.49      0.51      6934
weighted avg       0.67      0.71      0.67      6934



In [10]:
# Linear SVM: support vector classifier with a linear kernel
svm = SVC(kernel="linear").fit(X_train_res, y_train_res)
y_pred = svm.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.28      0.44      0.34       824
           0       0.46      0.42      0.44      1538
           1       0.80      0.74      0.77      4572

    accuracy                           0.63      6934
   macro avg       0.51      0.53      0.52      6934
weighted avg       0.66      0.63      0.65      6934



In [11]:
# Decision Tree (avoid deprecated max_features='auto')
# With max_features="sqrt" the tree considers a random subset of features at
# each split, so its result varies between runs unless the seed is fixed.
# random_state=42 matches the seed used for the split, the oversampler and the
# random forest, making this report repeatable for a given feature matrix.
dt = DecisionTreeClassifier(
    max_depth=None,
    max_features="sqrt",
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)
dt.fit(X_train_res, y_train_res)
y_pred = dt.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.26      0.23      0.24       824
           0       0.40      0.38      0.39      1538
           1       0.74      0.77      0.76      4572

    accuracy                           0.62      6934
   macro avg       0.47      0.46      0.46      6934
weighted avg       0.61      0.62      0.61      6934

