# Word2Vec Models â€” split first (Kaggle)

Oversample only the training set and average per-document Word2Vec embeddings. Data is loaded from Kaggle via kagglehub.


In [10]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from gensim.models import Word2Vec

In [11]:
# Install and import kagglehub
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except Exception:
    import sys, subprocess

    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "kagglehub[pandas-datasets]"]
    )
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

# Load Kaggle dataset
file_path = "twitter_sentiment_data.csv"
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "edqian/twitter-climate-change-sentiment-dataset",
    file_path,
)

# Select required columns by exact name
df = df[["message", "sentiment"]]

# Drop sentiment '2' (supports both numeric 2 and string '2')
if df["sentiment"].dtype.kind in {"i", "u", "f"}:
    df = df[df["sentiment"] != 2]
else:
    df = df[df["sentiment"].astype(str) != "2"]

df.head()

  df = kagglehub.load_dataset(


Unnamed: 0,message,sentiment
0,@tiniebeany climate change is an interesting h...,-1
1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,1
2,Fabulous! Leonardo #DiCaprio's film on #climat...,1
3,RT @Mick_Fanning: Just watched this amazing do...,1
5,Unamshow awache kujinga na iko global warming ...,0


In [12]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /Users/nafis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nafis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nafis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
# Preprocess and split
def preprocess(text):
    text = re.sub("[^a-zA-Z]", " ", str(text))
    text = text.lower()
    words = text.split()
    sw = set(stopwords.words("english"))
    words = [w for w in words if w not in sw]
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    return " ".join(words)


df["message"] = df["message"].apply(preprocess)
X = df["message"]
y = df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
# Tokenize
X_train_tokens = X_train.apply(str.split).tolist()
X_test_tokens = X_test.apply(str.split).tolist()

In [15]:
# Train Word2Vec only on training tokens, then build averaged vectors (OOV-safe)
w2v_model = Word2Vec(X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)
vec_size = w2v_model.vector_size
key_index = w2v_model.wv.key_to_index


def doc_avg_vector(doc):
    emb = [w2v_model.wv[w] for w in doc if w in key_index]
    return np.mean(emb, axis=0) if len(emb) else np.zeros(vec_size)


X_train_w2v = np.array([doc_avg_vector(doc) for doc in X_train_tokens])
X_test_w2v = np.array([doc_avg_vector(doc) for doc in X_test_tokens])
oversampler = RandomOverSampler(random_state=42)
X_train_res, y_train_res = oversampler.fit_resample(X_train_w2v, y_train)

## Models


In [19]:
# Logistic Regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_res, y_train_res)
y_pred = logreg.predict(X_test_w2v)
print("Logistic Regression:")
print(classification_report(y_test, y_pred))

Logistic Regression:
              precision    recall  f1-score   support

          -1       0.26      0.41      0.32       824
           0       0.45      0.41      0.43      1538
           1       0.79      0.74      0.77      4572

    accuracy                           0.62      6934
   macro avg       0.50      0.52      0.50      6934
weighted avg       0.65      0.62      0.64      6934



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Random Forest
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_res, y_train_res)
y_pred = rfc.predict(X_test_w2v)
print("Random Forest:")
print(classification_report(y_test, y_pred))

Random Forest:
              precision    recall  f1-score   support

          -1       0.51      0.19      0.27       824
           0       0.55      0.38      0.45      1538
           1       0.75      0.91      0.82      4572

    accuracy                           0.71      6934
   macro avg       0.60      0.49      0.52      6934
weighted avg       0.68      0.71      0.68      6934



In [18]:
# Multinomial Naive Bayes
# Note: Multinomial NB requires non-negative features
# We need to shift the Word2Vec embeddings to be non-negative
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test_w2v)

nb = MultinomialNB()
nb.fit(X_train_scaled, y_train_res)
y_pred = nb.predict(X_test_scaled)
print("Multinomial Naive Bayes:")
print(classification_report(y_test, y_pred))

Multinomial Naive Bayes:
              precision    recall  f1-score   support

          -1       0.20      0.39      0.27       824
           0       0.32      0.16      0.21      1538
           1       0.76      0.77      0.76      4572

    accuracy                           0.59      6934
   macro avg       0.43      0.44      0.41      6934
weighted avg       0.59      0.59      0.58      6934

