In [1]:
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

In [3]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/anoop-vs/nlp-climate-change/refs/heads/main/main_data.csv"
)

In [4]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /Users/nafis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nafis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nafis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
def preprocess(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    words = text.split()
    stopword_list = set(stopwords.words("english"))
    words = [w for w in words if w not in stopword_list]
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    text = " ".join(words)
    return text

In [6]:
df["content"] = df["content"].apply(preprocess)

In [None]:
X = df["content"]
y = df["label"]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# OG-style: Oversample BEFORE splitting with fixed class counts
oversampler = RandomOverSampler(
    sampling_strategy={"Negative": 1842, "Positive": 1822, "Neutral": 1842}
)

# Ensure X is a single-column DataFrame named 'content'
if isinstance(X, pd.Series):
    X_df = X.to_frame(name="content")
else:
    # If X is already a DataFrame, use the 'content' column if it exists, otherwise take the first column
    if "content" in X.columns:
        X_df = X[["content"]]
    else:
        X_df = X.iloc[:, [0]].copy()
        X_df.columns = ["content"]

X_resampled, y_resampled = oversampler.fit_resample(X_df, y)

# Recreate dataframe and reset X, y to resampled
X = X_resampled["content"]
y = y_resampled

# Train a single Word2Vec model on the full resampled corpus (OG behavior)
X_corpus_tokens = X.apply(str.split).tolist()
w2v_model = Word2Vec(X_corpus_tokens, vector_size=100, window=5, min_count=1, workers=4)

### TFIDF-WORD2VEC - LOGISTIC REGRESSION


In [14]:
tfidf_vectorizer = TfidfVectorizer()

In [23]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Vectorize the data using Word2Vec (reuse pretrained on full resampled corpus)
X_train_tokens = X_train.apply(str.split).tolist()
X_test_tokens = X_test.apply(str.split).tolist()
X_train_w2v = np.array(
    [
        np.mean([w2v_model.wv[word] for word in document], axis=0)
        for document in X_train_tokens
    ]
)
X_test_w2v = np.array(
    [
        np.mean([w2v_model.wv[word] for word in document], axis=0)
        for document in X_test_tokens
    ]
)

In [26]:
X_train_combined = hstack([X_train_tfidf, X_train_w2v])
X_test_combined = hstack([X_test_tfidf, X_test_w2v])

In [27]:
lrtw = LogisticRegression()
lrtw.fit(X_train_combined, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
y_pred = lrtw.predict(X_test_combined)

In [29]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    Negative       0.83      0.92      0.87       389
     Neutral       0.77      0.66      0.71       370
    Positive       0.74      0.76      0.75       343

    accuracy                           0.78      1102
   macro avg       0.78      0.78      0.78      1102
weighted avg       0.78      0.78      0.78      1102



### TFIDF-WORD2VEC - RANDOM FOREST


In [30]:
rfctw = RandomForestClassifier()
rfctw.fit(X_train_combined, y_train)

In [31]:
y_pred = rfctw.predict(X_test_combined)

In [32]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    Negative       0.85      0.88      0.87       389
     Neutral       0.60      0.52      0.55       370
    Positive       0.58      0.64      0.61       343

    accuracy                           0.68      1102
   macro avg       0.68      0.68      0.68      1102
weighted avg       0.68      0.68      0.68      1102



### TFIDF-WORD2VEC - SVM


In [33]:
svmtW = SVC(C=10, gamma=10, kernel="linear")
svmtW.fit(X_train_combined, y_train)
y_pred = svmtW.predict(X_test_combined)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    Negative       0.91      0.98      0.95       389
     Neutral       0.92      0.85      0.88       370
    Positive       0.89      0.88      0.88       343

    accuracy                           0.91      1102
   macro avg       0.91      0.90      0.90      1102
weighted avg       0.91      0.91      0.91      1102



### TFIDF-WORD2VEC - DECISION TREE


In [None]:
dttW = DecisionTreeClassifier(
    max_depth=None, max_features="auto", min_samples_leaf=1, min_samples_split=2
)
dttW.fit(X_train_combined, y_train)
y_pred = dttW.predict(X_test_combined)
report = classification_report(y_test, y_pred)
print(report)

### TFIDF-WORD2VEC - NAIVE BAYES


In [None]:
nbtW = MultinomialNB()
# MultinomialNB requires non-negative features; W2V introduces negatives
if hasattr(X_train_combined, "data") and X_train_combined.data.min() < 0:
    print(
        "Skipping MultinomialNB: negative values present in combined features (due to Word2Vec). Not applicable."
    )
else:
    nbtW.fit(X_train_combined, y_train)
    y_pred = nbtW.predict(X_test_combined)
    report = classification_report(y_test, y_pred)
    print(report)

Skipping MultinomialNB: negative values present in combined features (due to Word2Vec). Not applicable.
