In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [18]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

In [19]:
# Load the data into a Pandas dataframe
df = pd.read_csv(
    "https://raw.githubusercontent.com/anoop-vs/nlp-climate-change/refs/heads/main/main_data.csv"
)

In [20]:
# download stopwords and wordnet lemmatizer
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /Users/nafis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nafis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nafis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [21]:
def preprocess(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    words = text.split()
    stopword_list = set(stopwords.words("english"))
    words = [w for w in words if w not in stopword_list]
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    text = " ".join(words)
    return text

In [22]:
df["content"] = df["content"].apply(preprocess)

In [23]:
from imblearn.over_sampling import RandomOverSampler

In [24]:
oversampler = RandomOverSampler(
    sampling_strategy={"Negative": 1842, "Positive": 1822, "Neutral": 1842}
)

In [25]:
X = df.drop("label", axis=1)
y = df["label"]

In [26]:
X_resampled, y_resampled = oversampler.fit_resample(X, y)

In [27]:
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

In [28]:
X = df_resampled["content"]

In [29]:
y = df_resampled["label"]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [31]:
w2v_model = Word2Vec(X, vector_size=100, window=5, min_count=1, workers=4)
# Vectorize the data using Word2Vec
X_train_w2v = np.array(
    [np.mean([w2v_model.wv[word] for word in document], axis=0) for document in X_train]
)
X_test_w2v = np.array(
    [np.mean([w2v_model.wv[word] for word in document], axis=0) for document in X_test]
)

### WORD2VEC - RANDOM FOREST


In [32]:
rclf = RandomForestClassifier()
rclf.fit(X_train_w2v, y_train)

In [33]:
y_pred = rclf.predict(X_test_w2v)

In [34]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    Negative       0.90      0.89      0.89       389
     Neutral       0.67      0.64      0.65       370
    Positive       0.63      0.67      0.65       343

    accuracy                           0.74      1102
   macro avg       0.73      0.73      0.73      1102
weighted avg       0.74      0.74      0.74      1102



### WORD2VEC - LOGISTIC REGRESSION


In [35]:
lr_model = LogisticRegression(C=0.1, solver="lbfgs")
lr_model.fit(X_train_w2v, y_train)
y_pred = lr_model.predict(X_test_w2v)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.3874773139745917


In [36]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    Negative       0.43      0.50      0.46       389
     Neutral       0.36      0.14      0.20       370
    Positive       0.36      0.52      0.42       343

    accuracy                           0.39      1102
   macro avg       0.38      0.39      0.36      1102
weighted avg       0.38      0.39      0.36      1102



WORD2VEC - DECISION TREE


In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
}
dtc = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dtc, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_w2v, y_train)
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best hyperparameters:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score:  0.5974099680115572


In [38]:
dt_model = DecisionTreeClassifier(
    max_depth=None, max_features="auto", min_samples_leaf=1, min_samples_split=2
)
dt_model.fit(X_train_w2v, y_train)
y_pred = dt_model.predict(X_test_w2v)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6252268602540835




In [39]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    Negative       0.76      0.89      0.82       389
     Neutral       0.55      0.51      0.53       370
    Positive       0.50      0.45      0.47       343

    accuracy                           0.63      1102
   macro avg       0.61      0.62      0.61      1102
weighted avg       0.61      0.63      0.62      1102



### WORD2VEC - SVM


In [40]:
classifier = SVC(kernel="linear")
classifier.fit(X_train_w2v, y_train)
y_pred = classifier.predict(X_test_w2v)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.40834845735027225


In [41]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    Negative       0.46      0.50      0.48       389
     Neutral       0.38      0.16      0.22       370
    Positive       0.38      0.58      0.46       343

    accuracy                           0.41      1102
   macro avg       0.40      0.41      0.39      1102
weighted avg       0.41      0.41      0.39      1102

