# CountVectorizer Models — split first (Kaggle)

RandomOverSampler is applied only to the training set, after the train/test split, so no oversampled duplicates leak into the test set. Data is loaded from Kaggle via kagglehub.


In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Install and import kagglehub
# kagglehub is the one dependency this notebook installs on demand.
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    # Only a missing package should trigger an install; any other failure
    # raised while importing is left to propagate instead of being masked
    # by a pip run.
    import subprocess
    import sys

    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "kagglehub[pandas-datasets]"]
    )
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

# Load Kaggle dataset
file_path = "twitter_sentiment_data.csv"
# Recent kagglehub releases name this entry point `dataset_load` and emit a
# DeprecationWarning for `load_dataset`; fall back to the old name so older
# installs keep working.
load_kaggle_dataset = getattr(kagglehub, "dataset_load", None) or kagglehub.load_dataset
df = load_kaggle_dataset(
    KaggleDatasetAdapter.PANDAS,
    "edqian/twitter-climate-change-sentiment-dataset",
    file_path,
)

# Select required columns by exact name
df = df[["message", "sentiment"]]

# Drop sentiment '2' (supports both numeric 2 and string '2')
if df["sentiment"].dtype.kind in {"i", "u", "f"}:
    keep_rows = df["sentiment"] != 2
else:
    keep_rows = df["sentiment"].astype(str) != "2"

# .copy() makes the filtered frame independent of the loaded one, so later
# column assignments on `df` do not raise SettingWithCopyWarning. The original
# row index is kept as-is (no reset).
df = df[keep_rows].copy()

df.head()

  df = kagglehub.load_dataset(


Unnamed: 0,message,sentiment
0,@tiniebeany climate change is an interesting h...,-1
1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,1
2,Fabulous! Leonardo #DiCaprio's film on #climat...,1
3,RT @Mick_Fanning: Just watched this amazing do...,1
5,Unamshow awache kujinga na iko global warming ...,0


In [3]:
# NLTK prerequisites
# Corpora needed by the stop-word filter and the WordNet lemmatizer.
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)
# Kept as the cell's trailing expression so its return value is still displayed.
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /Users/nafis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nafis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nafis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Preprocess and split
_PREPROCESS_TOOLS = None  # lazily built (stop-word set, stemmer, lemmatizer)


def _get_preprocess_tools():
    """Return the shared (stop-word set, stemmer, lemmatizer), building it once."""
    global _PREPROCESS_TOOLS
    if _PREPROCESS_TOOLS is None:
        # Built on first use rather than at definition time so the NLTK
        # corpora only need to be present when preprocessing actually runs.
        _PREPROCESS_TOOLS = (
            frozenset(stopwords.words("english")),
            PorterStemmer(),
            WordNetLemmatizer(),
        )
    return _PREPROCESS_TOOLS


def preprocess(text):
    """Normalize one message into a space-joined string of processed tokens.

    Steps, in order: coerce to ``str``; replace every character outside
    ``a-z``/``A-Z`` with a space; lowercase; split on whitespace; drop English
    stop words; Porter-stem each token; WordNet-lemmatize each stemmed token.

    Parameters
    ----------
    text : object
        The raw message; anything accepted by ``str()``.

    Returns
    -------
    str
        The processed tokens joined by single spaces (empty string when no
        token survives).
    """
    # The stop-word list, stemmer and lemmatizer do not depend on `text`;
    # reusing them avoids re-reading the stop-word corpus for every row.
    stop_words, stemmer, lemmatizer = _get_preprocess_tools()
    letters_only = re.sub("[^a-zA-Z]", " ", str(text)).lower()
    # NOTE(review): lemmatizing tokens that are already stemmed probably changes
    # little; confirm that applying both is intended.
    return " ".join(
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in letters_only.split()
        if token not in stop_words
    )


# `df` was produced by boolean filtering; building a new frame with assign()
# avoids writing a column into that slice (SettingWithCopyWarning) while still
# leaving `df["message"]` holding the processed text.
# NOTE(review): re-running this cell preprocesses already-processed text again;
# restart and run all cells rather than re-executing it in place.
df = df.assign(message=df["message"].apply(preprocess))
X = df["message"]
y = df["sentiment"]
# Split before vectorizing and oversampling so the test set stays untouched.
# NOTE(review): the split is not stratified; with imbalanced classes consider
# passing stratify=y (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Vectorize (CountVectorizer)
count_vectorizer = CountVectorizer()
# Tuple elements are evaluated left to right: the vocabulary is learned from
# the training messages first, then the test messages are encoded with it.
X_train_count, X_test_count = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)

In [6]:
# Oversample training set only
# The test matrix is deliberately not resampled; it is evaluated as split.
oversampler = RandomOverSampler(random_state=42)
X_train_res, y_train_res = oversampler.fit_resample(X=X_train_count, y=y_train)
# Display the resampled training shape next to the unchanged test shape.
(X_train_res.shape, X_test_count.shape)

((55170, 44806), (6934, 44806))

## Models


In [None]:
# Logistic Regression
# Trained on the oversampled training matrix, scored on the original test split.
logreg = LogisticRegression(max_iter=500, random_state=42).fit(
    X_train_res, y_train_res
)
y_pred = logreg.predict(X_test_count)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.62      0.61      0.62       824
           0       0.55      0.59      0.57      1538
           1       0.86      0.84      0.85      4572

    accuracy                           0.76      6934
   macro avg       0.68      0.68      0.68      6934
weighted avg       0.76      0.76      0.76      6934



In [8]:
# Random Forest
# Seeded like the oversampler and the logistic regression so that the report
# below is reproducible on a fresh Restart & Run All.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_res, y_train_res)
y_pred = rfc.predict(X_test_count)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.75      0.43      0.55       824
           0       0.56      0.53      0.55      1538
           1       0.81      0.89      0.85      4572

    accuracy                           0.76      6934
   macro avg       0.71      0.62      0.65      6934
weighted avg       0.75      0.76      0.75      6934



In [9]:
# Linear SVM
# Trained on the oversampled training matrix, scored on the original test split.
svm = SVC(kernel="linear").fit(X_train_res, y_train_res)
y_pred = svm.predict(X_test_count)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.58      0.55      0.57       824
           0       0.54      0.53      0.53      1538
           1       0.84      0.85      0.84      4572

    accuracy                           0.74      6934
   macro avg       0.65      0.64      0.65      6934
weighted avg       0.74      0.74      0.74      6934



In [10]:
# Decision Tree (avoid deprecated max_features='auto')
# max_features="sqrt" makes the tree consider a random subset of features at
# each split, so the estimator is seeded to keep the report reproducible.
dt = DecisionTreeClassifier(
    max_depth=None,
    max_features="sqrt",
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)
dt.fit(X_train_res, y_train_res)
y_pred = dt.predict(X_test_count)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.42      0.38      0.40       824
           0       0.46      0.49      0.47      1538
           1       0.79      0.78      0.79      4572

    accuracy                           0.67      6934
   macro avg       0.56      0.55      0.55      6934
weighted avg       0.67      0.67      0.67      6934



In [11]:
# Multinomial Naive Bayes (valid for non-negative sparse counts)
# Trained on the oversampled training matrix, scored on the original test split.
nb = MultinomialNB().fit(X_train_res, y_train_res)
y_pred = nb.predict(X_test_count)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.44      0.73      0.55       824
           0       0.55      0.44      0.49      1538
           1       0.85      0.81      0.83      4572

    accuracy                           0.72      6934
   macro avg       0.61      0.66      0.62      6934
weighted avg       0.74      0.72      0.72      6934

