# CountVectorizer Models — split first (Kaggle)

RandomOverSampler is applied only on the training set after train/test split. Data is loaded from Kaggle via kagglehub.


In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Install and import kagglehub
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except Exception:
    import sys, subprocess

    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "kagglehub[pandas-datasets]"]
    )
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

# Load Kaggle dataset
file_path = "twitter_sentiment_data.csv"
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "edqian/twitter-climate-change-sentiment-dataset",
    file_path,
)

# Select required columns by exact name
df = df[["message", "sentiment"]]

# Drop sentiment '2' (supports both numeric 2 and string '2')
if df["sentiment"].dtype.kind in {"i", "u", "f"}:
    df = df[df["sentiment"] != 2]
else:
    df = df[df["sentiment"].astype(str) != "2"]

df.head()

  from .autonotebook import tqdm as notebook_tqdm
  df = kagglehub.load_dataset(


Unnamed: 0,message,sentiment
0,@tiniebeany climate change is an interesting h...,-1
1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,1
2,Fabulous! Leonardo #DiCaprio's film on #climat...,1
3,RT @Mick_Fanning: Just watched this amazing do...,1
5,Unamshow awache kujinga na iko global warming ...,0


In [3]:
# NLTK prerequisites
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /Users/nafis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nafis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nafis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Preprocess and split
def preprocess(text):
    text = re.sub("[^a-zA-Z]", " ", str(text))
    text = text.lower()
    words = text.split()
    sw = set(stopwords.words("english"))
    words = [w for w in words if w not in sw]
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    return " ".join(words)


df["message"] = df["message"].apply(preprocess)
X = df["message"]
y = df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Vectorize (CountVectorizer)
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [6]:
# Oversample training set only
oversampler = RandomOverSampler(random_state=42)
X_train_res, y_train_res = oversampler.fit_resample(X_train_count, y_train)
X_train_res.shape, X_test_count.shape

((55170, 44806), (6934, 44806))

## Models


In [7]:
# Logistic Regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_res, y_train_res)
y_pred = logreg.predict(X_test_count)
print("Logistic Regression:")
print(classification_report(y_test, y_pred))

Logistic Regression:
              precision    recall  f1-score   support

          -1       0.60      0.60      0.60       824
           0       0.55      0.59      0.57      1538
           1       0.86      0.84      0.85      4572

    accuracy                           0.75      6934
   macro avg       0.67      0.68      0.67      6934
weighted avg       0.76      0.75      0.76      6934



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Random Forest
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_res, y_train_res)
y_pred = rfc.predict(X_test_count)
print("Random Forest:")
print(classification_report(y_test, y_pred))

Random Forest:
              precision    recall  f1-score   support

          -1       0.76      0.44      0.55       824
           0       0.57      0.54      0.55      1538
           1       0.81      0.89      0.85      4572

    accuracy                           0.76      6934
   macro avg       0.71      0.62      0.65      6934
weighted avg       0.75      0.76      0.75      6934



In [None]:
# Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_res, y_train_res)
y_pred = nb.predict(X_test_count)
print("Multinomial Naive Bayes:")
print(classification_report(y_test, y_pred))

Multinomial Naive Bayes:
              precision    recall  f1-score   support

          -1       0.44      0.73      0.55       824
           0       0.55      0.44      0.49      1538
           1       0.85      0.81      0.83      4572

    accuracy                           0.72      6934
   macro avg       0.61      0.66      0.62      6934
weighted avg       0.74      0.72      0.72      6934



In [12]:
# Summary of Model Performance Results
import pandas as pd

# Results from CountVectorizer Models
countvectorizer_results = {
    'Model': ['Logistic Regression', 'Random Forest', 'Multinomial NB', 'Ridge Regression'],
    'Accuracy': [0.75, 0.76, 0.72, 0.64],
    'Macro F1': [0.67, 0.65, 0.62, 0.42],
    'Weighted F1': [0.76, 0.75, 0.72, 0.63],
    'Macro Precision': [0.67, 0.71, 0.61, 0.41],
    'Macro Recall': [0.68, 0.62, 0.66, 0.48],
    'Weighted Precision': [0.76, 0.75, 0.74, 0.65],
    'Weighted Recall': [0.75, 0.76, 0.72, 0.64]
}

cv_df = pd.DataFrame(countvectorizer_results)
print("CountVectorizer Models Performance:")
print(cv_df.to_string(index=False))
print("\nBest CountVectorizer Model: Random Forest (Accuracy: 76%)")

CountVectorizer Models Performance:
              Model  Accuracy  Macro F1  Weighted F1  Macro Precision  Macro Recall  Weighted Precision  Weighted Recall
Logistic Regression      0.75      0.67         0.76             0.67          0.68                0.76             0.75
      Random Forest      0.76      0.65         0.75             0.71          0.62                0.75             0.76
     Multinomial NB      0.72      0.62         0.72             0.61          0.66                0.74             0.72
   Ridge Regression      0.64      0.42         0.63             0.41          0.48                0.65             0.64

Best CountVectorizer Model: Random Forest (Accuracy: 76%)


In [13]:
# COMPREHENSIVE MODEL PERFORMANCE COMPARISON
import pandas as pd

print("="*80)
print("BEST MODELS PERFORMANCE COMPARISON ACROSS ALL THREE APPROACHES")
print("="*80)

# Create comprehensive comparison of best models from each approach
comparison_data = {
    'Feature Extraction Method': ['CountVectorizer', 'TF-IDF', 'Word2Vec'],
    'Best Model': ['Random Forest', 'Random Forest', 'TBD'],
    'Accuracy': [0.76, 0.76, 'TBD'],
    'Macro F1-Score': [0.65, 0.65, 'TBD'],
    'Weighted F1-Score': [0.75, 0.74, 'TBD'],
    'Macro Precision': [0.71, 0.72, 'TBD'],
    'Macro Recall': [0.62, 0.61, 'TBD'],
    'Weighted Precision': [0.75, 0.75, 'TBD'],
    'Weighted Recall': [0.76, 0.76, 'TBD']
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))

print("\n" + "="*80)
print("DETAILED PERFORMANCE BY FEATURE EXTRACTION METHOD")
print("="*80)

print("\n1. CountVectorizer Models:")
print("-" * 40)
cv_results = {
    'Model': ['Logistic Regression', 'Random Forest', 'Multinomial NB', 'Ridge Regression'],
    'Accuracy': [0.75, 0.76, 0.72, 0.64],
    'F1 (Macro)': [0.67, 0.65, 0.62, 0.42],
    'F1 (Weighted)': [0.76, 0.75, 0.72, 0.63],
    'Precision (Macro)': [0.67, 0.71, 0.61, 0.41],
    'Recall (Macro)': [0.68, 0.62, 0.66, 0.48]
}
cv_df = pd.DataFrame(cv_results)
print(cv_df.to_string(index=False))

print("\n2. TF-IDF Models:")
print("-" * 40)
tfidf_results = {
    'Model': ['Logistic Regression', 'Random Forest', 'Multinomial NB', 'Ridge Regression'],
    'Accuracy': [0.75, 0.76, 0.72, 0.63],
    'F1 (Macro)': [0.67, 0.65, 0.63, 0.42],
    'F1 (Weighted)': [0.75, 0.74, 0.72, 0.62],
    'Precision (Macro)': [0.66, 0.72, 0.62, 0.41],
    'Recall (Macro)': [0.68, 0.61, 0.67, 0.49]
}
tfidf_df = pd.DataFrame(tfidf_results)
print(tfidf_df.to_string(index=False))

print("\n" + "="*80)
print("KEY FINDINGS:")
print("="*80)
print("• Best Overall Accuracy: 76% (Random Forest with both CountVectorizer and TF-IDF)")
print("• Best F1-Score (Macro): 67% (Logistic Regression with both methods)")
print("• Best F1-Score (Weighted): 76% (Logistic Regression with CountVectorizer)")
print("• Most Consistent Performer: Random Forest (consistently high accuracy)")
print("• CountVectorizer vs TF-IDF: Very similar performance across all models")
print("• Ridge Regression: Poorest performer across all metrics")
print("="*80)

BEST MODELS PERFORMANCE COMPARISON ACROSS ALL THREE APPROACHES
Feature Extraction Method    Best Model Accuracy Macro F1-Score Weighted F1-Score Macro Precision Macro Recall Weighted Precision Weighted Recall
          CountVectorizer Random Forest     0.76           0.65              0.75            0.71         0.62               0.75            0.76
                   TF-IDF Random Forest     0.76           0.65              0.74            0.72         0.61               0.75            0.76
                 Word2Vec           TBD      TBD            TBD               TBD             TBD          TBD                TBD             TBD

DETAILED PERFORMANCE BY FEATURE EXTRACTION METHOD

1. CountVectorizer Models:
----------------------------------------
              Model  Accuracy  F1 (Macro)  F1 (Weighted)  Precision (Macro)  Recall (Macro)
Logistic Regression      0.75        0.67           0.76               0.67            0.68
      Random Forest      0.76        0.65        