In [None]:
import pandas as pd
import numpy as np
import os

# Path to the raw CSV, resolved relative to the current working directory.
csv_path = os.path.join(
    "..",
    "data",
    "raw",
    "Juggernaut Sentiment Analysis - by kaggle user Adeoluwa Adeboye.csv"
)

# Malformed rows are still dropped so the load succeeds, but 'warn' reports
# every dropped line instead of discarding it silently, so the amount of
# data lost at load time is visible in the cell output.
data = pd.read_csv(csv_path, on_bad_lines='warn')
data

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...
...,...,...,...,...
1578607,1578623,1,Sentiment140,Zzzzzz.... Finally! Night tweeters!
1578608,1578624,1,Sentiment140,"Zzzzzzz, sleep well people"
1578609,1578625,0,Sentiment140,ZzzZzZzzzZ... wait no I have homework.
1578610,1578626,0,Sentiment140,"ZzZzzzZZZZzzz meh, what am I doing up again?"


In [4]:
# Keep only the tweet text and its sentiment label, under shorter column names.
df = (
    data
    .loc[:, ['SentimentText', 'Sentiment']]
    .rename(columns={'SentimentText': 'text', 'Sentiment': 'label'})
)
df

Unnamed: 0,text,label
0,is so sad for my APL frie...,0
1,I missed the New Moon trail...,0
2,omg its already 7:30 :O,1
3,.. Omgaga. Im sooo im gunna CRy. I'...,0
4,i think mi bf is cheating on me!!! ...,0
...,...,...
1578607,Zzzzzz.... Finally! Night tweeters!,1
1578608,"Zzzzzzz, sleep well people",1
1578609,ZzzZzZzzzZ... wait no I have homework.,0
1578610,"ZzZzzzZZZZzzz meh, what am I doing up again?",0


In [5]:
# Print a structural summary of df: index range, per-column non-null counts,
# dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1578612 entries, 0 to 1578611
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1578612 non-null  object
 1   label   1578612 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 24.1+ MB


In [6]:
# Class balance check: number of rows for each value of the label column.
df["label"].value_counts()

label
1    790177
0    788435
Name: count, dtype: int64

## Preprocessing

In [9]:
import os
import time
import json
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import product
from scipy.sparse import save_npz

In [10]:
# Build one processed dataset per (split config, vectorizer config) pair.
# Each dataset is written to ../data/processed/dataset_NNN and contains the
# TF-IDF matrices, the label vectors, the fitted vectorizer and a config.json
# describing how it was produced.
base_dir = os.path.join("..", "data", "processed")
os.makedirs(base_dir, exist_ok=True)

# ===== Configurations =====
split_params_list = [
    {"test_size": 0.15, "random_state": 98}
]

vectorizer_params_list = [
    {"max_features": 10000,  "ngram_range": (1, 2)},
    {"max_features": 30000,  "ngram_range": (1, 3)},
    {"max_features": 80000,  "ngram_range": (1, 5)},
]

# The train/test split depends only on split_cfg, so it is computed once per
# split configuration and reused for every vectorizer configuration that is
# paired with it (product() yields the same dict object for consecutive pairs).
last_split_cfg = None
last_split = None

# ===== Loop through all combinations =====
# NOTE: directories are numbered by position in the product, so editing the
# lists above changes which configuration maps to which dataset_NNN. Already
# completed directories are skipped without re-checking their stored config.
for idx, (split_cfg, vec_cfg) in enumerate(product(split_params_list, vectorizer_params_list), start=1):

    # Directory name for this configuration
    config_name = f"dataset_{idx:03d}"
    output_dir = os.path.join(base_dir, config_name)
    config_path = os.path.join(output_dir, "config.json")

    # config.json is the last file written, so its presence marks a finished
    # dataset. Checking only for the directory would permanently skip a
    # configuration whose previous run was interrupted after the directory
    # was created but before all artifacts were saved.
    if os.path.exists(config_path):
        print(f"Skipping {config_name} because it was already processed (config.json found).")
        continue

    os.makedirs(output_dir, exist_ok=True)

    print(f"\n=== Processing {config_name} ===")
    print(f"Split params: {split_cfg}")
    print(f"Vectorizer params: {vec_cfg}")

    # ===== Train/Test Split =====
    if last_split is None or split_cfg is not last_split_cfg:
        last_split = train_test_split(
            df["text"], df["label"],
            test_size=split_cfg["test_size"],
            random_state=split_cfg["random_state"],
            stratify=df["label"]  # keep the label ratio equal in both parts
        )
        last_split_cfg = split_cfg
    X_train, X_test, y_train, y_test = last_split

    # ===== Vectorization =====
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments during a long run.
    t0 = time.perf_counter()
    vectorizer = TfidfVectorizer(**vec_cfg)
    # Fit on the training texts only; the test texts are transformed with the
    # vocabulary and IDF weights learned from the training set.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    elapsed = time.perf_counter() - t0
    print(f"Vectorization complete in {elapsed:.2f} seconds.")

    # ===== Save datasets =====
    save_npz(os.path.join(output_dir, "X_train_tfidf.npz"), X_train_tfidf)
    save_npz(os.path.join(output_dir, "X_test_tfidf.npz"), X_test_tfidf)
    joblib.dump(y_train, os.path.join(output_dir, "y_train.pkl"))
    joblib.dump(y_test, os.path.join(output_dir, "y_test.pkl"))
    joblib.dump(vectorizer, os.path.join(output_dir, "vectorizer.pkl"))

    # ===== Save config as JSON =====
    # Written last on purpose: it doubles as the completion marker checked above.
    config_data = {
        "split_params": split_cfg,
        "vectorizer_params": vec_cfg,
        "vectorization_time_sec": elapsed,
        "n_train_samples": len(X_train),
        "n_test_samples": len(X_test),
        "vocab_size": len(vectorizer.vocabulary_)
    }
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config_data, f, indent=4)

    print(f"Saved processed dataset and config to {output_dir}")


=== Processing dataset_001 ===
Split params: {'test_size': 0.15, 'random_state': 98}
Vectorizer params: {'max_features': 10000, 'ngram_range': (1, 2)}
Vectorization complete in 35.21 seconds.
Saved processed dataset and config to ../data/processed/dataset_001

=== Processing dataset_002 ===
Split params: {'test_size': 0.15, 'random_state': 98}
Vectorizer params: {'max_features': 30000, 'ngram_range': (1, 3)}
Vectorization complete in 84.99 seconds.
Saved processed dataset and config to ../data/processed/dataset_002

=== Processing dataset_003 ===
Split params: {'test_size': 0.15, 'random_state': 98}
Vectorizer params: {'max_features': 80000, 'ngram_range': (1, 5)}
Vectorization complete in 370.68 seconds.
Saved processed dataset and config to ../data/processed/dataset_003
