# 01 – Data Loading & Preprocessing

In [None]:
# 1) Installs 
!pip install langdetect nltk afinn imbalanced-learn pandas scikit-learn joblib

In [None]:
# 2) Imports & reproducibility
import os, random, json
import numpy as np, pandas as pd, torch, joblib
from langdetect import detect, DetectorFactory, LangDetectException
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from afinn import Afinn
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Fetch the NLTK resources this notebook needs: the VADER lexicon for the
# sentiment analyzer and the English tagger model for nltk.pos_tag.
for nltk_resource in ("vader_lexicon", "averaged_perceptron_tagger_eng"):
    nltk.download(nltk_resource)

# Seed every random number generator in play so repeated runs are reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
DetectorFactory.seed = RANDOM_SEED  # langdetect's own seed

In [None]:
# 3) Paths & meta-feature list
# Input: the combined review dump (JSON).
DATA_PATH = "/kaggle/input/combinedverifiedreviews-sl-usa/correct_reviews_balanced.json"

# Output: every artifact this notebook writes lives under BASE_DIR.
BASE_DIR  = "/kaggle/working"
TRAIN_PKL = BASE_DIR + "/train_df.pkl"
VAL_PKL   = BASE_DIR + "/val_df.pkl"
TEST_PKL  = BASE_DIR + "/test_df.pkl"
VECT_PKL  = BASE_DIR + "/tfidf_vect.pkl"

# Names of the hand-crafted per-review features; they match the keys
# returned by compute_meta().
META_FEATURES = [
    "num_words",
    "num_exclaims",
    "num_questions",
    "vader_compound",
    "num_adjectives",
    "afinn_score",
]

# Make sure the output directory exists before anything is written to it.
os.makedirs(BASE_DIR, exist_ok=True)

In [None]:
# 4) Load & flatten JSON
with open(DATA_PATH, "r", encoding="utf-8") as f:
    payload = json.load(f)


def _parse_rating(raw):
    """Return the leading numeric token of `raw` as a float, or None.

    Accepts strings such as "4 stars" as well as plain JSON numbers. A
    missing, empty, whitespace-only or non-numeric rating yields None
    instead of raising.
    """
    if raw is None:
        return None
    parts = str(raw).split()
    if not parts:
        return None
    try:
        return float(parts[0])
    except ValueError:
        return None


records = []
for shop in payload:
    # `or []` also covers an explicit null "reviews" entry.
    for rev in shop.get("reviews") or []:
        text = rev.get("text")
        if not isinstance(text, str):
            continue  # missing / null / non-text review body
        text = text.strip()

        rating = _parse_rating(rev.get("rating"))
        if rating is None:
            continue

        source = rev.get("source", "UNK")
        if source is None:
            source = "UNK"

        # Keep only non-empty reviews with a rating in the 1-5 range
        # (NaN fails the comparison and is dropped as well).
        if text and 1 <= rating <= 5:
            records.append({
                "text": text,
                # Zero-based class label; fractional ratings are truncated.
                "label": int(rating) - 1,
                "source": str(source).upper()
            })

# Explicit columns keep the frame well-formed (and drop_duplicates working)
# even when no review survives the filters above.
df = pd.DataFrame(records, columns=["text", "label", "source"]).drop_duplicates("text")
print(f"Raw rows: {len(df)}")

In [None]:
# 5) Filter non-English / too-short & compute meta-features
# Lexicon-based scorers, created once here and reused by compute_meta().
af = Afinn()
sia = SentimentIntensityAnalyzer()

def compute_meta(text):
    """Compute the hand-crafted meta-features for one review.

    Parameters
    ----------
    text : str
        Review text.

    Returns
    -------
    dict
        Keys: num_words, num_exclaims, num_questions, vader_compound,
        num_adjectives, afinn_score.
    """
    # Whitespace tokenisation: punctuation stays attached to its word.
    words = text.split()

    vader_compound = sia.polarity_scores(text)["compound"]

    # Count tokens whose POS tag starts with "JJ".
    adjective_count = 0
    for _, pos in nltk.pos_tag(words):
        if pos.startswith("JJ"):
            adjective_count += 1

    afinn_score = af.score(text)

    features = {}
    features["num_words"] = len(words)
    features["num_exclaims"] = text.count("!")
    features["num_questions"] = text.count("?")
    features["vader_compound"] = vader_compound
    features["num_adjectives"] = adjective_count
    features["afinn_score"] = afinn_score
    return features

def is_english_and_long_enough(text, min_words=5):
    """Tell whether a review is worth keeping.

    Returns True only when `text` is non-empty, has at least `min_words`
    whitespace-separated tokens, and langdetect labels it "en". A
    LangDetectException during detection counts as "not English".
    """
    # Cheap guards first, so the language detector only sees real candidates.
    if not text:
        return False
    if len(text.split()) < min_words:
        return False

    try:
        language = detect(text)
    except LangDetectException:
        return False
    return language == "en"

# Keep only English reviews that are long enough, then renumber the rows so
# the index lines up with the freshly built `meta` frame below.
mask = df.text.apply(is_english_and_long_enough)
df   = df[mask].reset_index(drop=True)

# One row of meta-features per remaining review, in the same order as `df`.
meta = pd.DataFrame(df.text.apply(compute_meta).tolist())

# Drop meta columns left over from a previous run of this cell first;
# otherwise re-running it would append a second copy of every feature
# column and df[META_FEATURES] would return duplicated columns.
df   = pd.concat([df.drop(columns=meta.columns, errors="ignore"), meta], axis=1)
print(f"After filtering: {len(df)}")

In [None]:

# 6) Stratified train / validation / test split
# Hold out 20% of the data as the test set, keeping label proportions.
trainval_df, test_df = train_test_split(
    df, test_size=0.20, random_state=RANDOM_SEED, stratify=df["label"]
)

# 12.5% of the remaining 80% is 10% of the full data, so the final
# proportions are roughly 70% train / 10% validation / 20% test.
train_df, val_df = train_test_split(
    trainval_df, test_size=0.125, random_state=RANDOM_SEED, stratify=trainval_df["label"]
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

In [None]:
# 7) Balance ONLY the training set via SMOTE
# Features used for resampling: TF-IDF (fit on the training text only)
# followed by the meta-feature columns.
# NOTE(review): the meta-features are not rescaled, so columns such as
# num_words may dominate the distances against TF-IDF weights; confirm this
# is intended.
vect = TfidfVectorizer(max_features=1000)
X_train_text = vect.fit_transform(train_df.text).toarray()
F_train      = np.hstack([X_train_text, train_df[META_FEATURES].values])
y_train      = train_df.label.to_numpy()

sm    = SMOTE(random_state=RANDOM_SEED)
F_res, y_res = sm.fit_resample(F_train, y_train)
y_res = np.asarray(y_res)

# Synthetic feature vectors have no text, so each one is replaced by a real
# training row. The search is restricted to rows of the SAME class as the
# synthetic sample: a nearest neighbour taken over all classes could carry a
# different label, leaving the "balanced" set unbalanced.
# Assumes the resampled arrays list the original rows first and append the
# synthetic ones afterwards (the original loop relied on this as well).
n_orig = len(train_df)
rows_by_class  = {cls: np.flatnonzero(y_train == cls) for cls in np.unique(y_train)}
feats_by_class = {cls: F_train[rows] for cls, rows in rows_by_class.items()}

idxs = list(range(n_orig))
for i in range(n_orig, len(y_res)):
    cls   = y_res[i]
    dists = np.linalg.norm(feats_by_class[cls] - F_res[i], axis=1)
    idxs.append(int(rows_by_class[cls][np.argmin(dists)]))

train_df = train_df.iloc[idxs].reset_index(drop=True)

# Every selected row must carry the label SMOTE assigned to that position.
assert (train_df.label.to_numpy() == y_res).all(), "row mapping changed the class labels"
print(f"After SMOTE, Train: {len(train_df)} (balanced across {train_df.label.nunique()} classes)")

In [None]:
# 8) Persist splits and vectorizer
# Each object is written to its own file, in the order listed.
for artifact, target_path in (
    (train_df, TRAIN_PKL),
    (val_df, VAL_PKL),
    (test_df, TEST_PKL),
    (vect, VECT_PKL),
):
    joblib.dump(artifact, target_path)

print("Saved train/val/test splits and TF–IDF vectorizer.")