# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from src.preprocessing import ReviewPreprocessor
from src.features import EmbeddingVectorizer
from src.models import MultiLabelModel

# --- 1. CONFIGURATION ---
# Input dataset and pre-trained embedding files, relative to this notebook.
DATA_PATH = '../data/raw/data_amazon.csv'
W2V_PATH = '../data/embeddings/GoogleNews-vectors-negative300.bin'
GLOVE_PATH = '../data/embeddings/glove.6B.300d.txt'

# Names of the label columns in the CSV that the models predict.
# NOTE(review): this assignment had no right-hand side (a SyntaxError), so the
# original label list is unknown. Fill in the target column names from the
# dataset header before running the rest of the script.
TARGET_LABELS = []

# --- 2. DATA LOADING ---
# Text columns that are concatenated into the model input.
# NOTE(review): the column selectors were missing from this cell. 'Title'
# follows the "missing Titles" comment below; 'Text' is an ASSUMED name for
# the review-body column -- confirm both against the CSV header.
TITLE_COL = 'Title'
BODY_COL = 'Text'

print("Loading Data...")
df = pd.read_csv(DATA_PATH)

# Fail early with a readable message rather than a KeyError deep inside pandas.
if not TARGET_LABELS:
    raise ValueError(
        "TARGET_LABELS is empty; list the label columns to predict."
    )
missing_cols = [
    col for col in (TITLE_COL, BODY_COL, *TARGET_LABELS)
    if col not in df.columns
]
if missing_cols:
    raise KeyError(f"Columns not found in {DATA_PATH}: {missing_cols}")

# Handling missing Titles (and bodies): fill only the two text columns, so
# that numeric/label columns are not turned into empty strings.
df[TITLE_COL] = df[TITLE_COL].fillna('').astype(str)
df[BODY_COL] = df[BODY_COL].fillna('').astype(str)
df['full_text'] = df[TITLE_COL] + " " + df[BODY_COL]

# Ensure targets are integer-valued: a missing label counts as 0. Only the
# label columns are cast; casting the whole frame would fail on text columns.
df[TARGET_LABELS] = df[TARGET_LABELS].fillna(0).astype(int)

# --- 3. PREPROCESSING ---
print("Preprocessing Text...")
preprocessor = ReviewPreprocessor()
# Run clean_text on every 'full_text' entry and keep the result per row.
df['tokens'] = df['full_text'].apply(preprocessor.clean_text)

# Split Data
X_tokens = df['tokens'].tolist()
# Use only the label columns as targets; taking the whole frame would also
# pull the raw text and token columns into y.
y = df[TARGET_LABELS].to_numpy()
# Fixed random_state keeps the 80/20 split reproducible across runs, so both
# embedding experiments are trained and evaluated on the same rows.
X_train_tok, X_test_tok, y_train, y_test = train_test_split(
    X_tokens, y, test_size=0.2, random_state=42
)

# --- 4. EXPERIMENT A: WORD2VEC ---
print("\n--- Starting Word2Vec Experiment ---")

# Build the 'word2vec' vectorizer from W2V_PATH, then transform the train
# split followed by the test split with that same vectorizer.
w2v_vectorizer = EmbeddingVectorizer(path=W2V_PATH, method='word2vec')
X_train_w2v, X_test_w2v = (
    w2v_vectorizer.transform(token_split)
    for token_split in (X_train_tok, X_test_tok)
)

# Train an 'svm' multi-label model on the train features, predict on the
# held-out features, and score the predictions against y_test.
model_w2v = MultiLabelModel(algorithm='svm')
model_w2v.train(X_train_w2v, y_train)
y_pred_w2v = model_w2v.predict(X_test_w2v)
metrics_w2v = model_w2v.evaluate(y_test, y_pred_w2v)

# --- 5. EXPERIMENT B: GLOVE ---
print("\n--- Starting GloVe Experiment ---")

# Build the 'glove' vectorizer from GLOVE_PATH, then transform the train
# split followed by the test split with that same vectorizer.
glove_vectorizer = EmbeddingVectorizer(path=GLOVE_PATH, method='glove')
X_train_glove, X_test_glove = (
    glove_vectorizer.transform(token_split)
    for token_split in (X_train_tok, X_test_tok)
)

# Train an 'svm' multi-label model on the train features, predict on the
# held-out features, and score the predictions against y_test.
model_glove = MultiLabelModel(algorithm='svm')
model_glove.train(X_train_glove, y_train)
y_pred_glove = model_glove.predict(X_test_glove)
metrics_glove = model_glove.evaluate(y_test, y_pred_glove)

# --- 6. RESULTS COMPARISON ---
# One row per experiment, labelled in the same order as the metrics list.
# NOTE(review): the original row labels were missing (`index=` had no value,
# a SyntaxError); these labels are taken from the experiment banners printed
# by this script. Presumably evaluate() returns a dict of metric name ->
# value, giving one column per metric -- confirm against MultiLabelModel.
results = pd.DataFrame(
    [metrics_w2v, metrics_glove],
    index=['Word2Vec', 'GloVe'],
)
print("\nComparative Results:")
print(results)