In [None]:
import os
import sys
import random

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Make the repository root (the parent of the notebook's working directory)
# importable so the `src.*` imports below resolve.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guarded so that re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.cfg import *
from src.data_cleaning.data_loader import *
from src.utils import preprocess_train_data, preprocess_test_data, get_model, evaluate_model

# Reproducibility: seed both global RNGs (NumPy's legacy generator and the
# stdlib `random` module) with the project-wide SEED.
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Training-data selection.
# By default the small sample files are loaded and truncated to a handful of
# rows per class so the whole notebook can be smoke-tested quickly.  Flip
# USE_FULL_TRAIN_DATA to True to load the *_FULL_PATH files instead (no
# truncation is applied in that case).
USE_FULL_TRAIN_DATA = False
# Rows kept per class when running on the sample files; None keeps them all.
DEBUG_SUBSET_SIZE = 20

if USE_FULL_TRAIN_DATA:
    pos_tweets = load_data(TRAIN_POS_FULL_PATH)
    neg_tweets = load_data(TRAIN_NEG_FULL_PATH)
else:
    pos_tweets = load_data(TRAIN_POS_PATH)
    neg_tweets = load_data(TRAIN_NEG_PATH)
    if DEBUG_SUBSET_SIZE is not None:
        # Just select a subset to see if the pipeline works end to end
        pos_tweets = pos_tweets.head(DEBUG_SUBSET_SIZE)
        neg_tweets = neg_tweets.head(DEBUG_SUBSET_SIZE)

# preprocess_train_data is a project helper; presumably it cleans the tweets
# and returns a train/validation split -- confirm against src.utils.
X_train, X_val, y_train, y_val = preprocess_train_data(pos_tweets, neg_tweets, show_lengths=True, show_samples=True)

In [None]:
# Run the project's test-set preprocessing once; the resulting ids and texts
# are reused by every model cell below when writing submissions.
test_bundle = preprocess_test_data(TEST_PATH)
test_ids, cleaned_test_texts = test_bundle
print(f"Processed {len(test_ids)} test samples.")

In [None]:
## ====== NAIVE BAYES ======
# Grid-search a TF-IDF + classifier pipeline, report the validation F1 of the
# best configuration, and write its test-set predictions to a submission CSV.
SELECTED_MODEL = 'naive_bayes'

# Ensure the "submissions_classifiers" folder exists (path is relative to the
# current working directory)
submissions_folder = "submissions_classifiers"
os.makedirs(submissions_folder, exist_ok=True)  # Create the folder if it doesn't exist

# Train, evaluate, and save submission for the model.
# The vectorizer lives inside the pipeline, so it is re-fit on the training
# part of every CV fold together with the classifier.
# get_model / PARAM_GRID come from the project config; presumably they map the
# model key to an unfitted estimator and its search grid -- confirm in src.
print("-----------------------------------------------------------")
print(f"Running GridSearchCV for {SELECTED_MODEL}...")
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', get_model(SELECTED_MODEL))
])

param_grid = PARAM_GRID[SELECTED_MODEL]
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("-----------------------------------------------------------\n")
print("-----------------------------------------------------------")
print(f"Best parameters for {SELECTED_MODEL}: {grid_search.best_params_}")
# With GridSearchCV's default refit=True this is the best configuration
# re-fitted on all of X_train.
best_model = grid_search.best_estimator_

# Evaluate on validation set
val_f1 = evaluate_model(best_model, X_val, y_val, metric='f1')
print(f"Validation F1 score for {SELECTED_MODEL}: {val_f1:.4f}")
print("-----------------------------------------------------------\n")

# Predict with the fitted pipeline as a whole: Pipeline.predict applies the
# fitted vectorizer and then the classifier, so there is no need to pull the
# steps apart by hand (and it stays correct if more steps are added later).
test_predictions = best_model.predict(cleaned_test_texts)

# Save predictions
submission = pd.DataFrame({
    "Id": test_ids,
    "Prediction": test_predictions
})

submission_file_path = os.path.join(submissions_folder, f"submission_{SELECTED_MODEL}.csv")
submission.to_csv(submission_file_path, index=False)

print("-----------------------------------------------------------")
print(f"Submission file for {SELECTED_MODEL} saved to {submission_file_path}")
print("-----------------------------------------------------------")

In [None]:
## ====== LOGISTIC REGRESSION ======
# Grid-search a TF-IDF + classifier pipeline, report the validation F1 of the
# best configuration, and write its test-set predictions to a submission CSV.
SELECTED_MODEL = 'logistic_regression'

# Ensure the "submissions_classifiers" folder exists (path is relative to the
# current working directory)
submissions_folder = "submissions_classifiers"
os.makedirs(submissions_folder, exist_ok=True)  # Create the folder if it doesn't exist

# Train, evaluate, and save submission for the model.
# The vectorizer lives inside the pipeline, so it is re-fit on the training
# part of every CV fold together with the classifier.
# get_model / PARAM_GRID come from the project config; presumably they map the
# model key to an unfitted estimator and its search grid -- confirm in src.
print("-----------------------------------------------------------")
print(f"Running GridSearchCV for {SELECTED_MODEL}...")
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', get_model(SELECTED_MODEL))
])

param_grid = PARAM_GRID[SELECTED_MODEL]
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("-----------------------------------------------------------\n")
print("-----------------------------------------------------------")
print(f"Best parameters for {SELECTED_MODEL}: {grid_search.best_params_}")
# With GridSearchCV's default refit=True this is the best configuration
# re-fitted on all of X_train.
best_model = grid_search.best_estimator_

# Evaluate on validation set
val_f1 = evaluate_model(best_model, X_val, y_val, metric='f1')
print(f"Validation F1 score for {SELECTED_MODEL}: {val_f1:.4f}")
print("-----------------------------------------------------------\n")

# Predict with the fitted pipeline as a whole: Pipeline.predict applies the
# fitted vectorizer and then the classifier, so there is no need to pull the
# steps apart by hand (and it stays correct if more steps are added later).
test_predictions = best_model.predict(cleaned_test_texts)

# Save predictions
submission = pd.DataFrame({
    "Id": test_ids,
    "Prediction": test_predictions
})

submission_file_path = os.path.join(submissions_folder, f"submission_{SELECTED_MODEL}.csv")
submission.to_csv(submission_file_path, index=False)

print("-----------------------------------------------------------")
print(f"Submission file for {SELECTED_MODEL} saved to {submission_file_path}")
print("-----------------------------------------------------------")

In [None]:
## ====== SUPPORT VECTOR MACHINES ======
# Grid-search a TF-IDF + classifier pipeline, report the validation F1 of the
# best configuration, and write its test-set predictions to a submission CSV.
SELECTED_MODEL = 'svm'

# Ensure the "submissions_classifiers" folder exists (path is relative to the
# current working directory)
submissions_folder = "submissions_classifiers"
os.makedirs(submissions_folder, exist_ok=True)  # Create the folder if it doesn't exist

# Train, evaluate, and save submission for the model.
# The vectorizer lives inside the pipeline, so it is re-fit on the training
# part of every CV fold together with the classifier.
# get_model / PARAM_GRID come from the project config; presumably they map the
# model key to an unfitted estimator and its search grid -- confirm in src.
print("-----------------------------------------------------------")
print(f"Running GridSearchCV for {SELECTED_MODEL}...")
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', get_model(SELECTED_MODEL))
])

param_grid = PARAM_GRID[SELECTED_MODEL]
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("-----------------------------------------------------------\n")
print("-----------------------------------------------------------")
print(f"Best parameters for {SELECTED_MODEL}: {grid_search.best_params_}")
# With GridSearchCV's default refit=True this is the best configuration
# re-fitted on all of X_train.
best_model = grid_search.best_estimator_

# Evaluate on validation set
val_f1 = evaluate_model(best_model, X_val, y_val, metric='f1')
print(f"Validation F1 score for {SELECTED_MODEL}: {val_f1:.4f}")
print("-----------------------------------------------------------\n")

# Predict with the fitted pipeline as a whole: Pipeline.predict applies the
# fitted vectorizer and then the classifier, so there is no need to pull the
# steps apart by hand (and it stays correct if more steps are added later).
test_predictions = best_model.predict(cleaned_test_texts)

# Save predictions
submission = pd.DataFrame({
    "Id": test_ids,
    "Prediction": test_predictions
})

submission_file_path = os.path.join(submissions_folder, f"submission_{SELECTED_MODEL}.csv")
submission.to_csv(submission_file_path, index=False)

print("-----------------------------------------------------------")
print(f"Submission file for {SELECTED_MODEL} saved to {submission_file_path}")
print("-----------------------------------------------------------")