In [None]:
import os
import sys
import random

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Absolute path of the parent of the current working directory; it is put on
# sys.path so that the `src` package imported below can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.cfg import *
from src.utils_classifier import load_data_classifiers, preprocess_train_data, preprocess_test_data, get_model, evaluate_model


# Reproducibility: seed the global RNGs of NumPy and of the stdlib `random`
# module with the same value. `seeed` is reused by later cells.
seeed = SEED1
np.random.seed(seeed)
random.seed(seeed)

In [4]:
# Hyperparameters selected by the grid search
ngram_range = (1, 3)  # n-gram range handed to TfidfVectorizer
min_df = 1  # min_df handed to TfidfVectorizer
C = 1  # regularization parameter C handed to LinearSVC

In [None]:
# Load the full positive and negative training sets.
# For a quick run on the sample data, use TRAIN_POS_PATH / TRAIN_NEG_PATH instead.
pos_tweets, neg_tweets = (
    load_data_classifiers(path)
    for path in (TRAIN_POS_FULL_PATH, TRAIN_NEG_FULL_PATH)
)

In [None]:
# Clean the positive/negative tweets and obtain the train and validation
# inputs and labels; the notebook seed is forwarded as the split seed.
X_train, X_val, y_train, y_val = preprocess_train_data(
    pos_tweets,
    neg_tweets,
    show_lengths=True,
    show_samples=True,
    seed_for_split=seeed,
)

In [None]:
# Preprocess test data
# The helper returns two values, unpacked here as the test ids and the cleaned
# test texts (presumably aligned one-to-one -- confirm against
# src.utils_classifier.preprocess_test_data).
test_ids, cleaned_test_texts = preprocess_test_data(TEST_PATH)
# Quick sanity check on how many test rows were read.
print(f"Processed {len(test_ids)} test samples.")

In [None]:
# Train the final model (TF-IDF features feeding a linear SVM) with the
# hyperparameters chosen by the grid search, report the validation F1 score,
# then predict labels for the test set.
print("-----------------------------------------------------------")
print("Training SVM model with optimal parameters...")

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)),
    # random_state ties the solver's internal randomness to the notebook seed,
    # so re-running this cell on its own yields the same model regardless of
    # how much of the global NumPy RNG other cells have consumed.
    ('classifier', LinearSVC(C=C, max_iter=10000, random_state=seeed)),
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate on validation set
val_f1 = evaluate_model(pipeline, X_val, y_val, metric='f1')
print(f"Validation F1 score for SVM: {val_f1:.4f}")
print("-----------------------------------------------------------\n")

# Predict through the fitted pipeline: it applies the vectorizer's transform
# and then the classifier's predict, so the test texts always go through the
# same feature extraction that was fitted on the training data.
test_predictions = pipeline.predict(cleaned_test_texts)

# Assemble the submission table (it is written to disk in the next cell)
submission = pd.DataFrame({
    "Id": test_ids,
    "Prediction": test_predictions
})

In [None]:
# Save submission file
# The path is relative, so the CSV lands in the current working directory.
# It is a plain string literal: there are no placeholders, so no f-prefix.
submission_file_path = "final_submission.csv"
# index=False keeps the DataFrame index out of the written file.
submission.to_csv(submission_file_path, index=False)

print("-----------------------------------------------------------")
print(f"Submission file for SVM saved to {submission_file_path}")
print("-----------------------------------------------------------")