In [None]:
import os
import sys
import random

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Make the parent of the current working directory importable so that the
# `src` package used by the imports below can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: re-running this cell in a live kernel must not pile up
# duplicate entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.cfg import *
from src.utils_classifier import load_data_classifiers, preprocess_train_data, preprocess_test_data, evaluate_model


# Reproducibility: seed both global RNGs (NumPy's and the stdlib's) with the
# same project-configured value. `seeed` is also read by later cells.
seeed = SEED1
np.random.seed(seeed)
random.seed(seeed)

In [2]:
# Values of the classifiers' C parameter to sweep (one decade per step)
C_values = [0.01, 0.1, 1, 10, 100]

# Read the positive and negative training sets from their configured paths
pos_tweets, neg_tweets = (
    load_data_classifiers(path) for path in (TRAIN_POS_PATH, TRAIN_NEG_PATH)
)

# Preprocess and split into train/validation parts; the split is seeded with
# the notebook-wide seed and the helper's diagnostic output is switched off.
X_train, X_val, y_train, y_val = preprocess_train_data(
    pos_tweets,
    neg_tweets,
    show_lengths=False,
    show_samples=False,
    seed_for_split=seeed,
)

In [None]:
# Validation F1 per value in C_values, one list per model. Re-created on every
# run so that re-executing this cell never appends to stale results.
f1_scores_svm = []
f1_scores_lr = []

# The TF-IDF features do not depend on C or on the classifier, so the
# vectorizer is fitted once here instead of once per (model, C) combination.
tfidf_vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 3))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)


def _fit_and_score(label, classifier, C):
    """Fit `classifier` on the shared TF-IDF features and score it on validation.

    Parameters
    ----------
    label : str
        Short model name used in the progress messages (e.g. "SVM").
    classifier : estimator
        Unfitted scikit-learn classifier configured with the given C.
    C : number
        Value of C the classifier was built with; used only for logging.

    Returns
    -------
    (Pipeline, score)
        A text-in pipeline made of the already fitted vectorizer and the
        fitted classifier, plus whatever `evaluate_model` returns for
        metric='f1' on (X_val, y_val).
    """
    print(f"Training {label} model with C={C}...")
    classifier.fit(X_train_tfidf, y_train)

    # Assemble a pipeline from the pre-fitted steps (no second fit) so that
    # evaluate_model still receives a model that accepts raw text.
    pipeline = Pipeline([
        ('vectorizer', tfidf_vectorizer),
        ('classifier', classifier),
    ])

    val_f1 = evaluate_model(pipeline, X_val, y_val, metric='f1')
    print(f"Validation F1 score for {label} with C={C}: {val_f1:.4f}")
    return pipeline, val_f1


for C in C_values:
    # random_state pins LinearSVC's internal shuffling to the notebook seed,
    # so the score does not depend on how much of the global NumPy RNG stream
    # earlier cells have consumed.
    svm_pipeline, val_f1_svm = _fit_and_score(
        "SVM", LinearSVC(C=C, max_iter=10000, random_state=seeed), C
    )
    f1_scores_svm.append(val_f1_svm)

    print("-----------------------------------------")

    lr_pipeline, val_f1_lr = _fit_and_score(
        "LR", LogisticRegression(C=C, max_iter=10000), C
    )
    f1_scores_lr.append(val_f1_lr)

In [None]:
# Validation F1 of both classifiers against C, drawn on a single set of axes
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(C_values, f1_scores_svm, marker='o', label='SVM')
ax.plot(C_values, f1_scores_lr, marker='s', label='Logistic Regression')

# C spans several orders of magnitude, hence the logarithmic x-axis
ax.set_xscale('log')
ax.set_xlabel('C (log scale)', fontsize=14)
ax.set_ylabel('Validation F1 Score', fontsize=14)
ax.legend()
ax.grid(True)

# Write the figure to a PDF in the current working directory, then display it
fig.savefig("f1_scores_plot.pdf", format='pdf')
plt.show()