In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [4]:
# Read the pre-saved train / validation / test splits from the working directory.

train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./train.csv", "./validation.csv", "./test.csv")
)

In [5]:
# Display the training split (columns: label, message, processed_message).
train_data

Unnamed: 0,label,message,processed_message
0,0,I donno if they are scorable,donno scorable
1,1,HMV BONUS SPECIAL 500 pounds of genuine HMV vo...,hmv bonus special pound genuine hmv voucher an...
2,0,Full heat pa:-) i have applyed oil pa.,full heat pa applyed oil pa
3,0,Hey whats up? U sleeping all morning?,hey whats u sleeping morning
4,0,Got meh... When?,got meh
...,...,...,...
3338,0,That's a shame! Maybe cld meet for few hrs tomo?,thats shame maybe cld meet hr tomo
3339,0,K:)k:)good:)study well.,kkgoodstudy well
3340,0,How stupid to say that i challenge god.You don...,stupid say challenge godyou dont think write i...
3341,0,Its sunny in california. The weather's just cool,sunny california weather cool


In [6]:
# Display the validation split (same columns as the training split).
val_data

Unnamed: 0,label,message,processed_message
0,0,Thanx a lot...,thanx lot
1,0,HEY DAS COOL... IKNOW ALL 2 WELLDA PERIL OF ST...,hey da cool iknow wellda peril studentfinancia...
2,0,Babes I think I got ur brolly I left it in Eng...,babe think got ur brolly left english wil brin...
3,0,We have sent JD for Customer Service cum Accou...,sent jd customer service cum account executive...
4,0,Ã mean it's confirmed... I tot they juz say o...,mean confirmed tot juz say oni ok
...,...,...,...
1388,0,"K I'll head out in a few mins, see you there",k ill head min see
1389,0,Want to send me a virtual hug?... I need one,want send virtual hug need one
1390,0,What i told before i tell. Stupid hear after i...,told tell stupid hear wont tell anything dad c...
1391,1,"cmon babe, make me horny, *turn* me on! Txt me...",cmon babe make horny turn txt fantasy babe im ...


In [7]:
# Display the held-out test split (same columns as the training split).
test_data

Unnamed: 0,label,message,processed_message
0,0,"Aight I've been set free, think you could text...",aight ive set free think could text blake addr...
1,0,I have no money 4 steve mate! !,money steve mate
2,0,staff.science.nus.edu.sg/~phyhcmk/teaching/pc1323,staffsciencenusedusgphyhcmkteachingpc
3,1,URGENT! We are trying to contact U. Todays dra...,urgent trying contact u today draw show prize ...
4,0,Ã go home liao? Ask dad to pick me up at 6...,go home liao ask dad pick
...,...,...,...
831,0,No dear i was sleeping :-P,dear sleeping p
832,0,ok....take care.umma to you too...,oktake careumma
833,0,What Today-sunday..sunday is holiday..so no wo...,todaysundaysunday holidayso work
834,0,Ok... I din get ur msg...,ok din get ur msg


In [17]:
# vectorizing the data

def prepare_features(train_data, val_data, text_column='processed_message', max_features=5000):
    """Convert the text column of two splits into TF-IDF feature matrices.

    The vectorizer is fitted on ``train_data`` only and then applied to
    ``val_data``, so the second split never influences the vocabulary or
    the IDF weights.

    Parameters
    ----------
    train_data, val_data : pandas.DataFrame
        Frames containing ``text_column``.
    text_column : str, default 'processed_message'
        Name of the column holding the text to vectorize.
    max_features : int or None, default 5000
        Forwarded to ``TfidfVectorizer(max_features=...)``. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    tuple
        ``(X_train, X_val, vectorizer)`` where ``vectorizer`` is the fitted
        ``TfidfVectorizer``, reusable on further splits.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)

    # Null messages are replaced by empty strings before vectorizing.
    X_train = vectorizer.fit_transform(train_data[text_column].fillna(''))
    X_val = vectorizer.transform(val_data[text_column].fillna(''))
    return X_train, X_val, vectorizer

In [10]:
def train_model(model, X_train, y_train):
    """Fit ``model`` in place on the given features/labels and hand it back.

    The object returned is the ``model`` argument itself (not whatever its
    ``fit`` method returns).
    """
    model.fit(X_train, y_train)
    return model

In [11]:
def score_model(model, X):
    """Predict labels and, when the model supports it, class-1 probabilities.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and optionally ``predict_proba``.
    X : feature matrix accepted by ``model``.

    Returns
    -------
    tuple
        ``(y_pred, y_prob)``. ``y_prob`` is column 1 of ``predict_proba(X)``
        (presumably the positive/spam class, given 0/1 labels), or ``None``
        when probabilities cannot be obtained, e.g. the model has no
        ``predict_proba`` method.
    """
    y_pred = model.predict(X)
    try:
        y_prob = model.predict_proba(X)[:, 1]
    except Exception:
        # Best effort: probabilities are optional. Catching Exception rather
        # than using a bare ``except`` lets KeyboardInterrupt / SystemExit
        # propagate, so a long-running cell can still be interrupted.
        y_prob = None
    return y_pred, y_prob

In [12]:
def evaluate_model(y_true, y_pred, y_prob=None):
    """Print and return the classification report and confusion matrix.

    ``y_prob`` is accepted so callers can pass the output of ``score_model``
    straight through, but it is not used by any metric computed here.

    Returns a dict with keys ``'classification_report'`` (the report as a
    nested dict) and ``'confusion_matrix'`` (the matrix that was printed).
    """
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    # Computed once and reused for both the printout and the return value.
    matrix = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(matrix)

    report_as_dict = classification_report(y_true, y_pred, output_dict=True)
    return {
        'classification_report': report_as_dict,
        'confusion_matrix': matrix,
    }

In [13]:
def validate_model(model, train_data, val_data, text_column='processed_message', label_column='label'):
    """Fit ``model`` on the training split and report metrics on both splits.

    Features come from ``prepare_features`` (vectorizer fitted on the training
    split). Metrics are printed and collected by ``evaluate_model``, first for
    the training split and then for the validation split.

    Returns ``(model, vectorizer, train_metrics, val_metrics)``.
    """
    X_train, X_val, vectorizer = prepare_features(train_data, val_data, text_column)
    y_train, y_val = train_data[label_column], val_data[label_column]

    model = train_model(model, X_train, y_train)

    def _report(heading, features, labels):
        # Print the section heading, then score and evaluate one split.
        print(heading)
        predictions, probabilities = score_model(model, features)
        return evaluate_model(labels, predictions, probabilities)

    train_metrics = _report("Training Set Performance:", X_train, y_train)
    val_metrics = _report("\nValidation Set Performance:", X_val, y_val)

    return model, vectorizer, train_metrics, val_metrics

In [22]:
def evaluate_benchmark_models(train_data, val_data, test_data, 
                            text_column='processed_message', label_column='label'):
    """
    Train and evaluate three benchmark models using proper validation pipeline

    Each model (Naive Bayes, Linear SVM, Random Forest) is fitted on the
    training split via ``validate_model``. The model with the highest
    weighted-average F1 on the validation split is then evaluated once on
    the test split. On ties the model evaluated first is kept.

    Returns
    -------
    tuple
        ``(best_model, best_vectorizer, results)``. ``results`` maps each
        model name to its ``'train_metrics'`` / ``'val_metrics'`` and holds
        the winner's test metrics under ``'best_model_test'``.
    """
    # Initialize models
    models = {
        'Naive Bayes': MultinomialNB(),
        'Linear SVM': LinearSVC(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42)
    }

    results = {}
    # Start below every attainable F1 so a model is always selected. Starting
    # at 0 with a strict ">" left best_model / best_vectorizer as None when
    # every model scored 0.0, and the test step then failed on None.transform.
    best_f1 = float('-inf')
    best_model_name = None
    best_model = None
    best_vectorizer = None

    # Evaluate each model
    for name, model in models.items():
        print(f"\n{'='*50}")
        print(f"Evaluating {name}:")
        print(f"{'='*50}")

        # Validate model (train + validation performance)
        trained_model, vectorizer, train_metrics, val_metrics = validate_model(
            model, train_data, val_data, text_column, label_column
        )

        # Selection metric: weighted-average F1 on the validation split
        val_f1 = val_metrics['classification_report']['weighted avg']['f1-score']

        # Track best model based on validation performance
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_name = name
            best_model = trained_model
            best_vectorizer = vectorizer

        results[name] = {
            'train_metrics': train_metrics,
            'val_metrics': val_metrics
        }

        print(f"\nValidation F1 Score: {val_f1:.4f}")

    print(f"\n{'='*50}")
    print(f"Best performing model on validation: {best_model_name}")
    print(f"Validation F1 Score: {best_f1:.4f}")

    # Evaluate best model on test set, reusing the vectorizer it was trained with
    print(f"\n{'='*50}")
    print(f"Evaluating best model ({best_model_name}) on test set:")
    X_test = best_vectorizer.transform(test_data[text_column].fillna(''))
    y_test = test_data[label_column]

    y_test_pred, y_test_prob = score_model(best_model, X_test)
    test_metrics = evaluate_model(y_test, y_test_pred, y_test_prob)

    results['best_model_test'] = test_metrics

    return best_model, best_vectorizer, results

In [23]:
# Benchmark the candidate models; keep the winner, its vectorizer and all metrics.
best_model, vectorizer, results = evaluate_benchmark_models(
    train_data=train_data,
    val_data=val_data,
    test_data=test_data,
)


Evaluating Naive Bayes:
Training Set Performance:
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2895
           1       1.00      0.81      0.89       448

    accuracy                           0.97      3343
   macro avg       0.99      0.90      0.94      3343
weighted avg       0.98      0.97      0.97      3343


Confusion Matrix:
[[2895    0]
 [  86  362]]

Validation Set Performance:
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1206
           1       1.00      0.70      0.82       187

    accuracy                           0.96      1393
   macro avg       0.98      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393


Confusion Matrix:
[[1206    0]
 [  57  130]]

Validation F1 Score: 0.9559

Evaluating Linear SVM:
Training Set Performance:
Classification Report:
              precision   

In [24]:
# Display the estimator selected by evaluate_benchmark_models.
best_model

Evaluating the three models on the validation set gives the following weighted-average F1 scores:
1. Naive Bayes: 0.9559
2. LinearSVC: 0.9824
3. Random Forest: 0.9731
LinearSVC has the highest validation F1 score, so it is selected as the best model. On the test set, LinearSVC achieves an F1 score of 0.97.

In [14]:
# function for fine-tuning the best model

def fine_tune_model(base_model, param_grid, train_data, val_data, 
                   text_column='processed_message', label_column='label'):
    """Grid-search ``param_grid`` for ``base_model`` with 5-fold CV on the training split.

    Candidates are ranked with ``scoring='f1'``. ``val_data`` is vectorized by
    ``prepare_features`` but its features are not used for model selection.

    NOTE(review): the TF-IDF vectorizer is fitted on the whole training split
    before cross-validation, so every fold's held-out rows contributed to the
    vocabulary and IDF weights.

    Returns ``(best_estimator, vectorizer)``.
    """
    # Only the training features and the fitted vectorizer are needed here.
    train_features, _unused_val_features, vectorizer = prepare_features(
        train_data, val_data, text_column
    )
    train_labels = train_data[label_column]

    search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,
        scoring='f1',
        n_jobs=-1,
    )
    search.fit(train_features, train_labels)

    print("Best parameters:", search.best_params_)
    print("Best cross-validation score:", search.best_score_)

    return search.best_estimator_, vectorizer

In [26]:
# Fine-tuning the best model

# Pick the hyperparameter grid that matches the class of the selected model.
if isinstance(best_model, MultinomialNB):
    param_grid = {
        'alpha': [0.1, 0.5, 1.0, 2.0],
    }
elif isinstance(best_model, LinearSVC):
    param_grid = {
        'C': [0.1, 1, 10],
        'max_iter': [1000]
    }
elif isinstance(best_model, RandomForestClassifier):
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None]
    }
else:
    # Fail loudly instead of hitting a NameError below or silently reusing a
    # param_grid left in the kernel by an earlier run.
    raise TypeError(
        f"No hyperparameter grid defined for {type(best_model).__name__}"
    )

print("\nFine-tuning best model...")
fine_tuned_model, final_vectorizer = fine_tune_model(
    # Fresh, unfitted instance of the same class that keeps the benchmark's
    # constructor settings (notably random_state=42). A bare __class__() call
    # dropped them, which made the tuned model non-reproducible.
    best_model.__class__(**best_model.get_params(deep=False)),
    param_grid,
    train_data,
    val_data
)


Fine-tuning best model...
Best parameters: {'C': 10, 'max_iter': 1000}
Best cross-validation score: 0.9111403923791819
