In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [4]:
# Load data splits
train = pd.read_csv('train.csv')
validation = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')

train = train.dropna()
validation = validation.dropna()
test = test.dropna()

In [7]:
train['label'].value_counts()

ham     3472
spam     537
Name: label, dtype: int64

In [12]:
# Vectorizing the text data using TF-IDF
def vectorize_data(train_data, validation_data, test_data):
    """Vectorize the text data using TF-IDF."""
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_train = vectorizer.fit_transform(train_data['message'])
    X_validation = vectorizer.transform(validation_data['message'])
    X_test = vectorizer.transform(test_data['message'])
    return X_train, X_validation, X_test, vectorizer

X_train, X_validation, X_test, vectorizer = vectorize_data(train, validation, test)

# Model training function
def fit_model(X_train, y_train, model):
    """Train a model on the training data."""
    model.fit(X_train, y_train)
    return model

# model evaluation function
def evaluate_model(model, X, y, average='binary'):
    """Evaluate the model and return precision, recall, and accuracy."""
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    recall = recall_score(y, y_pred, pos_label='spam', average=average)
    precision = precision_score(y, y_pred, pos_label='spam', average=average)
    return precision, recall, accuracy

from sklearn.model_selection import train_test_split

# Merge train and validation datasets for hyperparameter tuning
def merge_train_validation(train_data, validation_data):
    """Merge train and validation datasets for hyperparameter tuning."""
    combined_data = pd.concat([train_data, validation_data])
    return combined_data

# Fit, tune, and evaluate model with proper train-validation split
def fit_and_evaluate(train_data, validation_data, X_train, X_validation, model, param_grid):
    
    # Merge train and validation datasets
    full_train_data = merge_train_validation(train_data, validation_data)
    X_full_train = vectorizer.transform(full_train_data['message'])
    y_full_train = full_train_data['label']
    
    # Hyperparameter tuning using full train+validation data
    grid_search = GridSearchCV(model, param_grid, scoring='recall', cv=5, n_jobs=-1)
    grid_search.fit(X_full_train, y_full_train)
    best_model = grid_search.best_estimator_
    
    # Evaluate on original validation set
    val_precision, val_recall, val_accuracy = evaluate_model(best_model, X_validation, validation_data['label'])
    print(f"Validation Data - Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, Accuracy: {val_accuracy:.4f}")
    
    return best_model


# Model benchmarking with hyperparameter tuning
def benchmark_models(X_train, y_train, X_test, y_test, average='binary'):
    """Benchmark multiple models with hyperparameter tuning and display evaluation metrics."""
    models = {
        'Naive Bayes': (MultinomialNB(), {'alpha': [0.01, 0.1, 1, 10]}),
        'Logistic Regression': (LogisticRegression(), {}),
        'SVM': (SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']})
    }
    
    best_model = None
    best_model_name = None
    best_recall = 0
    
    for model_name, (model, param_grid) in models.items():
        print(f"Tuning {model_name}...")
        
        grid_search = GridSearchCV(model, param_grid, scoring='recall', cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model_candidate = grid_search.best_estimator_
        
        # Evaluate on test data
        precision, recall, accuracy = evaluate_model(best_model_candidate, X_test, y_test, average=average)
        print(f"{model_name} - Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}")
        
        # Track the best model based on recall
        if recall > best_recall:
            best_recall = recall
            best_model = best_model_candidate
            best_model_name = model_name
    
    print(f"\nBest Model: {best_model_name} with Recall: {best_recall:.4f}")
    return best_model

In [28]:
# Example usage
X_train, X_validation, X_test, vectorizer = vectorize_data(train, validation, test)

# Fit and evaluate a model with hyperparameter tuning
logistic_regression_model = LogisticRegression()
param_grid = {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear']}
best_model = fit_and_evaluate(train, validation, X_train, X_validation, logistic_regression_model, param_grid)


Train Data - Precision: 0.9930, Recall: 0.7862, Accuracy: 0.9706
Validation Data - Precision: 0.9787, Recall: 0.7667, Accuracy: 0.9664


In [15]:
# Benchmark multiple models
best_model = benchmark_models(X_train, train['label'], X_test, test['label'])

Tuning Naive Bayes...
Naive Bayes - Precision: 0.9521, Recall: 0.8825, Accuracy: 0.9783

Tuning Logistic Regression...
Logistic Regression - Precision: 0.9905, Recall: 0.7702, Accuracy: 0.9690

Tuning SVM...
SVM - Precision: 0.9357, Recall: 0.5608, Accuracy: 0.9432

Best Model: Naive Bayes with Recall: 0.8825

