# Using Assignment 1 Code to Train Model

# Importing Required Libraries

In [1]:
import pandas as pd, warnings, numpy as np, joblib
from typing import Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Disable all warning messages

In [2]:
warnings.filterwarnings("ignore")

# Defining Necessary Constants

In [3]:
SEED = 8576

# Loading processed data
TRAIN_DATA = pd.read_csv("train.csv")
VALIDATION_DATA = pd.read_csv("validation.csv")
TEST_DATA = pd.read_csv("test.csv")
N_JOBS = 1
SKIP_FINE_TUNING = False

# Benchmarking multiple models
MODELS = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state = SEED),
    "Support Vector Machine": SVC(random_state = SEED),
}

# Defining Necessary Functions

In [4]:
MODEL = MultinomialNB | RandomForestClassifier | SVC

# Function to fit a model on train data
def fit_model(
    train_data: pd.DataFrame,
    vectorizer: TfidfVectorizer,
    model: MODEL,
) -> Tuple[MODEL, TfidfVectorizer]:
    X_train = vectorizer.fit_transform(train_data["text"])
    y_train = train_data["spam"]

    model.fit(X_train, y_train)

    return model, vectorizer

# Function to score a model on given data
def score_model(
    model: MODEL,
    vectorizer: TfidfVectorizer,
    data: pd.DataFrame,
) -> Tuple[float, str | dict, np.ndarray]:
    X = vectorizer.transform(data["text"])
    y_true = data["spam"]

    y_pred = model.predict(X)

    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, target_names = ["Ham", "Spam"])
    confusion_mat = confusion_matrix(y_true, y_pred)

    return accuracy, report, confusion_mat

# Function to validate the model using cross-validation
def cross_validate_model(
    model: MODEL,
    vectorizer: TfidfVectorizer,
    data: pd.DataFrame,
) -> float:
    X = vectorizer.transform(data["text"])
    y = data["spam"]

    scores = cross_val_score(model, X, y, cv = 5, scoring = "accuracy")

    return scores.mean()

# Function to score and evaluate model
def score_and_evaluate_model_and_get_accuracy(
    model: MODEL,
    vectorizer: TfidfVectorizer,
    check_data: pd.DataFrame,
    check_data_type: str,
) -> float:
    check_accuracy, check_classification_report, check_confusion_mat = score_model(model, vectorizer, check_data)
    print(f"{check_data_type} Accuracy: {round(100 * check_accuracy, 2)} %")
    print(f"{check_data_type} Classification Report:\n{check_classification_report}")
    print(f"{check_data_type} Confusion Matrix:")
    return check_accuracy

# Selecting Best Model through Accuracy 

In [5]:
best_model = None
best_model_vectorizer = None
best_accuracy = 0

# Fitting Model for Multinomial Naive Bayes

In [6]:
model, vectorizer = fit_model(TRAIN_DATA, TfidfVectorizer(), MODELS["Multinomial Naive Bayes"])

# Scoring and Evaluating on Train Data for Multinomial Naive Bayes

In [7]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, TRAIN_DATA, "Train Data")

Train Data Accuracy: 93.98 %
Train Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.93      1.00      0.96      3328
        Spam       1.00      0.75      0.86      1075

    accuracy                           0.94      4403
   macro avg       0.96      0.88      0.91      4403
weighted avg       0.94      0.94      0.94      4403

Train Data Confusion Matrix:


# Scoring and Evaluating on Validation Data for Multinomial Naive Bayes

In [8]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, VALIDATION_DATA, "Validation Data")

Validation Data Accuracy: 90.55 %
Validation Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.89      1.00      0.94       412
        Spam       1.00      0.62      0.77       138

    accuracy                           0.91       550
   macro avg       0.94      0.81      0.85       550
weighted avg       0.92      0.91      0.90       550

Validation Data Confusion Matrix:


# Scoring and Evaluating on Test Data for Multinomial Naive Bayes

In [9]:
# Score on test
test_accuracy = score_and_evaluate_model_and_get_accuracy(model, vectorizer, TEST_DATA, "Test Data")

# Select the best model based on accuracy
if test_accuracy > best_accuracy:
    best_accuracy = test_accuracy
    best_model = model
    best_model_vectorizer = vectorizer

Test Data Accuracy: 89.29 %
Test Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.87      1.00      0.93       397
        Spam       1.00      0.62      0.76       154

    accuracy                           0.89       551
   macro avg       0.94      0.81      0.85       551
weighted avg       0.91      0.89      0.88       551

Test Data Confusion Matrix:


# Fitting Model for Random Forest (If Necessary)

In [10]:
model, vectorizer = fit_model(TRAIN_DATA, TfidfVectorizer(), MODELS["Random Forest"])

# Scoring and Evaluating on Train Data for Random Forest

In [11]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, TRAIN_DATA, "Train Data")

Train Data Accuracy: 100.0 %
Train Data Classification Report:
              precision    recall  f1-score   support

         Ham       1.00      1.00      1.00      3328
        Spam       1.00      1.00      1.00      1075

    accuracy                           1.00      4403
   macro avg       1.00      1.00      1.00      4403
weighted avg       1.00      1.00      1.00      4403

Train Data Confusion Matrix:


# Scoring and Evaluating on Validation Data for Random Forest

In [12]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, VALIDATION_DATA, "Validation Data")

Validation Data Accuracy: 98.18 %
Validation Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       412
        Spam       1.00      0.93      0.96       138

    accuracy                           0.98       550
   macro avg       0.99      0.96      0.98       550
weighted avg       0.98      0.98      0.98       550

Validation Data Confusion Matrix:


# Scoring and Evaluating on Test Data for Random Forest

In [13]:
# Score on test
test_accuracy = score_and_evaluate_model_and_get_accuracy(model, vectorizer, TEST_DATA, "Test Data")

# Select the best model based on accuracy
if test_accuracy > best_accuracy:
    best_accuracy = test_accuracy
    best_model = model
    best_model_vectorizer = vectorizer

Test Data Accuracy: 96.91 %
Test Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.96      1.00      0.98       397
        Spam       0.99      0.90      0.94       154

    accuracy                           0.97       551
   macro avg       0.98      0.95      0.96       551
weighted avg       0.97      0.97      0.97       551

Test Data Confusion Matrix:


# Fitting Model for Support Vector Machine

In [14]:
model, vectorizer = fit_model(TRAIN_DATA, TfidfVectorizer(), MODELS["Support Vector Machine"])

# Scoring and Evaluating on Train Data for Support Vector Machine

In [15]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, TRAIN_DATA, "Train Data")

Train Data Accuracy: 100.0 %
Train Data Classification Report:
              precision    recall  f1-score   support

         Ham       1.00      1.00      1.00      3328
        Spam       1.00      1.00      1.00      1075

    accuracy                           1.00      4403
   macro avg       1.00      1.00      1.00      4403
weighted avg       1.00      1.00      1.00      4403

Train Data Confusion Matrix:


# Scoring and Evaluating on Validation Data for Support Vector Machine

In [16]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, VALIDATION_DATA, "Validation Data")

Validation Data Accuracy: 99.09 %
Validation Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      1.00      0.99       412
        Spam       1.00      0.96      0.98       138

    accuracy                           0.99       550
   macro avg       0.99      0.98      0.99       550
weighted avg       0.99      0.99      0.99       550

Validation Data Confusion Matrix:


# Scoring and Evaluating on Test Data for Support Vector Machine

In [17]:
# Score on test
test_accuracy = score_and_evaluate_model_and_get_accuracy(model, vectorizer, TEST_DATA, "Test Data")

# Select the best model based on accuracy
if test_accuracy > best_accuracy:
    best_accuracy = test_accuracy
    best_model = model
    best_model_vectorizer = vectorizer

Test Data Accuracy: 99.09 %
Test Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      1.00      0.99       397
        Spam       1.00      0.97      0.98       154

    accuracy                           0.99       551
   macro avg       0.99      0.98      0.99       551
weighted avg       0.99      0.99      0.99       551

Test Data Confusion Matrix:


# Saving Best Model and Vectorizer to a Pickle File

In [18]:
joblib.dump(best_model, 'best_model.pkl')
joblib.dump(best_model_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']