Import required libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import xgboost as xgb
import joblib

Save csv file as dataframe

In [3]:
dataframe = pd.read_csv('spam.csv')

Initializing TfidfVectorizer to vectorize text string and assign relevance to words

In [4]:
vectorizer = TfidfVectorizer(max_features=2000)

Fitting vectorizer to string; Dataframe split into X and y, X-> input feature, y-> output 

In [5]:
X = vectorizer.fit_transform(dataframe['query']) 
y = dataframe['label']

Splitting dataset into 70% train and 30% test sets

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Initializing XGBClassifier

In [7]:
model = xgb.XGBClassifier(random_state = 42, use_label_encoder = False, eval_metric = 'logloss')

Setting parameters for GridSearchCV 

In [12]:
param_grid = {
    'n_estimators': [50, 100, 200], 
    'learning_rate': [0.01, 0.05, 0.1], 
    'max_depth': [3, 5, 7],  
    'subsample': [0.8, 1.0],  
    'colsample_bytree': [0.8, 1.0],  
    'gamma': [0, 0.1, 0.2],  
}

GridSearchCV used to find best hyperparameters for the model

In [14]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

Fit to training set

In [15]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters found:

In [16]:
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

Best parameters found:  {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}


Set best estimators and predict

In [17]:
best_model = grid_search.best_estimator_

In [18]:
y_pred = best_model.predict(X_test)

Model accuracy with tuned hyperparameters

In [19]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 96.23%


XGBClassifier without hyperparameter tuning:

In [8]:
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



Prediction:

In [9]:
y_pred = model.predict(X_test)

Model accuracy score:

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 96.74%


Choosing XGBClassifier without hyperparameter tuning for higher accuracy.

Saving model and vectorizer as pickle files:

In [11]:
joblib.dump(model, 'spam_classifier.pkl')

['spam_classifier.pkl']

In [12]:
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']