In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [None]:
df = pd.read_csv("SMSSpamCollection",sep = '\t',names = ['label','message'])
df = df[['label','message']]

df.columns = ['labels','text']

df['labels'] = df['labels'].map({'ham':0,'spam':1})
df.head()

x_train,x_test,y_train,y_test = train_test_split(df['text'],df['labels'],test_size=0.2,random_state=42)

vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=3000
)

x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

param_grid = {'C':[0.1,1,10],'gamma':[0.1,0.01,0.001]}

grid_search = GridSearchCV(SVC(kernel='rbf',class_weight='balanced'),param_grid,cv = 3,scoring='f1',n_jobs=-1)
grid_search.fit(x_train_vec,y_train)

from sklearn.metrics import classification_report, confusion_matrix

best_model = grid_search.best_estimator_

y_pred = best_model.predict(x_test_vec)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


import joblib

joblib.dump(best_model, "svm_spam_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.97      0.94      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

[[961   5]
 [  9 140]]


['tfidf_vectorizer.pkl']

: 