In [1]:
import pandas as pd
import numpy as np
import string
import nltk
import joblib
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
df = pd.read_csv('../data/spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

In [6]:
tfidf = joblib.load('../models/tfidf_vectorizer.pkl')
X_tfidf = tfidf.transform(df['message'])

In [21]:
try:
    tfidf = joblib.load(os.path.join(model_dir, 'tfidf_vectorizer.pkl'))
    print("Loaded existing tfidf_vectorizer.pkl")
except FileNotFoundError:
    print("tfidf_vectorizer.pkl not found. Creating a new one.")
    tfidf = TfidfVectorizer(stop_words='english')
    X_tfidf = tfidf.fit_transform(df['message'])
    joblib.dump(tfidf, os.path.join(model_dir, 'tfidf_vectorizer.pkl'))
    print("Saved new tfidf_vectorizer.pkl")

Loaded existing tfidf_vectorizer.pkl


In [8]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [11]:
param_grid = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01]}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END .....................................C=0.1, gamma=1; total time=  52.1s
[CV] END .....................................C=0.1, gamma=1; total time=  52.5s
[CV] END .....................................C=0.1, gamma=1; total time=  52.3s
[CV] END .....................................C=0.1, gamma=1; total time=  52.1s
[CV] END .....................................C=0.1, gamma=1; total time=  51.9s
[CV] END ...................................C=0.1, gamma=0.1; total time=  36.1s
[CV] END ...................................C=0.1, gamma=0.1; total time=  36.6s
[CV] END ...................................C=0.1, gamma=0.1; total time=  36.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=  36.4s
[CV] END ...................................C=0.1, gamma=0.1; total time=  36.8s
[CV] END ..................................C=0.1, gamma=0.01; total time=  32.5s
[CV] END ..................................C=0.1,

In [22]:
X_tfidf = tfidf.transform(df['message'])

In [23]:
X = X_tfidf  # Transformed message data
y = df['label'].map({'ham': 0, 'spam': 1})  # Convert 'ham' and 'spam' to binary 0, 1

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
lr = LogisticRegression()
lr.fit(X_train, y_train)


In [26]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [27]:
svc = SVC()
svc.fit(X_train, y_train)

In [28]:
joblib.dump(lr, os.path.join(model_dir, 'spam_sms_model.pkl'))  # Save Logistic Regression model
print("Model saved as spam_sms_model.pkl")

Model saved as spam_sms_model.pkl
