In [30]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK stop-word corpus; the download reports "already up-to-date"
# when the corpus is present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Read the training split as ISO-8859-1 text and preview the first rows.
df_train = pd.read_csv(
    'spam_train.csv',
    encoding='ISO-8859-1',
)
df_train.head(n=5)

Unnamed: 0,sms,category
0,o see if you'd decided to do anything tomo. No...,ham
1,Pls go ahead with watts. I just wanted to be s...,ham
2,"Did I forget to tell you ? I want you , I need...",ham
3,07732584351 - Rodger Burns - MSG = We tried to...,spam
4,WHO ARE YOU SEEING?,ham


In [32]:
# Read the held-out split as ISO-8859-1 text and preview the first rows.
df_test = pd.read_csv(
    'spam_test.csv',
    encoding='ISO-8859-1',
)
df_test.head(n=5)

Unnamed: 0,sms,category
0,No no. I will check all rooms befor activities,ham
1,"My fri ah... Okie lor,goin 4 my drivin den go ...",ham
2,Gokila is talking with you aha:),ham
3,"Hi Shanil,Rakhesh here.thanks,i have exchanged...",ham
4,K.k.this month kotees birthday know?,ham


In [33]:
# Split text into runs of word characters. The pattern must be a raw string
# literal (r'\w+'); with the r placed inside the quotes ('r\w+') the regex
# means "a literal 'r' followed by word characters" and drops almost every token.
tokenizer = RegexpTokenizer(r'\w+')
# A set gives constant-time membership checks when filtering stop words.
stopwords_english = set(stopwords.words('english'))

#Tokenizing & stemming & removing stop words
# One shared stemmer: constructing a PorterStemmer per token is wasted work,
# since stemming a token does not depend on previously stemmed tokens.
_sms_stemmer = PorterStemmer()


def cleanSms(sms):
    """Normalize one SMS message for vectorization.

    The text is lowercased, tokenized with the module-level ``tokenizer``,
    stripped of English stop words (``stopwords_english``) and Porter-stemmed.

    Args:
        sms: The message text as a ``str``.

    Returns:
        A ``str`` of the remaining stemmed tokens joined by single spaces
        (empty when no token survives).
    """
    # Replace doubled HTML line breaks with a space so adjacent words stay separate.
    sms = sms.replace("<br /><br />", " ")
    sms = sms.lower()
    sms_tokens = tokenizer.tokenize(sms)
    sms_tokens_without_stopwords = [
        token for token in sms_tokens if token not in stopwords_english
    ]
    stemmed_tokens = [
        _sms_stemmer.stem(token) for token in sms_tokens_without_stopwords
    ]
    return ' '.join(stemmed_tokens)

In [34]:
#Clean the text and split each frame into features (x) and labels (y)
# Series.apply returns a new Series and leaves the frame unchanged, so the
# cleaned text must be captured here; reading df['sms'] afterwards would
# yield the raw, uncleaned messages.
x_train = df_train['sms'].apply(cleanSms).values
y_train = df_train['category'].values

x_test = df_test['sms'].apply(cleanSms).values
y_test = df_test['category'].values

In [35]:
#Vectorize the data
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: x_train / x_test are rebound from text arrays to TF-IDF matrices here,
# so re-running this cell requires re-running the cell that builds them first.
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='ISO-8859-1')
# The vocabulary and IDF weights are learned from the training text only and
# then reused unchanged for the test text.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [36]:
#Create model
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the vectorized training messages.
model = LogisticRegression(solver='lbfgs')
model.fit(X=x_train, y=y_train)

LogisticRegression()

In [37]:
#Predict Spam
# NOTE(review): the message is vectorized as-is, without cleanSms; confirm this
# matches how the training text was prepared.
spam_sample = ["you won $900 in the new lottery draw. Call +123456789."]
model.predict(vectorizer.transform(spam_sample))

array(['spam'], dtype=object)

In [38]:
#Predict Ham
# NOTE(review): the message is vectorized as-is, without cleanSms; confirm this
# matches how the training text was prepared.
ham_sample = ["Hello there. How are you doing?"]
model.predict(vectorizer.transform(ham_sample))

array(['ham'], dtype=object)

In [39]:
import joblib

# Persist the fitted classifier and the fitted vectorizer; predictions in this
# notebook use both, so both are written out.
model_path = 'spam_ham_model.pkl'
vectorizer_path = 'vectorizer.pkl'
joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)

['vectorizer.pkl']