In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [13]:
# Load the SMS collection without a header row, so columns get integer labels.
# NOTE(review): the preview shows message text spread over many columns, which
# suggests unquoted commas inside messages split the rows -- confirm the file's
# real delimiter/quoting.
df_spam = pd.read_csv(
    'SMSSpamCollection.csv',
    header=None,
)
df_spam.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,ham,Go until jurong point,crazy.. Available only in bugis n great world...,,,,,,,,...,,,,,,,,,,
1,ham,Ok lar... Joking wif u oni...,,,,,,,,,...,,,,,,,,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,,,,,,...,,,,,,,,,,
3,ham,U dun say so early hor... U c already then say...,,,,,,,,,...,,,,,,,,,,
4,ham,Nah I don't think he goes to usf,he lives around here though,,,,,,,,...,,,,,,,,,,


In [4]:
# Rebuild each message from its comma-split pieces: drop the label column (0),
# then join the non-null fragments of every row with single spaces.
# An already-existing 'messages' column is excluded as well, so re-running this
# cell does not fold the previously joined text back into itself (selecting by
# position with iloc[:, 1:] would pick that column up on a second run).
df_spam['messages'] = (
    df_spam
    .drop(columns=[0, 'messages'], errors='ignore')
    .apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
)

In [5]:
# Working frame: the reconstructed text plus the label from column 0,
# with the label column renamed to 'result'.
df = df_spam[['messages', 0]].rename(columns={0: 'result'})

In [6]:
# Preview the first five rows of the text/label frame.
df.head(n=5)

Unnamed: 0,messages,result
0,Go until jurong point crazy.. Available only ...,ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,Nah I don't think he goes to usf he lives aro...,ham


In [7]:
# Inputs are the message texts; targets are the 'result' labels.
X = df['messages']
y = df['result']

# Keep 75% of the rows for training; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

# Fit the vocabulary and IDF weights on the training texts only, then reuse the
# fitted vectorizer for the test texts. fit_transform fits and transforms the
# training split in a single pass instead of processing it twice.
# Despite the _CV suffix, both matrices hold TfidfVectorizer output.
vectorizer = TfidfVectorizer()
X_train_CV = vectorizer.fit_transform(X_train)
X_test_CV = vectorizer.transform(X_test)

X_test_CV

<1384x7515 sparse matrix of type '<class 'numpy.float64'>'
	with 17223 stored elements in Compressed Sparse Row format>

In [8]:
# Logistic regression on the TF-IDF features

model = LogisticRegression(max_iter=150)
model.fit(X_train_CV, y_train)

# Score on the training split first, then on the held-out split.
train_score = model.score(X_train_CV, y_train)
test_score = model.score(X_test_CV, y_test)
print(train_score)
print(test_score)

0.9749397590361446
0.9710982658959537


In [9]:
# Perceptron on the TF-IDF features

# fit() returns the estimator, so construction and training can be chained.
ppn = Perceptron(random_state=0).fit(X_train_CV, y_train)
y_pred = ppn.predict(X_test_CV)
perceptron_accuracy = accuracy_score(y_test, y_pred)
print('Perceptron accuracy: %.5f' % perceptron_accuracy)


Perceptron accuracy: 0.98338


In [10]:
# Gaussian Naive Bayes
# The sparse TF-IDF matrices are converted to dense arrays for this model.
nb = GaussianNB().fit(X_train_CV.toarray(), y_train)
y_pred = nb.predict(X_test_CV.toarray())
nb_accuracy = accuracy_score(y_test, y_pred)
print('Naive Bayes accuracy: %.5f' % nb_accuracy)

Naive Bayes accuracy: 0.90318
