In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Fetch the tab-separated spam/ham message dataset from GitHub and preview it.
SPAM_TSV_URL = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/spam.tsv'
data = pd.read_csv(SPAM_TSV_URL, sep='\t')
data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [3]:
# Class distribution: number of messages per label.
label_counts = data['label'].value_counts()
label_counts

ham     4825
spam     747
Name: label, dtype: int64

In [4]:
# Count missing values in every column (isna is the canonical spelling of isnull).
data.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [5]:
# (rows, columns) of the loaded dataset.
data.shape

(5572, 4)

# Dataset Balancing

In [6]:
# Select only the rows whose label is 'ham' and report how many there are.
is_ham = data['label'] == 'ham'
ham = data[is_ham]
ham.shape

(4825, 4)

In [7]:
# Select only the rows whose label is 'spam' and report how many there are.
is_spam = data['label'] == 'spam'
spam = data[is_spam]
spam.shape

(747, 4)

In [8]:
# Downsample ham to the spam row count so both classes end up the same size.
# random_state pins the draw: without it every kernel restart selects a
# different ham subset, so the balanced dataset and every downstream metric
# change from run to run. 0 matches the seed used for the train/test split.
ham = ham.sample(n=spam.shape[0], random_state=0)
ham.shape

(747, 4)

In [9]:
# Stack the ham rows on top of the spam rows and renumber the index from 0,
# discarding the row labels inherited from the original frame.
balanced_parts = [ham, spam]
data = pd.concat(balanced_parts, axis=0, ignore_index=True)
data.shape

(1494, 4)

In [10]:
# Spot-check ten randomly drawn rows.
data.sample(n=10)

Unnamed: 0,label,message,length,punct
363,ham,Lol I know! Hey someone did a great inpersonat...,83,3
1453,spam,Natalja (25/F) is inviting you to be her frien...,136,12
858,spam,Urgent Ur £500 guaranteed award is still uncla...,159,5
116,ham,K..k:)where are you?how did you performed?,42,6
446,ham,"What are you doing in langport? Sorry, but I'l...",133,6
1090,spam,Urgent! call 09066612661 from landline. Your c...,167,7
1080,spam,YES! The only place in town to meet exciting a...,109,5
730,ham,Have you bookedthe hut? And also your time off...,71,3
207,ham,"Tick, tick, tick .... Where are you ? I could ...",111,15
638,ham,Your account has been refilled successfully by...,174,16


# Data Preparation for Training

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Hold out 20% of the messages for evaluation. Stratifying on the label keeps
# the ham/spam ratio the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=data['label'],
)


In [14]:
# TF-IDF text features feeding a 100-tree random forest, wrapped in a single
# pipeline so raw message strings can be passed straight to fit/predict.
# random_state fixes the forest's bootstrap and feature sampling; without it
# the fitted model and the reported scores differ on every run.
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rfc', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)),
])

clf.fit(X_train, y_train)

In [15]:
# Predict labels for the held-out messages; the pipeline vectorizes the raw text itself.
y_pred = clf.predict(X_test)

In [16]:
# Per-class precision / recall / F1 on the held-out split.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

         ham       0.90      1.00      0.95       150
        spam       1.00      0.89      0.94       149

    accuracy                           0.95       299
   macro avg       0.95      0.95      0.95       299
weighted avg       0.95      0.95      0.95       299



In [18]:
# Smoke-test the fitted pipeline on two hand-written messages.
sample_messages = [
    'you have won lottery ticket worth $2000, please click here to claim',
    'hi, how are you doing today?',
]
clf.predict(sample_messages)

array(['spam', 'ham'], dtype=object)

# Save and Load Model


In [19]:
import pickle

# Persist the fitted pipeline. The context manager closes the file as soon as
# the dump finishes, so the bytes are flushed to disk before anything reads
# model.pkl back (a bare open() leaves closing to garbage collection).
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [20]:
# Reload the pipeline from disk. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from a tampered file.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Predict with the reloaded object (not the in-memory clf) so this cell
# actually verifies that the save/load round trip works.
model.predict(['you have won lottery ticket worth $2000, please click here to claim',
               'hi, how are you doing today?'])

array(['spam', 'ham'], dtype=object)