Importing needed libraries 

In [74]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

Data collection and preprocessing 

In [75]:
# Load the raw mail dataset from its CSV file into a DataFrame.
# NOTE(review): absolute '/content/...' path - presumably a Colab upload location; adjust when running elsewhere.
mail_data = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')

In [76]:
# Preview the first five rows of the freshly loaded data
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [77]:
# Preview the last five rows of the freshly loaded data
mail_data.tail(n=5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [78]:
# Swap every missing (null) cell for an empty string; non-null cells are kept as they are
mail_data = mail_data.where(mail_data.notnull(), '')

In [79]:
# Dimensions of the DataFrame as (rows, columns)
mail_data.shape

(5572, 2)

In [80]:
# Encode the target column numerically: spam -> 0, ham -> 1
for label, code in (('spam', 0), ('ham', 1)):
    is_label = mail_data['Category'] == label
    mail_data.loc[is_label, 'Category'] = code

In [81]:
# Preview the first five rows again to confirm the Category labels are now 0/1
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [82]:
# Split the frame into the message text (X) and the encoded label (Y)
X, Y = mail_data['Message'], mail_data['Category']

In [84]:
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads in the cells below
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [85]:
# Show the English stop-word list; these are the words the stemming step filters out
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [86]:
# Stemming
port_stem = PorterStemmer()

# Build the stop-word lookup once instead of calling stopwords.words('english')
# for every token; a set also gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise one message into a space-separated string of word stems.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the text, split it on whitespace, drop English stop words, and
    reduce each remaining word to its Porter stem.

    Parameters
    ----------
    content : str
        Raw message text.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces
        (an empty string when no token survives).
    """
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    stems = [port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [87]:
# Sanity check: run the stemming function on a single sample message
stemming(" U don't know how stubborn! I am. I didn't even want to go to the hospital. I kept telling Mark I'm not a weak sucker. Hospitals are for weak suckers.")

'u know stubborn even want go hospit kept tell mark weak sucker hospit weak sucker'

In [99]:
# Run the stemming function over every message, replacing X with the cleaned text
X = X.map(stemming)

In [102]:
# Converting the textual data to numerical data (TF-IDF features)
# NOTE(review): the vectorizer is fitted on all rows of X before the train/test
# split in a later cell, so vocabulary/IDF statistics of the test rows are
# part of the features the model is trained on.
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and builds the feature matrix in one pass,
# instead of processing the whole corpus twice with fit() followed by transform()
X = vectorizer.fit_transform(X)
print(X)

  (0, 6135)	0.23616756554565888
  (0, 5957)	0.19460776670194488
  (0, 4091)	0.24055424511726686
  (0, 2932)	0.28506031120996994
  (0, 2827)	0.3522946643655987
  (0, 2245)	0.19460776670194488
  (0, 2208)	0.1649859743034801
  (0, 2171)	0.14066343975170745
  (0, 1169)	0.27282796669086984
  (0, 964)	0.29761995607435426
  (0, 738)	0.29761995607435426
  (0, 736)	0.33630333732147566
  (0, 379)	0.26350491969128115
  (0, 190)	0.3522946643655987
  (1, 6056)	0.44597659211687757
  (1, 3785)	0.564793662023427
  (1, 3760)	0.2809319560263009
  (1, 2960)	0.4218982744467187
  (1, 2794)	0.4745440766926726
  (2, 6101)	0.21369536090695063
  (2, 6067)	0.16011115093017092
  (2, 5695)	0.13727833879237866
  (2, 5536)	0.2476330040187214
  (2, 5420)	0.1320245012320154
  (2, 5131)	0.22058857181065877
  :	:
  (5567, 784)	0.15667410716389937
  (5567, 724)	0.2920652264491494
  (5568, 2457)	0.37457404553349233
  (5568, 2171)	0.29597505521175127
  (5568, 1996)	0.5740672391289212
  (5568, 1704)	0.6652366917601374
  (5

In [103]:
# Hold out 20% of the rows as a test set; stratify on Y and fix the seed
# so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [104]:
# Convert the label Series produced by train_test_split to integer dtype.
# Fix: the original read from Y_train1 / Y_test1, which are not defined anywhere
# in the visible notebook and raise NameError on a fresh Restart & Run All.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [105]:
# Logistic regression classifier created with no arguments (library defaults)
model= LogisticRegression()

In [107]:
# Train the classifier on the training features and their labels
model.fit(X_train,Y_train)

LogisticRegression()

In [108]:
# Predict labels for the same rows the model was trained on
prediction_on_training_data = model.predict(X_train)

In [109]:
# Fraction of training rows whose predicted label matches the true label
accuracy_on_training_set1 = accuracy_score(Y_train,prediction_on_training_data)

In [110]:
# Report the training-set accuracy computed in the previous cell
print('accuracy on training data : ',accuracy_on_training_set1)

accuracy on training data :  0.9715054969710568


In [111]:
# Predict labels for the held-out test rows
prediction_on_test_data = model.predict(X_test)

In [113]:
# NOTE(review): despite the variable name, this is the TEST-set accuracy
# (computed from Y_test and prediction_on_test_data); consider renaming it.
accuracy_on_training_set = accuracy_score(Y_test,prediction_on_test_data)

In [115]:
# NOTE(review): accuracy_on_training_set is a misleading name - this cell
# labels and prints it as the test-set accuracy.
print('accuracy on test data : ',accuracy_on_training_set)

accuracy on test data :  0.9713004484304932


Building a predictive system