In [None]:
# Import data-handling, plotting, and scikit-learn modelling libraries
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score,recall_score

In [None]:
# Read the raw SMS dataset from disk; the file is decoded as latin1.
df = pd.read_csv('spam.csv', encoding='latin1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
# List the column labels of the freshly loaded frame.
df.columns


Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [None]:
# Summarise each column's dtype and non-null count, plus the frame's memory usage
# (structural overview, not descriptive statistics).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
# Report the frame's dimensions: row count first, then column count.
n_rows, n_cols = df.shape
print('rows =', n_rows)
print('columns =', n_cols)

rows = 5572
columns = 5


In [None]:
# Count the missing entries in every column.
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [None]:
# Share of missing entries per column, expressed as a percentage.
df.isna().mean() * 100

v1             0.000000
v2             0.000000
Unnamed: 2    99.102656
Unnamed: 3    99.784637
Unnamed: 4    99.892319
dtype: float64

**As we can see, there is a huge number of missing entries in the `Unnamed: 2`, `Unnamed: 3` and `Unnamed: 4` columns, i.e. more than 99% of their values are null, so we should remove these columns.**

In [None]:
# Remove the three overflow columns, each of which is more than 99% null.
# The labels are passed as a plain list (not a DataFrame slice), the result is
# rebound instead of mutated in place, and errors='ignore' lets this cell be
# re-run after the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Inspect the frame once the sparse columns have been removed.
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Confirm the frame's (rows, columns) after the drop.
df.shape

(5572, 2)

In [None]:
# Give the columns descriptive names: v1 holds the label and v2 the message text.
# Renaming through an explicit old-name -> new-name mapping (rather than assigning
# a positional list to df.columns) does not depend on column order or count.
df = df.rename(columns={'v1': 'spam/ham', 'v2': 'sms'})

In [None]:
# Inspect the frame with its new column names.
df

Unnamed: 0,spam/ham,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
#Convert the text data into numericals data
df.loc[df['spam/ham']== 'spam','spam/ham']=0
df.loc[df['spam/ham']== 'ham','spam/ham']=1

In [None]:
# Inspect the frame with the labels encoded as 0 (spam) / 1 (ham).
df

Unnamed: 0,spam/ham,sms
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will Ì_ b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [None]:
# Feature series: the raw message text.
x = df['sms']
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: sms, Length: 5572, dtype: object

In [None]:
# Target series: the encoded label (0 = spam, 1 = ham).
y = df['spam/ham']
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: spam/ham, Length: 5572, dtype: object

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Sizes of the full feature series and of its train / test partitions.
for split in (x, x_train, x_test):
    print(split.shape)

(5572,)
(4457,)
(1115,)


In [None]:
# Training messages (raw text, original row index preserved).
x_train

3075    Mum, hope you are having a great day. Hoping t...
1787                           Yes:)sura in sun tv.:)lol.
1614    Me sef dey laugh you. Meanwhile how's my darli...
4304                Yo come over carlos will be here soon
3266                    Ok then i come n pick u at engin?
                              ...                        
789                          Gud mrng dear hav a nice day
968             Are you willing to go for aptitude class.
1667    So now my dad is gonna call after he gets out ...
3321    Ok darlin i supose it was ok i just worry too ...
1688                     Nan sonathaya soladha. Why boss?
Name: sms, Length: 4457, dtype: object

In [None]:
# Held-out messages (raw text, original row index preserved).
x_test

2632                       I WILL CAL YOU SIR. In meeting
454     Loan for any purpose å£500 - å£75,000. Homeown...
983     LOOK AT THE FUCKIN TIME. WHAT THE FUCK YOU THI...
1282    Ever green quote ever told by Jerry in cartoon...
4610                                  Wat time Ì_ finish?
                              ...                        
4827    Lol no. Just trying to make your day a little ...
5291      Xy trying smth now. U eat already? We havent...
3325    Huh so fast... Dat means u havent finished pai...
3561    Still chance there. If you search hard you wil...
1136    Dont forget you can place as many FREE Request...
Name: sms, Length: 1115, dtype: object

In [None]:
# Training labels (0 = spam, 1 = ham); still object dtype at this point.
y_train

3075    1
1787    1
1614    1
4304    1
3266    1
       ..
789     1
968     1
1667    1
3321    1
1688    1
Name: spam/ham, Length: 4457, dtype: object

In [None]:
# Held-out labels (0 = spam, 1 = ham); still object dtype at this point.
y_test

2632    1
454     0
983     1
1282    1
4610    1
       ..
4827    1
5291    1
3325    1
3561    1
1136    0
Name: spam/ham, Length: 1115, dtype: object

**Machine learning algorithms work on numerical values, so we have to convert all the text data into numbers. To do so, we use `TfidfVectorizer` from scikit-learn's `feature_extraction.text` module.**

In [None]:
# TF-IDF feature extractor: lowercases the text, removes English stop words, and
# keeps every term that occurs in at least one document (min_df=1).
feat_vect = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
feat_vect

In [None]:
# The label series are object dtype after the in-place encoding, so cast both
# splits to integers before handing them to the classifier.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [None]:
x_test_vec=feat_vect.fit_transform(x_test)

In [None]:
x_train_vec=feat_vect.transform(x_train)

In [None]:
# Plain-text dump of the raw training messages (the text itself is not changed by vectorizing).
print(x_train)

3075    Mum, hope you are having a great day. Hoping t...
1787                           Yes:)sura in sun tv.:)lol.
1614    Me sef dey laugh you. Meanwhile how's my darli...
4304                Yo come over carlos will be here soon
3266                    Ok then i come n pick u at engin?
                              ...                        
789                          Gud mrng dear hav a nice day
968             Are you willing to go for aptitude class.
1667    So now my dad is gonna call after he gets out ...
3321    Ok darlin i supose it was ok i just worry too ...
1688                     Nan sonathaya soladha. Why boss?
Name: sms, Length: 4457, dtype: object


In [None]:
# Show both sparse TF-IDF matrices; their column counts must match (shared vocabulary).
x_train_vec,x_test_vec

(<4457x3238 sparse matrix of type '<class 'numpy.float64'>'
 	with 26318 stored elements in Compressed Sparse Row format>,
 <1115x3238 sparse matrix of type '<class 'numpy.float64'>'
 	with 8720 stored elements in Compressed Sparse Row format>)

In [None]:
# Print the non-zero entries of the training matrix as (row, term index) -> TF-IDF weight.
print(x_train_vec)

  (0, 2813)	0.19191459074830192
  (0, 1924)	0.31862789566493077
  (0, 1841)	0.33730031597995963
  (0, 1686)	0.24768754419800812
  (0, 1436)	0.33730031597995963
  (0, 1433)	0.22901512388297932
  (0, 1377)	0.27960825756341023
  (0, 1321)	0.4537455817484589
  (0, 883)	0.37813022727202467
  (0, 294)	0.30537960261455754
  (1, 3212)	0.3742516592048949
  (1, 2951)	0.4468807181403369
  (1, 2742)	0.5206949506203641
  (1, 2731)	0.4990448710462816
  (1, 1722)	0.3742516592048949
  (2, 1647)	1.0
  (3, 3218)	0.5969552853954876
  (3, 2611)	0.4472956302720023
  (3, 770)	0.37133915462752837
  (3, 664)	0.5528817582616348
  (4, 2132)	0.508948564340775
  (4, 2027)	0.3608949927886259
  (4, 1053)	0.6737740520045983
  (4, 770)	0.395922580665547
  (5, 3066)	0.38932140577104934
  :	:
  (4451, 2859)	0.5277021905869128
  (4451, 2132)	0.4916694357883569
  (4451, 620)	0.5172350821764307
  (4452, 1969)	0.3743901393984206
  (4452, 1910)	0.499656507661237
  (4452, 1373)	0.44960002365839075
  (4452, 1336)	0.4203188575

In [None]:
# Print the non-zero entries of the test matrix as (row, term index) -> TF-IDF weight.
print(x_test_vec)

  (0, 1840)	0.5470775878936475
  (0, 2563)	0.5149396017383439
  (0, 633)	0.6599570587439945
  (1, 2813)	0.1623769474001259
  (1, 121)	0.26958776220642855
  (1, 22)	0.23137073373406264
  (1, 1205)	0.15919444481136824
  (1, 1394)	0.4137552551149475
  (1, 2335)	0.25837851868099354
  (1, 2224)	0.26958776220642855
  (1, 3106)	0.2496839539259005
  (1, 2801)	0.26958776220642855
  (1, 1428)	0.26958776220642855
  (1, 1)	0.2226761689789696
  (1, 238)	0.26958776220642855
  (1, 210)	0.21557219231256677
  (1, 2261)	0.26958776220642855
  (1, 1714)	0.2496839539259005
  (2, 2835)	0.39312526272940845
  (2, 1223)	0.43336283778599577
  (2, 2860)	0.3362295403293851
  (2, 1225)	0.5554066201719753
  (2, 1729)	0.48592423391824763
  (3, 2010)	0.28009022728448785
  (3, 1336)	0.23561627348325437
  :	:
  (1111, 2594)	0.4675085813886858
  (1111, 2942)	0.4099856948072808
  (1111, 1375)	0.4169391674217092
  (1112, 2080)	0.4448076693011006
  (1112, 1131)	0.40271290258485304
  (1112, 1455)	0.3780890425792432
  (1112,

In [None]:
# Logistic-regression classifier with scikit-learn's default hyper-parameters.
logi = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF features of the training split.
logi.fit(x_train_vec,y_train)

In [None]:
# Mean accuracy on the training split (an optimistic, in-sample figure).
logi.score(x_train_vec,y_train)

0.9748709894547902

In [None]:
# Mean accuracy on the held-out split.
logi.score(x_test_vec,y_test)

0.9479820627802691

In [None]:
# Predicted class (0 = spam, 1 = ham) for every held-out message.
pred_logi = logi.predict(x_test_vec)
pred_logi

array([1, 0, 1, ..., 1, 1, 1])

In [None]:
# Accuracy computed from the stored predictions; matches logi.score on the test split.
accuracy_score(y_test,pred_logi)

0.9479820627802691

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels, both in
# label order 0 (spam) then 1 (ham). The top-right cell therefore counts spam
# messages that were predicted as ham.
confusion_matrix(y_test,pred_logi)

array([[ 98,  57],
       [  1, 959]])

In [None]:
# Per-class precision / recall / F1. Class 0 is spam and class 1 is ham, so the
# recall reported for class 0 is the fraction of spam messages that were caught.
print(classification_report(y_test,pred_logi))

              precision    recall  f1-score   support

           0       0.99      0.63      0.77       155
           1       0.94      1.00      0.97       960

    accuracy                           0.95      1115
   macro avg       0.97      0.82      0.87      1115
weighted avg       0.95      0.95      0.94      1115

