In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


In [2]:
# Load the raw mail corpus from CSV into a DataFrame.
# NOTE(review): '/content/...' looks like a Colab-style absolute path — adjust when running elsewhere.
mail_dataset = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')

In [3]:
# Preview the first rows of the raw frame to check that the columns loaded as expected.
mail_dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Dimensions of the raw frame as (rows, columns).
mail_dataset.shape

(5572, 2)

In [5]:
# Count how many messages carry each Category label (shows the class balance).
mail_dataset['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [6]:
# Element-wise missing-value mask for the raw frame (True marks a null entry).
mail_dataset.isnull()

Unnamed: 0,Category,Message
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
5567,False,False
5568,False,False
5569,False,False
5570,False,False


In [8]:
# Tally the distinct per-row combinations of null flags across the columns.
mail_dataset.isna().value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Category,Message,Unnamed: 2_level_1
False,False,5572


In [9]:
# Number of missing entries in each column.
mail_dataset.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [10]:
# Build a cleaned copy in which every missing value is replaced by an empty string;
# the raw frame `mail_dataset` itself is left untouched.
mail_dataset_clean = mail_dataset.fillna('')

In [11]:
# Preview the first rows of the cleaned frame.
mail_dataset_clean.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Label encoding (in place on the cleaned frame):
#   spam -> 0
#   ham  -> 1
# The pairs are applied in this order, one boolean-mask assignment per label.
for category_name, category_code in (('spam', 0), ('ham', 1)):
    mail_dataset_clean.loc[
        mail_dataset_clean['Category'] == category_name, 'Category'
    ] = category_code

In [13]:
# Separate the inputs (message text) from the target (encoded category).
x, y = mail_dataset_clean['Message'], mail_dataset_clean['Category']

# Show both series, one after the other.
print(x, y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [17]:
# Train/test splitting: hold out 20% of the samples for testing.
# Stratifying on y keeps the label proportions the same in both parts, and
# the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)


In [18]:
# Shapes of the full input series followed by its train and test parts.
print(*(subset.shape for subset in (x, x_train, x_test)))

(5572,) (4457,) (1115,)


In [23]:
# Feature extraction: transform the raw message text into numerical TF-IDF
# feature vectors that can be used as input to the logistic regression.
# Text is lower-cased, English stop words are removed, and a term is kept
# when it occurs in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [24]:
# Fit the vectorizer on the training messages only and vectorise them in the same step.
x_train_feature = feature_extraction.fit_transform(raw_documents=x_train)
# Vectorise the test messages with the already-fitted vectorizer (no re-fitting).
x_test_feature = feature_extraction.transform(raw_documents=x_test)

In [25]:
# Cast both label series to an integer dtype.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [26]:
# Show the raw training messages followed by the raw test messages.
print(x_train, x_test, sep='\n')

5426        Oh yeah! And my diet just flew out the window
4724                     HELLO PEACH! MY CAKE TASTS LUSH!
536     Good afternoon, my love! How goes that day ? I...
3488                        Change windows logoff sound..
2551    Please sen :)my kind advice :-)please come her...
                              ...                        
1697    Sorry man, my stash ran dry last night and I c...
422     Someone has contacted our dating service and e...
4007    IM FINE BABES AINT BEEN UP 2 MUCH THO! SAW SCA...
3474                      You getting back any time soon?
3074           Somebody should go to andros and steal ice
Name: Message, Length: 4457, dtype: object
3978    Great NEW Offer - DOUBLE Mins & DOUBLE Txt on ...
1831                   That's the way you should stay oh.
3297    Hi there. We have now moved in2 our pub . Woul...
2072         Good night my dear.. Sleepwell&amp;Take care
4246                          Is toshiba portege m100 gd?
                             

In [27]:
# Inspect the vectorised training messages produced by the fitted vectorizer.
print(x_train_feature)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34895 stored elements and shape (4457, 7496)>
  Coords	Values
  (0, 4768)	0.2885879313347367
  (0, 7438)	0.2996693624522654
  (0, 2262)	0.49316930861935127
  (0, 3764)	0.22046319970004669
  (0, 2823)	0.5172500796081709
  (0, 7289)	0.5172500796081709
  (1, 3317)	0.3290434493347565
  (1, 4972)	0.49481520325330874
  (1, 1558)	0.42364007209989546
  (1, 6517)	0.49481520325330874
  (1, 4136)	0.4717788963273523
  (2, 3103)	0.17628376831968728
  (2, 841)	0.26799944639874834
  (2, 4099)	0.186263215205624
  (2, 3086)	0.27449720225122765
  (2, 2136)	0.180851695270251
  (2, 3398)	0.20665621299033204
  (2, 4269)	0.2543939099135892
  (2, 3118)	0.18009671431232455
  (2, 3935)	0.3671145612703168
  (2, 3722)	0.24768901862403342
  (2, 6641)	0.20096909705626312
  (2, 1430)	0.28509060215711635
  (2, 5837)	0.1845655907506494
  (2, 4943)	0.33789703751914013
  :	:
  (4454, 841)	0.21705430485365426
  (4454, 3514)	0.17954863693268575
  (4454, 7163)	

In [28]:
# Inspect the vectorised test messages (same feature space as the training matrix).
print(x_test_feature)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7643 stored elements and shape (1115, 7496)>
  Coords	Values
  (0, 45)	0.23885705786351533
  (0, 398)	0.2831628958086886
  (0, 1283)	0.19658332365071185
  (0, 1585)	0.2137047703002642
  (0, 2046)	0.1984270278883612
  (0, 2375)	0.466320953046431
  (0, 2906)	0.28559070500052114
  (0, 3154)	0.17077412764771363
  (0, 3912)	0.20823705036803863
  (0, 4365)	0.2003761424782757
  (0, 4419)	0.2358899506086862
  (0, 4625)	0.1629132197579507
  (0, 4755)	0.2137047703002642
  (0, 4827)	0.21680692811499552
  (0, 5030)	0.24210715613503428
  (0, 6513)	0.2657436287350355
  (0, 6885)	0.15260537497993798
  (1, 4768)	0.5087543968611168
  (1, 6271)	0.681815528764269
  (1, 7196)	0.5256391808173945
  (2, 1866)	0.24268764995095132
  (2, 2062)	0.44326719810193344
  (2, 3154)	0.2848556310871354
  (2, 3339)	0.27401894009219313
  (2, 3539)	0.47232298297806397
  :	:
  (1111, 6001)	0.2671794976463982
  (1111, 6591)	0.29456140032492945
  (1111, 7141)	0.190

In [30]:
# Create the classifier: a logistic regression model constructed with no
# arguments; it is not fitted yet at this point.

model=LogisticRegression()

In [31]:
# Train the logistic regression model on the vectorised training messages
# and their integer labels.
model.fit(x_train_feature,y_train)

In [32]:
# Evaluation on the training data: predict labels for the samples the model
# was fitted on and print the resulting accuracy score.
prediction_on_training_data = model.predict(X=x_train_feature)
accuracy_check_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)
print(accuracy_check_training_data)

0.9672425398249944


In [35]:
# Evaluation on the held-out test data: predict labels for the test
# features and print the resulting accuracy score.
prediction_on_test_data = model.predict(X=x_test_feature)
accuracy_check_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)
print(accuracy_check_test_data)

0.9704035874439462


In [38]:
# Classify a single hand-supplied message with the trained pipeline.
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

# Vectorise the text with the already-fitted extractor, then predict its label.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# A predicted label of 1 is reported as ham; anything else is reported as spam.
print("this is a ham mail" if prediction[0] == 1 else "this is a spam mail")

[0]
this is a spam mail
