In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw mail dataset into a DataFrame.
# The CSV location lives in one named constant so it can be changed in a
# single place when the notebook is run from a different environment.
MAIL_DATA_CSV = '/content/drive/MyDrive/Colab Notebooks/Spam Mail Prediction/mail_data.csv'
raw_mail_data = pd.read_csv(MAIL_DATA_CSV)
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Replace missing values with an empty string.
# DataFrame.where keeps each cell where the mask is True and substitutes ''
# everywhere else, i.e. wherever the original value was null.
not_missing = raw_mail_data.notna()
df = raw_mail_data.where(not_missing, '')

In [None]:
# Preview the first rows of the null-filled frame
df.head()


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# (number of rows, number of columns) of the null-filled frame
df.shape

(5572, 2)

In [None]:
# Label-encode the Category column.
# LabelEncoder numbers the sorted class values starting at 0, so with the two
# expected labels this yields ham -> 0 and spam -> 1.
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'])

# The null-filling step turns a missing Category into '', which would become
# an extra class and shift the ham/spam codes that the prediction cell relies
# on (it treats 0 as ham). Fail loudly instead of training on shifted labels.
assert len(le.classes_) == 2, (
    f"expected exactly 2 categories, found {list(le.classes_)}"
)
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Separate the frame into the message text (model input) and the encoded
# category (target)
x, y = df['Message'], df['Category']

In [None]:
# Inspect the message texts that will be vectorized
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [None]:
# Inspect the encoded category labels used as the target
print(y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64


In [None]:
# Split the data into train and test sets (80% / 20%).
# stratify=y requests the same class proportions in both parts, and the fixed
# random_state makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
    stratify=y,
)
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4457,) (1115,)


In [None]:
# Feature extraction: turn the raw message text into TF-IDF vectors
feature = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer for the test text so both share the same feature columns.
x_train_feature = feature.fit_transform(x_train)
x_test_feature = feature.transform(x_test)
print(x_train_feature)

  (0, 7329)	0.39151450331197035
  (0, 2596)	0.5157331716075019
  (0, 4795)	0.6459507464707183
  (0, 6736)	0.40433070936297943
  (1, 1793)	0.43486660333673016
  (1, 4861)	0.596185515774092
  (1, 3112)	0.31103507183699425
  (1, 3758)	0.2826422333927384
  (1, 6887)	0.528038275197618
  (2, 2903)	1.0
  (3, 5081)	0.4169087023760639
  (3, 7198)	0.3971508483254661
  (3, 3373)	0.26859638268284747
  (3, 4040)	0.24099748417300504
  (3, 4692)	0.43001182720880177
  (3, 3909)	0.3260348921371232
  (3, 758)	0.37620667903348365
  (3, 3092)	0.32479862316475455
  (4, 3911)	0.2511783165875194
  (4, 3082)	0.4766800108257892
  (4, 5766)	0.6833422922401592
  (4, 6339)	0.37251069778964124
  (4, 3373)	0.23999265394731062
  (4, 4040)	0.21533285461106833
  (5, 4413)	0.4460096390714086
  :	:
  (4452, 1180)	0.8777703340143531
  (4452, 4770)	0.4790816639408472
  (4453, 1853)	0.5659242420057378
  (4453, 5704)	0.5659242420057378
  (4453, 1891)	0.4268643677817285
  (4453, 5098)	0.31370317391845537
  (4453, 5831)	0.280

In [None]:
# Train a logistic-regression classifier on the TF-IDF training features
model = LogisticRegression()
model.fit(x_train_feature, y_train)

# Evaluate on the training data.
# accuracy_score takes (y_true, y_pred); pass the true labels first so this
# call matches the documented order and the test-set call below.
pred_train = model.predict(x_train_feature)
acc = accuracy_score(y_train, pred_train)
print('Train accuracy score: ', acc)

# Evaluate on the held-out test data
pred_test = model.predict(x_test_feature)
acc = accuracy_score(y_test, pred_test)
print('Test accuracy score: ', acc)

Train accuracy score:  0.9681400044873233
Test accuracy score:  0.9641255605381166


In [None]:
# Single example message to classify
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# Vectorize the input with the already-fitted TF-IDF vectorizer
input_data_features = feature.transform(input_mail)

# Predict the class of the input mail and show the raw prediction array
prediction = model.predict(input_data_features)
print(prediction)

# A prediction of 0 is reported as ham; any other value is reported as spam
print('Ham mail' if prediction[0] == 0 else 'Spam mail')

[0]
Ham mail
