In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # convert text data into numerical value
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Read the mail CSV into a DataFrame.
# NOTE(review): absolute path (looks like a Colab "/content" upload) — adjust when running elsewhere.
mail_dataset = pd.read_csv(filepath_or_buffer="/content/mail_data.csv")

In [8]:
# preview the first five rows of the loaded data
mail_dataset.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
 # dimensions of the dataframe as a (rows, columns) tuple
 mail_dataset.shape

(5572, 2)

In [5]:
# count the missing values in every column
mail_dataset.isna().sum()

Category    0
Message     0
dtype: int64

In [6]:
# summary of the dataframe: index range, columns, non-null counts, dtypes and memory usage
mail_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


# Label Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
# encoder used below to turn the Category labels into integer codes
le = LabelEncoder()

In [14]:
# Replace the text labels with integer codes: 0 for not spam, 1 for spam
mail_dataset['Category'] = le.fit_transform(
    mail_dataset['Category']
)

In [16]:
# preview the first five rows again, now with the encoded Category column
mail_dataset.head(n=5)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
# Separate the independent variable (features: the message text) from the
# dependent variable (label: the encoded category).
X, y = mail_dataset['Message'], mail_dataset['Category']

In [18]:
# display the feature Series (the message text)
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [19]:
# display the label Series (the encoded Category values)
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64

In [20]:
# Split features and labels into training and testing parts:
# 20% is held out for testing, and the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# shapes of the four pieces produced by the split, in the order train/test features, train/test labels
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4457,), (1115,), (4457,), (1115,))

# Feature Extraction

In [28]:
# Turn the raw message text into numeric TF-IDF feature vectors.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only, then apply the same
# fitted vectorizer to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)


In [30]:
# Print the TF-IDF training matrix in its sparse text form: each printed line is
# a (row, column) index pair followed by the weight stored at that position.
print(X_train_features)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

In [31]:
# Inspect y_train's dtype and non-null count to confirm the labels are integers;
# nothing is converted in this cell.
y_train.info()

<class 'pandas.core.series.Series'>
Index: 4457 entries, 1978 to 860
Series name: Category
Non-Null Count  Dtype
--------------  -----
4457 non-null   int64
dtypes: int64(1)
memory usage: 69.6 KB


# Train The Model

In [32]:
# logistic regression classifier with its default settings
model = LogisticRegression()

In [35]:
# fit the logistic regression model on the TF-IDF training features and their labels
model.fit(X_train_features, y_train)

In [36]:
# predicted labels for the same messages the model was trained on
train_pred = model.predict(X_train_features)

In [38]:
# fraction of training messages whose predicted label matches the true label
training_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)

In [40]:
# report the training accuracy
print("the accuracy on training data:", training_accuracy)

the accuracy on training data: 0.9661207089970832


# Model Evaluation on Testing Data

In [41]:
# predicted labels for the held-out test messages
test_pred = model.predict(X_test_features)

In [42]:
# fraction of test messages whose predicted label matches the true label
testing_accuracy = accuracy_score(y_true=y_test, y_pred=test_pred)

In [43]:
# report the testing accuracy
print("the accuracy on testing data:", testing_accuracy)

the accuracy on testing data: 0.967713004484305


**Making a Predictive System**

In [56]:
# Classify one hand-written sample message with the fitted vectorizer and model.
# NOTE(review): this text appears to match the start of dataset row 2, which is
# labelled spam — compare the prediction printed below against that label.
input_data=["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005."]
# convert text into numeric value with the vectorizer fitted on the training data
input_data=feature_extraction.transform(input_data)

# making prediction
prediction=model.predict(input_data)
print(prediction)

# 0 is the encoded label for ham (not spam) and 1 for spam; the messages
# previously read "Not Harm"/"Harm", which mislabelled both outcomes.
if prediction[0]==0:
  print("Email is Ham")
else:
  print("Email is Spam")

[0]
Email is Not Harm


In [65]:
# Run a second hand-written sample through the same fitted vectorizer and classifier.
# The raw text is converted straight into TF-IDF features.
input_data = feature_extraction.transform(
    ["FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"]
)

# predicted label for the single sample (0 = ham, 1 = spam)
prediction = model.predict(input_data)
print(prediction)

# translate the numeric label into a readable verdict
print("Email is  Ham" if prediction[0] == 0 else "Email is Spam")

[0]
Email is  Ham
