## Sample Project: Email Spam Check

In [1]:
#import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the spam dataset from the CSV file in the working directory into a DataFrame
data = pd.read_csv("spam.csv")

In [3]:
# .head() displays the first 5 rows of the DataFrame (the default when no count is given)
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# .shape is an attribute (not a method) holding the DataFrame dimensions as (rows, columns)
data.shape

(5572, 2)

In [5]:
# .tail() displays the last 5 rows of the DataFrame
data.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [6]:
# .info() prints a summary of the DataFrame: index range, column names, non-null counts and dtypes
# A Non-Null Count equal to the number of entries means that column has no missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# .describe() reports summary statistics per column; for text (object) columns these are
# count, unique (number of distinct values), top (most frequent value) and freq (how often top occurs)
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [8]:
# Separate the dataset into model inputs and targets:
# x holds the message text (features), y holds the ham/spam label
x, y = data["Message"], data["Category"]

In [9]:
# Split the messages and labels into training and testing sets (4 variables: x and y for train and for test).
# test_size (or train_size) is the fraction of rows assigned to that split, written as a decimal (0.2 = 20%).
# random_state pins the shuffle so the split, and every score computed from it, is reproducible
# on Restart & Run All.
# stratify=y keeps the ham/spam ratio the same in both splits; the classes are imbalanced
# (4825 of the 5572 messages are ham), so an unstratified split can skew the evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
# Number of messages in the training split
x_train.shape

(4457,)

In [11]:
# Number of messages in the testing split
x_test.shape

(1115,)

In [12]:
# Create the TF-IDF vectorizer (not fitted yet) that will turn message text into numeric features.
# min_df=1 keeps every term that appears in at least one message;
# lowercase=True lowercases the text before tokenizing.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    min_df=1,
)

In [13]:
# Convert the text data into numerical features (TF-IDF)

In [14]:
# Learn the vocabulary and IDF weights from the training messages only, then encode them.
x_train_feature = feature_extraction.fit_transform(x_train)
# Encode the test messages with the vocabulary learned above. Use transform (not fit_transform)
# so the columns line up with the training matrix and nothing is learned from the test set.
x_test_feature = feature_extraction.transform(x_test)

# The labels (y_train / y_test) are deliberately NOT passed through the vectorizer:
# calling fit_transform on them would re-fit feature_extraction on the words "ham"/"spam",
# discarding the message vocabulary, so it could no longer encode new messages for the model.
# scikit-learn classifiers accept the string labels directly.

In [15]:
# Print the sparse TF-IDF matrix as a sanity check.
# Each line reads "(row, column)  weight": the message index, the vocabulary term index, and its TF-IDF weight
print(x_train_feature)

  (0, 297)	0.20207372351025735
  (0, 2121)	0.25847543257087846
  (0, 1456)	0.2808254719369605
  (0, 5880)	0.20997752336719724
  (0, 6745)	0.26775153701816085
  (0, 4001)	0.19690358844839764
  (0, 3008)	0.12853719408139058
  (0, 192)	0.2808254719369605
  (0, 1596)	0.10872034110036567
  (0, 1163)	0.22305145828599685
  (0, 1665)	0.1700989893725156
  (0, 523)	0.20500704399560934
  (0, 4976)	0.11738225000680504
  (0, 3458)	0.18423020887885214
  (0, 6782)	0.24540149765207886
  (0, 1874)	0.20207372351025735
  (0, 6954)	0.07071544646026288
  (0, 4838)	0.2512803269770981
  (0, 2829)	0.21808097891440895
  (0, 7736)	0.2101212040434813
  (0, 3739)	0.0968573545171375
  (0, 6872)	0.12450139728287406
  (0, 2165)	0.26775153701816085
  (0, 2179)	0.1606931290521137
  (1, 4770)	0.16498303107070791
  :	:
  (4454, 848)	0.34831911434039614
  (4454, 2372)	0.4160903038216821
  (4454, 4832)	0.24369106743289456
  (4454, 3251)	0.2799876424743721
  (4454, 4243)	0.28067590685442695
  (4454, 7504)	0.274402159737450

In [16]:
# Train a LogisticRegression model on the training data

In [17]:
# Create a logistic-regression classifier with default settings
model = LogisticRegression()

# Train it on the TF-IDF features of the training messages and their ham/spam labels
model.fit(X=x_train_feature, y=y_train)


LogisticRegression()

In [18]:
# Model testing and evaluation on training data

In [19]:
# Predict labels for the same messages the model was trained on
pred_on_training_data = model.predict(x_train_feature)
# Fraction of training messages whose predicted label matches the true label
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=pred_on_training_data,
)

print(accuracy_on_training_data)

0.9741978909580435


In [20]:
# Model testing and evaluation on testing data

In [21]:
# Predict labels for the held-out test messages (not seen during training)
pred_on_testing_data = model.predict(x_test_feature)
# Fraction of test messages whose predicted label matches the true label
accuracy_on_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=pred_on_testing_data,
)

print(accuracy_on_testing_data)

0.9695067264573991
