#### Classifying Emails
<p>Classify whether an email is spam or not spam (ham) using a binary classification model.</p>

#### Step 1 - Import the modules

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

#### Step 2 - Load the dataset

In [23]:
# Load the labelled messages from spam.csv (path is relative to the working directory).
# The resulting frame has two columns: Label and EmailText.
data = pd.read_csv('spam.csv')

In [24]:
# Display the loaded frame (pandas truncates the middle rows).
# NOTE(review): the head()/tail() cells below show the same rows; this cell may be redundant.
data

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


#### Step 2b - Inspect the dataset (cleaning checks)

In [25]:
# Preview the first five rows to confirm the Label / EmailText columns loaded as expected.
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
# Preview the last five rows to confirm the file was read through to the end.
data.tail()

Unnamed: 0,Label,EmailText
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [27]:
# Report column dtypes and non-null counts; this is the quickest check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Label      5572 non-null   object
 1   EmailText  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [28]:
# Summary statistics; for these object columns that means count, unique, top and freq.
# NOTE(review): EmailText has fewer unique values than rows, so the data contains
# duplicate messages. Consider whether duplicates should be dropped before splitting.
data.describe()

Unnamed: 0,Label,EmailText
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [29]:
# Element-wise missing-value mask (True where a cell is null).
# NOTE(review): this renders the full boolean frame, which cannot be checked by eye;
# data.isnull().sum() would give one missing-value count per column instead.
data.isnull()

Unnamed: 0,Label,EmailText
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
5567,False,False
5568,False,False
5569,False,False
5570,False,False


#### Step 3 - Split the dataset into features (input set) and labels (output set)

In [30]:
# Features are the raw message strings; targets are the ham/spam labels.
# Both are pulled out as NumPy arrays for scikit-learn.
X = data['EmailText'].to_numpy()
y = data['Label'].to_numpy()

<p>Splitting the dataset into X_train, X_test and y_train, y_test.</p>

In [31]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [32]:
# Show the training messages; at this point they are still raw strings (object array).
X_train

array(['No no:)this is kallis home ground.amla home town is durban:)',
       'I am in escape theatre now. . Going to watch KAVALAN in a few minutes',
       'We walked from my moms. Right on stagwood pass right on winterstone left on victors hill. Address is &lt;#&gt;',
       ...,
       "I was at bugis juz now wat... But now i'm walking home oredi... Ã\x8cÃ\x8f so late then reply... I oso saw a top dat i like but din buy... Where r Ã\x8c_ now? ",
       ":-) yeah! Lol. Luckily i didn't have a starring role like you!",
       'How dare you stupid. I wont tell anything to you. Hear after i wont talk to you:-.'],
      dtype=object)

#### Step 4 - Convert the text into integer token counts (bag-of-words)

In [33]:
# Bag-of-words encoding: each message becomes a sparse row of token counts.
convertor = CountVectorizer()
# Learn the vocabulary from the training messages only, then encode them.
# NOTE(review): X_train / X_test are rebound from text arrays to sparse matrices here, so
# this cell is presumably not safe to re-run without re-running the split cell first.
X_train = convertor.fit_transform(X_train)
# Encode the test messages with the vocabulary learned above (no refitting on test data).
X_test = convertor.transform(X_test)


In [34]:
# Print the stored entries of the sparse training matrix as (row, column) -> count.
print(X_train)

  (0, 4711)	2
  (0, 6732)	1
  (0, 3689)	2
  (0, 3827)	1
  (0, 3422)	2
  (0, 3203)	1
  (0, 906)	1
  (0, 6881)	1
  (0, 2465)	1
  (1, 898)	1
  (1, 3582)	2
  (1, 2605)	1
  (1, 6698)	1
  (1, 4761)	1
  (1, 3123)	1
  (1, 6814)	1
  (1, 7269)	1
  (1, 3839)	1
  (1, 2775)	1
  (1, 4429)	1
  (2, 3689)	1
  (2, 7289)	1
  (2, 7233)	1
  (2, 2973)	1
  (2, 4587)	1
  :	:
  (4454, 5827)	1
  (4454, 3819)	1
  (4454, 2301)	1
  (4454, 4914)	1
  (4454, 7235)	1
  (4455, 7552)	1
  (4455, 7579)	1
  (4455, 3302)	1
  (4455, 4045)	1
  (4455, 2277)	1
  (4455, 4120)	1
  (4455, 5713)	1
  (4455, 4187)	1
  (4455, 6335)	1
  (4456, 6814)	2
  (4456, 7579)	3
  (4456, 3466)	1
  (4456, 6639)	1
  (4456, 814)	1
  (4456, 6581)	1
  (4456, 7457)	2
  (4456, 961)	1
  (4456, 3324)	1
  (4456, 2141)	1
  (4456, 6436)	1


In [35]:
# Summary repr of the vectorized test set: shape, dtype and number of stored elements.
X_test

<1115x7619 sparse matrix of type '<class 'numpy.int64'>'
	with 13941 stored elements in Compressed Sparse Row format>

In [36]:
# Print the stored entries of the sparse test matrix as (row, column) -> count.
# NOTE(review): the output is long; the summary repr in the previous cell may be enough.
print(X_test)

  (0, 842)	1
  (0, 1880)	1
  (0, 3815)	1
  (0, 3962)	1
  (0, 5152)	1
  (0, 6019)	1
  (0, 6814)	1
  (0, 6847)	1
  (0, 7063)	1
  (1, 2733)	1
  (1, 4870)	1
  (1, 6696)	1
  (1, 7261)	1
  (2, 890)	1
  (2, 924)	1
  (2, 983)	1
  (2, 1010)	1
  (2, 1956)	1
  (2, 1996)	1
  (2, 2350)	2
  (2, 2675)	1
  (2, 2775)	1
  (2, 2894)	1
  (2, 3242)	1
  (2, 3357)	1
  :	:
  (1114, 402)	1
  (1114, 890)	1
  (1114, 924)	1
  (1114, 1540)	1
  (1114, 1897)	1
  (1114, 3068)	1
  (1114, 3072)	1
  (1114, 3302)	1
  (1114, 3689)	2
  (1114, 3700)	1
  (1114, 3927)	1
  (1114, 4329)	1
  (1114, 4916)	1
  (1114, 5992)	1
  (1114, 6432)	1
  (1114, 6576)	2
  (1114, 6690)	1
  (1114, 6696)	2
  (1114, 6814)	3
  (1114, 7284)	1
  (1114, 7309)	1
  (1114, 7355)	1
  (1114, 7365)	2
  (1114, 7579)	1
  (1114, 7584)	1


#### Step 6 - Creating the model

In [37]:
# Support vector classifier with an RBF kernel; the seed is pinned for reproducibility.
model = SVC(kernel='rbf', random_state=0)

In [38]:
# Echo the estimator configuration. The repr below omits kernel='rbf',
# presumably because it matches SVC's default kernel.
model

SVC(random_state=0)

#### Step 7 - Train and evaluate the model

In [39]:
# Train the classifier on the token-count matrix and the ham/spam labels.
# fit() returns the estimator itself, which is why the cell echoes the SVC repr.
model.fit(X_train, y_train)

SVC(random_state=0)

In [40]:
# Mean accuracy on the held-out test split (fraction of messages classified correctly).
accuracy = model.score(X_test, y_test)
print(f"Accuracy level : {accuracy}")

Accuracy level : 0.9766816143497757
