# Spam Email Detection Using RandomForestClassifier

In [104]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

## Importing Data

In [105]:
# Load the raw dataset from the local CSV file, decoding it as latin1.
df = pd.read_csv(
    "spam.csv",
    encoding="latin1",
)

In [106]:
# Keep every non-null cell and substitute an empty string for each missing one.
has_value = df.notna()
data = df.where(has_value, "")

In [107]:
# Preview the first ten rows of the null-filled frame.
data.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


## Cleaning the dataset

In [108]:
# Remove the three "Unnamed" columns; only the label and message text are
# used by the rest of the notebook.
# errors="ignore" makes this cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because the columns were already gone.
# The result is rebound to `data` instead of using inplace=True, so downstream
# cells keep reading the same name.
data = data.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    errors="ignore",
)

In [109]:
# Give the two remaining columns descriptive names (v1 -> Category, v2 -> Messages).
column_labels = {"v1": "Category", "v2": "Messages"}
data.rename(columns=column_labels, inplace=True)

In [110]:
# Encode the text labels as integers in place: spam -> 0, ham -> 1.
# Rows whose Category is neither value are left untouched.
for label_text, label_code in (("spam", 0), ("ham", 1)):
    is_label = data["Category"] == label_text
    data.loc[is_label, "Category"] = label_code

In [111]:
# Preview the first five rows after cleaning and label encoding.
data.head(n=5)

Unnamed: 0,Category,Messages
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


## Splitting the data into Training set and Test set 

In [112]:
# Select the model input (message text) and target (encoded label).
# X and Y were not assigned anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; the recorded split output (Name: Messages /
# Name: Category) shows these are the intended columns.
X = data["Messages"]
Y = data["Category"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [113]:
# Print the first rows of each split, one caption per split.
split_previews = (
    ("X_train:\n", X_train),
    ("X_test: \n", X_test),
    ("y_train \n", Y_train),
    ("y_test: \n", Y_test),
)
for caption, split in split_previews:
    print(caption, split.head())

X_train:
 1978    No I'm in the same boat. Still here at my moms...
3989    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935       They r giving a second chance to rahul dengra.
4078       O i played smash bros  &lt;#&gt;  religiously.
4086    PRIVATE! Your 2003 Account Statement for 07973...
Name: Messages, dtype: object
X_test: 
 3245    Funny fact Nobody teaches volcanoes 2 erupt, t...
944     I sent my scores to sophas and i had to do sec...
1044    We know someone who you know that fancies you....
2484    Only if you promise your getting out as SOON a...
812     Congratulations ur awarded either å£500 of CD ...
Name: Messages, dtype: object
y_train 
 1978    1
3989    0
3935    1
4078    1
4086    0
Name: Category, dtype: object
y_test: 
 3245    1
944     1
1044    0
2484    1
812     0
Name: Category, dtype: object


## Vectorizing Using TF-IDF Vectorizer

In [122]:
# TF-IDF vectorizer: lowercases text, removes English stop words, and caps the
# vocabulary at 5000 features.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
    max_features=5000,
)

# Fit the vectorizer on the training messages only, then apply that fitted
# transform to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels print as object dtype in the split preview; cast them to integers.
Y_train, Y_test = Y_train.astype("int"), Y_test.astype("int")

## Creating and Training the RandomForestClassifier model

In [120]:
# Random forests are stochastic (bootstrap samples and random feature
# subsets), so an unseeded model gives a slightly different accuracy on every
# run. Seed it with the same value used for the train/test split so that
# Restart & Run All reproduces the reported score.
rf_classifier_model = RandomForestClassifier(random_state=42)
rf_classifier_model.fit(X_train_features, Y_train)

## Predicting and finding Accuracy

In [121]:
# Predict labels for the held-out messages and report the share that match
# the true labels, as a percentage with two decimals.
Y_pred_rf = rf_classifier_model.predict(X_test_features)
accuracy_rf = accuracy_score(y_true=Y_test, y_pred=Y_pred_rf)
print(f"Accuracy: { accuracy_rf*100:.2f}")

Accuracy: 97.85
