# Mail Spam Prediction

### Importing the libraries

In [1]:
# Data handling and numerics.
import pandas as pd
import numpy as np
# Plotting.
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to subsequent plots.
sns.set_style('whitegrid')

### Importing the dataset

In [2]:
# Read the labelled messages from a CSV file located next to the notebook.
df = pd.read_csv('mail_data.csv')

In [3]:
# Preview the first rows to see the column layout and sample messages.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(5572, 2)

In [5]:
# Count how many messages fall into each category to check class balance.
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [6]:
# Number of missing entries per column.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [7]:
# Summary of column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### Encode the Category column as numeric values (0 and 1)
* 1 for spam
* 0 for ham

In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Category column and swap the text labels for the
# integer codes it assigns, overwriting the column in place.
le = LabelEncoder()
encoded_category = le.fit_transform(df['Category'])
df['Category'] = encoded_category

In [9]:
# Preview the frame again to check the Category column after encoding.
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Separating the data into message texts and labels

In [10]:
# X holds the raw message text, y the corresponding category label.
X, y = df['Message'], df['Category']

### Splitting the data into train and test sets

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

### Feature Extraction

##### Transform the text data to feature vectors that can be used as input to the machine learning models

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `lowercase` takes a boolean. The string 'True' only worked by being truthy
# and is rejected by scikit-learn releases that validate parameter types.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vectorizer on the training messages only, then apply the same fitted
# transform to the test messages so both share one feature space.
X_train_feature = feature_extraction.fit_transform(X_train)
X_test_feature = feature_extraction.transform(X_test)

### Fitting different models to the training set

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier, XGBRFClassifier

#### Defining a function fit_model for model fitting

In [14]:
def fit_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test-set results.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels, used for the printed predictions and
        for ``model.score``.

    Returns
    -------
    None
        The predictions and the score are printed, not returned.
    """
    model.fit(X_train, y_train)
    # Evaluate on the X_test argument rather than a notebook-level global, so
    # the function works with whichever split the caller passes in.
    print('The predicted values:\n ', model.predict(X_test))
    print('\nThe accuracy score: ', model.score(X_test, y_test))

### 1. Logistic Regression

In [15]:
# Logistic regression with the estimator's default settings.
fit_model(
    model=LogisticRegression(),
    X_train=X_train_feature,
    X_test=X_test_feature,
    y_train=y_train,
    y_test=y_test,
)

The predicted values:
  [0 0 0 ... 0 0 1]

The accuracy score:  0.9524663677130045


### 2. Decision Tree Classifier

In [16]:
# Seed the tree so the fitted model and its accuracy are the same on every
# run; the value matches the seed used for the train/test split.
fit_model(DecisionTreeClassifier(random_state=2), X_train_feature, X_test_feature, y_train, y_test)

The predicted values:
  [0 0 0 ... 0 0 1]

The accuracy score:  0.9650224215246637


### 3. Random Forest Classifier

In [17]:
# Seed the forest so the fitted model and its accuracy are the same on every
# run; the value matches the seed used for the train/test split.
fit_model(RandomForestClassifier(random_state=2), X_train_feature, X_test_feature, y_train, y_test)

The predicted values:
  [0 0 0 ... 0 0 1]

The accuracy score:  0.9739910313901345


### 4. SVC

In [18]:
# Support vector classifier with a linear kernel.
fit_model(
    model=SVC(kernel='linear'),
    X_train=X_train_feature,
    X_test=X_test_feature,
    y_train=y_train,
    y_test=y_test,
)

The predicted values:
  [0 0 0 ... 0 0 1]

The accuracy score:  0.97847533632287


### 5. XGB Classifier

In [19]:
# Gradient-boosted trees from xgboost with default settings.
fit_model(
    model=XGBClassifier(),
    X_train=X_train_feature,
    X_test=X_test_feature,
    y_train=y_train,
    y_test=y_test,
)

The predicted values:
  [0 0 0 ... 0 0 1]

The accuracy score:  0.9704035874439462


### 6. XGBRF Classifier

In [20]:
# xgboost's random-forest classifier with default settings.
fit_model(
    model=XGBRFClassifier(),
    X_train=X_train_feature,
    X_test=X_test_feature,
    y_train=y_train,
    y_test=y_test,
)

The predicted values:
  [0 0 0 ... 0 0 1]

The accuracy score:  0.9246636771300448
