# Phishing Classifier 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

### Importing DataSet

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/ektanegi25/Phishing-Classifier/main/Phishing_Legitimate_full.csv")

In [3]:
# Overview of the dataset
df.head()

In [4]:
# display the rows and columns of the dataset
df.shape

In [5]:
# Dropping 'id' column because it is not required
df.drop(labels='id',axis=1,inplace=True)

In [6]:
# Installing sweetviz library to analyze and visualize the data

In [7]:
#!pip install sweetviz

In [8]:
#import sweetviz as sv

In [9]:
#report = sv.analyze(df)
#report.show_html("report.html")

In [10]:
#!pip install autoviz

In [11]:
#from autoviz.AutoViz_Class import AutoViz_Class
#AV = AutoViz_Class()

In [12]:
#auto_report = AV.AutoViz("https://raw.githubusercontent.com/ektanegi25/Phishing-Classifier/main/Phishing_Legitimate_full.csv", depVar='CLASS_LABEL')

In [13]:
# to see information of Data
df.info()

In [14]:
# Changing Dtype from int64, float64 to int32, float32 

In [15]:
float_cols = df.select_dtypes('float64').columns

In [16]:
for cols in float_cols:
    df[cols] = df[cols].astype('float32')

In [17]:
int_cols = df.select_dtypes('int64').columns

In [18]:
for cols in int_cols:
    df[cols] = df[cols].astype('int32')

In [19]:
# converted 64bits to 32bits
df.info()

In [20]:
# Renaming the label column from CLASS_LABEL to Labels

In [21]:
df.rename(columns={'CLASS_LABEL':'Labels'},inplace=True)

In [22]:
df.head()

In [23]:
# some random sample from the dataset

In [24]:
df.sample(10)

In [25]:
# Basic Statistics of the dataset

In [26]:
df.describe()

In [27]:
df['Labels'].value_counts().plot(kind='bar')
plt.show()

In [28]:
## Making function to see correlation of features

In [29]:
def corr_heatmap(data,idx_s,idx_e,figsize=None):
    """Plot an annotated correlation heatmap for a positional slice of columns plus the label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a 'Labels' column.
    idx_s, idx_e : int
        Positional column bounds, used as ``data.iloc[:, idx_s:idx_e]``.
    figsize : tuple of float, optional
        When given, a new figure of this size is created for the heatmap.
        When omitted, the heatmap is drawn on the current figure.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Work on an explicit copy: adding/removing columns on an ``iloc`` slice may
    # otherwise act on a view of the caller's frame (SettingWithCopyWarning).
    subset = data.iloc[:,idx_s:idx_e].copy()

    # Remove the 'id' column if the slice happens to include it.
    subset = subset.drop(columns='id',errors='ignore')

    # Make sure the label column is part of the correlation matrix.
    subset['Labels'] = data['Labels']

    if figsize is not None:
        plt.figure(figsize=figsize)

    sns.heatmap(subset.corr(),annot=True)
    plt.show()


In [30]:
plt.figure(figsize=(15,10))
pd.set_option('display.max_columns',None)
#plt.rcParams['figure.figsize'] = (15,10)
corr_heatmap(df,0,10)

In [31]:
corr_heatmap(df,10,20)

In [32]:
corr_heatmap(df,20,30)

In [33]:
corr_heatmap(df,30,40)

In [34]:
# Using sklearn for feature Selection

In [35]:
from sklearn.feature_selection import mutual_info_classif

In [36]:
X = df.drop(labels='Labels',axis=1)

In [37]:
X

In [38]:
y = df[['Labels']]

In [39]:
y

In [40]:
#discrete_features = X.dtypes == int

In [41]:
mi_score = mutual_info_classif(X,y)

In [42]:
mi_score = pd.Series(mi_score,name="Mi Score",index = X.columns)

In [43]:
mi_score = mi_score.sort_values(ascending=False)

In [None]:
mi_score

In [45]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of MI scores on the current figure.

    ``scores`` is a pandas Series; its index supplies the y-axis tick labels.
    Bars are ordered ascending, so the largest score ends up at the top.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions,ordered)
    plt.yticks(positions,ordered.index.tolist())
    plt.title("MI Scores")

In [46]:
# Open a 12x12 figure first: plot_mi_scores draws on the current figure
plt.figure(figsize=(12,12))
plot_mi_scores(mi_score)

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [48]:
def train_logistic(data,top_n,random_state=42,max_iter=1000):
    """Fit a logistic regression on the ``top_n`` highest-MI features and score it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and a 'Labels' target column.
    top_n : int
        Number of features to keep, taken from the notebook-level ``mi_score`` ranking.
    random_state : int or None, default 42
        Seed for the train/test split. A fixed seed gives every ``top_n`` the same
        rows in train and test, so score differences reflect the feature set and
        not the split. Pass ``None`` for an unseeded split.
    max_iter : int, default 1000
        Iteration cap for the solver. The library default of 100 can stop before
        convergence, and the resulting warning would be hidden by the notebook's
        global warnings filter.

    Returns
    -------
    tuple of float
        ``(precision, accuracy, recall, f1)`` measured on the 20 % test split.
    """
    top_n_features = mi_score.sort_values(ascending=False).head(top_n).index.tolist()

    X = data[top_n_features]
    y = data['Labels']

    X_train,X_test,y_train,y_test = train_test_split(
        X,y,test_size=0.20,shuffle=True,random_state=random_state
    )

    lr = LogisticRegression(max_iter=max_iter)
    lr.fit(X_train,y_train)

    y_pred = lr.predict(X_test)

    precision = precision_score(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)

    return precision,accuracy,recall,f1

In [49]:
log_scores = []

In [50]:
for i in range(20,51):
    
    precision,accuracy,recall,f1 = train_logistic(df,i)
    
    log_scores.append([i,precision,accuracy,recall,f1])

In [51]:
log_scores

In [52]:
# converting scores in DataFrame

In [53]:
score_log = pd.DataFrame(log_scores,columns=['top_features','precision','accuracy','recall','f1'])

In [54]:
score_log

In [55]:
# Creating a plot of performance score for better decision

In [56]:
font_dict = {
    'family': 'serif',  # Font family ('serif', 'sans-serif', 'monospace', etc.)
    'style': 'italic',  # Font style ('normal', 'italic', 'oblique')
    'weight': 'bold',   # Font weight ('normal', 'bold', 'heavy', 'light', etc.)
    'size': 20,         # Font size
}

In [57]:
sns.lineplot(data=score_log,x='top_features',y='precision',label='Precision Score')
sns.lineplot(data=score_log,x='top_features',y='recall',label='recall Score')
sns.lineplot(data=score_log,x='top_features',y='f1',label='fi Score')
sns.lineplot(data=score_log,x='top_features',y='accuracy',label='accuracy Score')

plt.grid(True,linestyle='--',alpha=0.6)
plt.xlabel('Top Features',fontdict=font_dict)
plt.ylabel('Scores',fontdict=font_dict)
plt.xticks(list(range(20,51)),fontsize=15)
plt.title('Performance Metrics of logistic Regression',fontdict=font_dict)
plt.figure(figsize=(5,5))
plt.show()

In [58]:
# Here we can see that selecting the top 27 or 28 features gives a good score on all performance metrics
# These scores were produced by the logistic regression model

In [59]:
# random Forest Classifier

In [60]:
def train_rfc(data,top_n,random_state=42,n_jobs=-1):
    """Fit a random forest on the ``top_n`` highest-MI features and score it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and a 'Labels' target column.
    top_n : int
        Number of features to keep, taken from the notebook-level ``mi_score`` ranking.
    random_state : int or None, default 42
        Seed used for both the train/test split and the forest, so that runs are
        repeatable and different ``top_n`` values are compared on the same split.
        Pass ``None`` for unseeded behaviour.
    n_jobs : int or None, default -1
        Number of parallel jobs for fitting/predicting; -1 uses all cores.

    Returns
    -------
    tuple of float
        ``(precision, accuracy, recall, f1)`` measured on the 20 % test split.
    """
    top_n_features = mi_score.sort_values(ascending=False).head(top_n).index.tolist()

    X = data[top_n_features]
    y = data['Labels']

    X_train,X_test,y_train,y_test = train_test_split(
        X,y,test_size=0.20,shuffle=True,random_state=random_state
    )

    rfc = RandomForestClassifier(
        n_estimators=200,
        criterion='entropy',
        max_depth=32,
        max_features=1.0,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    rfc.fit(X_train,y_train)

    y_pred = rfc.predict(X_test)

    precision = precision_score(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)

    return precision,accuracy,recall,f1

In [61]:
rfc_scores = []

In [None]:
for i in range(20,51):
    
    precision,accuracy,recall,f1 = train_rfc(df,i)
    print(f'{i} Precision:{precision} Accuracy:{accuracy} Recall:{recall} F1:{f1}')
    rfc_scores.append([precision,accuracy,recall,f1])

In [None]:
# converting rfc scores into DataFrame

In [None]:
score_rfc = pd.DataFrame(rfc_scores,columns=['top_features','precision','accuracy','recall','f1'])

In [None]:
score_rfc

In [None]:
# Creating a plot of performance score for better decision

In [None]:
sns.lineplot(data=score_log,x='top_features',y='precision',label='Precision Score')
sns.lineplot(data=score_log,x='top_features',y='recall',label='recall Score')
sns.lineplot(data=score_log,x='top_features',y='f1',label='fi Score')
sns.lineplot(data=score_log,x='top_features',y='accuracy',label='accuracy Score')

plt.grid(True,linestyle='--',alpha=0.6)
plt.xlabel('Top Features',fontdict=font_dict)
plt.ylabel('Scores',fontdict=font_dict)
plt.xticks(list(range(20,51)),fontsize=15)
plt.title('Performance Metrics',fontdict=font_dict)
plt.figure(figsize=(5,5))
plt.show()

In [None]:
# Here we can see that selecting the top 27 or 28 features gives a good score on all performance metrics
# These scores were produced by the Random Forest Classifier model