In [1]:
import numpy as np
import pandas as pd

In [2]:
#loading the dataset
df = pd.read_csv("Phising_Training_Dataset.csv")
df.head()

Unnamed: 0,key,having_IP,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,12344,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,12345,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,12346,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,12347,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,12348,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [3]:
#loading the test dataset
df_test = pd.read_csv("Phising_Testing_Dataset.csv")
df_test.head()

Unnamed: 0,key,having_IP,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
0,21338,1,1,1,1,1,1,-1,1,-1,...,1,1,1,1,1,1,1,-1,0,1
1,21339,1,-1,1,1,1,-1,0,-1,1,...,1,1,1,-1,-1,0,-1,-1,0,1
2,21340,1,-1,1,1,1,-1,0,0,-1,...,-1,-1,-1,1,-1,1,-1,1,1,-1
3,21341,-1,-1,-1,1,-1,-1,-1,-1,1,...,1,-1,-1,1,-1,1,-1,1,1,-1
4,21342,1,-1,1,1,1,-1,1,1,-1,...,1,1,1,1,1,1,-1,1,0,1


### Performing Exploratory Data Analysis

In [4]:
df.shape

(8955, 32)

In [5]:
#1 represent the legitimate and -1 represents phishing
df.Result.unique()

array([-1,  1], dtype=int64)

In [6]:
df_test.shape

(2100, 31)

In [7]:
df.sum().isna()

key                            False
having_IP                      False
URL_Length                     False
Shortining_Service             False
having_At_Symbol               False
double_slash_redirecting       False
Prefix_Suffix                  False
having_Sub_Domain              False
SSLfinal_State                 False
Domain_registeration_length    False
Favicon                        False
port                           False
HTTPS_token                    False
Request_URL                    False
URL_of_Anchor                  False
Links_in_tags                  False
SFH                            False
Submitting_to_email            False
Abnormal_URL                   False
Redirect                       False
on_mouseover                   False
RightClick                     False
popUpWidnow                    False
Iframe                         False
age_of_domain                  False
DNSRecord                      False
web_traffic                    False
P

In [8]:
#getting the features and target column
X = df.drop('Result', axis = 1)
y = df.Result
X.shape

(8955, 31)

In [9]:
#dividing the dataset into training and test for model building and evaluation
from sklearn.model_selection import train_test_split

In [10]:
#dataset is divided in the ratio 70:30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
X_train.shape

(6268, 31)

## Machine Learning Models & Training

From the dataset above, it is clear that this is a supervised machine learning task. There are two major types of supervised machine learning problems, called classification and regression.

This data set comes under classification problem, as the input URL is classified as phishing (-1) or legitimate (1). The supervised machine learning models (classification) considered to train the dataset in this notebook are:

-Decision Tree

-Random Forest

-Logistic Regression

-Support Vector Machines

In [11]:
#training using different machine learning models
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [12]:
svm_model = svm.SVC(gamma = 'auto', C =1, kernel = 'linear')
svm_model.fit(X_train, y_train)
print("SVM: Accuracy on training Data: {:.3f}".format(svm_model.score(X_train, y_train)))
print("SVM : Accuracy on test Data: {:.3f}".format(svm_model.score(X_test, y_test)))

SVM: Accuracy on training Data: 0.916
SVM : Accuracy on test Data: 0.912


In [13]:
rf_model= RandomForestClassifier(n_estimators = 5)
rf_model.fit(X_train, y_train)
print("Random forest: Accuracy on training Data: {:.3f}".format(rf_model.score(X_train, y_train)))
print("Random forest: Accuracy on test Data: {:.3f}".format(rf_model.score(X_test, y_test)))

Random forest: Accuracy on training Data: 0.995
Random forest: Accuracy on test Data: 0.956


In [14]:
log_model = LogisticRegression(solver='liblinear', multi_class='auto', C = 1)
log_model.fit(X_train, y_train)
print("Logistic Regression: Accuracy on training Data: {:.3f}".format(log_model.score(X_train, y_train)))
print("Logistic Regression: Accuracy on test Data: {:.3f}".format(log_model.score(X_test, y_test)))

Logistic Regression: Accuracy on training Data: 0.929
Logistic Regression: Accuracy on test Data: 0.926


In [15]:
dt_model = DecisionTreeClassifier(max_depth = 5)
dt_model.fit(X_train, y_train)
print("Decision Tree: Accuracy on training Data: {:.3f}".format(dt_model.score(X_train, y_train)))
print("Decision Tree: Accuracy on test Data: {:.3f}".format(dt_model.score(X_test, y_test)))

Decision Tree: Accuracy on training Data: 0.927
Decision Tree: Accuracy on test Data: 0.923


#### Best accuracy was given by random forest classifier with 99% Accuracy on training Data and Accuracy 95% on test Data

In [20]:
y_predicted = rf_model.predict(df_test)
print(y_predicted)
output = pd.DataFrame(y_predicted)

[ 1 -1 -1 ... -1  1  1]


In [17]:
keys = df_test['key']
keys.head()

0    21338
1    21339
2    21340
3    21341
4    21342
Name: key, dtype: int64

In [18]:
df3 = pd.concat([keys, output],ignore_index = True, axis = 1)
df3.columns = ['key', 'Result']
df3.head()

Unnamed: 0,key,Result
0,21338,1
1,21339,-1
2,21340,-1
3,21341,-1
4,21342,1


#### storing the result in submission.csv file

In [19]:
df3.to_csv("submission.csv",index = False)