In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import metrics 
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
warnings.filterwarnings('ignore')

In [13]:
# Read the phishing-website feature table from disk and preview the first rows.
DATA_PATH = "datasets/phishing-website-detector/phishing.csv"

data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [14]:
# Drop the row-number column; it carries no predictive information.
# errors="ignore" keeps this cell idempotent: re-running it after "Index" has
# already been removed no longer raises KeyError.
data = data.drop(columns=["Index"], errors="ignore")

In [15]:
# Summary statistics, transposed so that each feature is shown as a row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
UsingIP,11054.0,0.313914,0.949495,-1.0,-1.0,1.0,1.0,1.0
LongURL,11054.0,-0.633345,0.765973,-1.0,-1.0,-1.0,-1.0,1.0
ShortURL,11054.0,0.738737,0.674024,-1.0,1.0,1.0,1.0,1.0
Symbol@,11054.0,0.700561,0.713625,-1.0,1.0,1.0,1.0,1.0
Redirecting//,11054.0,0.741632,0.670837,-1.0,1.0,1.0,1.0,1.0
PrefixSuffix-,11054.0,-0.734938,0.678165,-1.0,-1.0,-1.0,-1.0,1.0
SubDomains,11054.0,0.064049,0.817492,-1.0,-1.0,0.0,1.0,1.0
HTTPS,11054.0,0.25104,0.911856,-1.0,-1.0,1.0,1.0,1.0
DomainRegLen,11054.0,-0.336711,0.941651,-1.0,-1.0,-1.0,1.0,1.0
Favicon,11054.0,0.628551,0.777804,-1.0,1.0,1.0,1.0,1.0


In [16]:
# Separate the target column ("class") from the feature columns.
y = data["class"]
X = data.drop(columns=["class"])

In [17]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Show the shape of every part as a sanity check.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((8843, 30), (8843,), (2211, 30), (2211,))

In [18]:
# Baseline gradient-boosting model. random_state pins the random feature
# permutation used when choosing splits, so repeated runs of this cell
# produce the same trees and therefore the same scores.
gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7, random_state=42)

# fit the model
gbc.fit(X_train, y_train)

In [20]:
# Predict class labels for both splits with the fitted baseline classifier.
y_train_gbc, y_test_gbc = (gbc.predict(split) for split in (X_train, X_test))

In [22]:
# Accuracy of the baseline classifier on the training and the test split.
acc_train_gbc = metrics.accuracy_score(y_train, y_train_gbc)
acc_test_gbc = metrics.accuracy_score(y_test, y_test_gbc)

# Two report lines followed by one blank line, emitted in a single print call.
accuracy_lines = (
    "Gradient Boosting Classifier : Accuracy on training Data: {:.3f}".format(acc_train_gbc),
    "Gradient Boosting Classifier : Accuracy on test Data: {:.3f}".format(acc_test_gbc),
    "",
)
print("\n".join(accuracy_lines))

Gradient Boosting Classifier : Accuracy on training Data: 0.989
Gradient Boosting Classifier : Accuracy on test Data: 0.974



In [23]:
# Per-class precision, recall and F1 for the predictions on the test split.
test_report = metrics.classification_report(y_test, y_test_gbc)
print(test_report)

              precision    recall  f1-score   support

          -1       0.99      0.96      0.97       976
           1       0.97      0.99      0.98      1235

    accuracy                           0.97      2211
   macro avg       0.98      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211



## Hyperparameter Tuning

In [45]:
# Sweep the learning rate from 0.1 to 0.9 (max_depth fixed at 5) and record
# the train/test accuracy of each setting.
# NOTE(review): the settings are compared on the same test split that is used
# to report final performance, so the selected value is optimistically biased;
# consider cross-validation on the training split instead.
training_accuracy = []
test_accuracy = []
# `depth` holds the step multipliers 1..9 for the learning-rate sweep, not
# tree depths; the name is kept in case other cells reference it.
depth = range(1, 10)
for n in depth:
    # round() strips binary floating-point noise (3 * 0.1 == 0.30000000000000004)
    learning_rate = round(n * 0.1, 1)
    # random_state makes every fit, and therefore the comparison, repeatable
    forest_test = GradientBoostingClassifier(
        learning_rate=learning_rate, max_depth=5, random_state=42
    )
    forest_test.fit(X_train, y_train)
    # Score each split once and reuse the values below instead of re-predicting
    train_score = forest_test.score(X_train, y_train)
    test_score = forest_test.score(X_test, y_test)
    # record training set accuracy
    training_accuracy.append(train_score)
    # record generalization accuracy
    test_accuracy.append(test_score)
    print("Learning Rate {}, had a train accuracy of {}".format(learning_rate, train_score))
    print("Learning Rate {}, had a test accuracy of {}".format(learning_rate, test_score))
    print('Difference between test and train accuracies {}'.format(float(test_score) - float(train_score)))
    print('*' * 20)

Learning Rate 0.1, had a train accuracy of 0.9726337215876965
Learning Rate 0.1, had a test accuracy of 0.9638172772501131
Difference between test and train accuracies -0.008816444337583329
********************
Learning Rate 0.2, had a train accuracy of 0.9822458441705304
Learning Rate 0.2, had a test accuracy of 0.9665309814563546
Difference between test and train accuracies -0.01571486271417577
********************
Learning Rate 0.30000000000000004, had a train accuracy of 0.9883523691055072
Learning Rate 0.30000000000000004, had a test accuracy of 0.9728629579375848
Difference between test and train accuracies -0.015489411167922351
********************
Learning Rate 0.4, had a train accuracy of 0.989596290851521
Learning Rate 0.4, had a test accuracy of 0.9701492537313433
Difference between test and train accuracies -0.0194470371201777
********************
Learning Rate 0.5, had a train accuracy of 0.9907271288024426
Learning Rate 0.5, had a test accuracy of 0.9706015377657169
Diffe

## Hyperparameter Tuning results
#### These are the best hyperparameters for the GBC model:
    - max_depth: 5
    - learning_rate: 0.7

In [47]:
# Final model with the hyperparameters selected above. random_state pins the
# random feature permutation used during split search, so the model that gets
# saved to disk can be reproduced exactly by re-running the notebook.
model = GradientBoostingClassifier(max_depth=5, learning_rate=0.7, random_state=42)

In [49]:
# Train the final classifier on the training split.
model.fit(X=X_train, y=y_train)

## Save the model

In [51]:
import pickle
# save the model to disk
filename = 'gbc_model.sav'
# The context manager flushes and closes the file even if pickling fails;
# the previous bare open() left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Load the Model

In [53]:
# Read the pickled model back from disk; the context manager closes the file
# handle, which the previous bare open() never did.
# NOTE: pickle.load can execute arbitrary code, so only load files you created.
with open(filename, 'rb') as model_file:
    model_loaded = pickle.load(model_file)

In [56]:
import flask

In [None]:
#importing required libraries

from flask import Flask, request, render_template
import numpy as np
import pandas as pd
from sklearn import metrics 
import warnings
import joblib
import pickle
warnings.filterwarnings('ignore')
from feature import FeatureExtraction

# Load the pickled gradient-boosting classifier used to score submitted URLs.
# The context manager closes the file even when unpickling raises.
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# were produced by a trusted training run.
with open("gbc_model.sav", "rb") as model_file:
    gbc = pickle.load(model_file)


app = Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def index():
    """Render the URL form and, on POST, the classifier's estimate for the URL.

    A GET request, or a POST whose ``url`` field is missing or blank, renders
    ``index.html`` with ``xx=-1``. A POST with a URL extracts its features,
    scores them with the loaded classifier and renders ``index.html`` with
    ``xx`` set to the predicted probability of class ``1`` (rounded to two
    decimals) together with the submitted ``url``.
    """
    if request.method == "POST":
        # .get() avoids a 400 error on a missing field; blank input is treated
        # like a plain page load instead of being passed to feature extraction.
        url = request.form.get("url", "").strip()
        if not url:
            return render_template("index.html", xx=-1)

        obj = FeatureExtraction(url)
        # One sample row with the 30 features the model expects.
        x = np.array(obj.getFeaturesList()).reshape(1, 30)

        # Class labels: 1 is safe, -1 is unsafe.
        # predict_proba columns follow gbc.classes_, so look up the column of
        # class 1 instead of assuming its position, and predict only once.
        safe_column = list(gbc.classes_).index(1)
        y_pro_non_phishing = gbc.predict_proba(x)[0, safe_column]
        return render_template(
            "index.html", xx=round(float(y_pro_non_phishing), 2), url=url
        )
    return render_template("index.html", xx=-1)


if __name__ == "__main__":
    # Start Flask's built-in development server when run as a script.
    # NOTE(review): debug=True enables the interactive debugger, which allows
    # arbitrary code execution from the browser; disable it outside local
    # development.
    app.run(debug=True)