# **Machine Learning for Phishing Detection**

## 1. Importing Necessary Data

This section imports the libraries needed for data exploration and model training, and loads the dataset.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Read the URL feature table from disk and preview its first rows
data = pd.read_csv("url.csv")
data.head(n=5)


Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,Label
0,confirmprofileaccount.com,0,0,0,0,0,1,1,0,1,1,1,1,1
1,marreme.com,0,0,0,2,0,1,0,0,1,1,1,1,1
2,modsecpaststudents.com,0,0,0,1,0,1,0,0,1,1,1,1,1
3,docs.google.com,0,0,1,5,0,1,0,0,1,1,1,1,1
4,oportunidadedasemana.com,0,0,1,1,1,1,0,0,1,1,1,1,1


## 2. Checking Data columns

In [None]:
# List the column labels of the loaded frame
data.columns

In [None]:
# Print pandas' concise summary of the frame (DataFrame.info)
data.info()


In [None]:
# One histogram per numeric column; `bins` is the number of equal-width bins per histogram
data.hist(figsize=(15, 15), bins=50)
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# `Domain` still holds raw strings at this point, and pandas >= 2.0 raises on
# non-numeric columns in DataFrame.corr(), so restrict the frame to numeric
# dtypes before correlating.
plt.figure(figsize=(15,13))
sns.heatmap(data.select_dtypes(include="number").corr())
plt.show()

## 3. Processing the Data

In [None]:
# Summary statistics of the columns
data.describe()

Columns that are not useful as model features (such as the raw `Domain` string) are dropped.

In [None]:
# `Domain` is the raw host name and is not used as a model feature, so remove it.
# NOTE(review): the preview header shows an "Unnamed: 0" column, which looks like a
# row index saved into the CSV — confirm. If so, it carries no URL information and,
# when the file is ordered by class, leaks the label, so it is dropped too.
# errors="ignore" keeps this cell safe to re-run once the columns are already gone.
data = data.drop(columns=["Domain", "Unnamed: 0"], errors="ignore")

In [None]:
# Count the missing values in every column
data.isna().sum()

In [None]:
# Shuffle the rows. The fixed seed (same value as the train/test split uses) makes a
# fresh Restart & Run All produce the same row order, and therefore the same scores.
data = data.sample(frac=1, random_state=12).reset_index(drop=True)
data.head()

## 4. Splitting the Data

In [None]:
# Features are every column except the target; the target is the 'Label' column
X = data.drop(columns='Label')
y = data['Label']
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)
X_train.shape, X_test.shape

## 5. Machine Learning Models and Techniques

Several machine learning models are trained and compared below. A URL is classified as legitimate if the model outputs 0 and as phishing if it outputs 1.




In [None]:
from sklearn.metrics import accuracy_score
# Holders for the model performance results; the three lists are index-aligned,
# i.e. entry i of each list belongs to the same model.
ML_Model = []
acc_train = []
acc_test = []

def storeResults(model, a, b):
  """Record a model's accuracies in the shared result lists.

  Args:
    model: Name of the model; used as the key that identifies its entry.
    a: Accuracy on the training data (stored rounded to 3 decimals).
    b: Accuracy on the test data (stored rounded to 3 decimals).

  If `model` has already been recorded, its scores are overwritten in place
  rather than appended again, so re-running a cell that stores results does
  not create duplicate rows in the comparison table.
  """
  train_score = round(a, 3)
  test_score = round(b, 3)
  if model in ML_Model:
    # Same model stored again: refresh its existing entry.
    position = ML_Model.index(model)
    acc_train[position] = train_score
    acc_test[position] = test_score
    return
  ML_Model.append(model)
  acc_train.append(train_score)
  acc_test.append(test_score)
  

## Decision Tree Classifier 

In [None]:
# Decision Tree model
from sklearn.tree import DecisionTreeClassifier

# Instantiate the model with depth capped at 5. random_state is fixed because
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits could otherwise be resolved differently from run to run.
tree = DecisionTreeClassifier(max_depth=5, random_state=12)
# Fit the model on the training split
tree.fit(X_train, y_train)

In [None]:
# Predict labels for both splits with the fitted decision tree
y_train_tree = tree.predict(X_train)
y_test_tree = tree.predict(X_test)

In [None]:
# Accuracy of the decision tree on each split
acc_train_tree = accuracy_score(y_train, y_train_tree)
acc_test_tree = accuracy_score(y_test, y_test_tree)

print(f"Decision Tree: Accuracy on training Data: {acc_train_tree:.3f}")
print(f"Decision Tree: Accuracy on test Data: {acc_test_tree:.3f}")

In [None]:
# Horizontal bar chart of the feature importances learned by the decision tree
n_features = X_train.shape[1]
plt.figure(figsize=(9, 7))
plt.barh(np.arange(n_features), tree.feature_importances_, align='center')
plt.yticks(np.arange(n_features), X_train.columns)
plt.ylabel("Feature")
plt.xlabel("Feature importance")
plt.show()

In [None]:
# Record the decision tree scores for the final comparison table.
# Keyword arguments make explicit which score is the training one and which the test one.
# Caution: avoid re-running this cell unnecessarily, so the model is not recorded twice.
storeResults(model='Decision Tree', a=acc_train_tree, b=acc_test_tree)

## XGBoost Classifier

In [None]:

# XGBoost classification model
from xgboost import XGBClassifier

# Build the classifier (learning rate 0.4, max depth 7) and train it on the training split
xgb = XGBClassifier(max_depth=7, learning_rate=0.4)
xgb.fit(X_train, y_train)

In [None]:

# Predict labels for both splits with the fitted XGBoost model
y_train_xgb = xgb.predict(X_train)
y_test_xgb = xgb.predict(X_test)

In [None]:
# Accuracy of the XGBoost model on each split
acc_train_xgb = accuracy_score(y_train, y_train_xgb)
acc_test_xgb = accuracy_score(y_test, y_test_xgb)

print(f"XGBoost: Accuracy on training Data: {acc_train_xgb:.3f}")
print(f"XGBoost : Accuracy on test Data: {acc_test_xgb:.3f}")

In [None]:
# Record the XGBoost scores for the final comparison table.
# Keyword arguments make explicit which score is the training one and which the test one.
# Caution: avoid re-running this cell unnecessarily, so the model is not recorded twice.
storeResults(model='XGBoost', a=acc_train_xgb, b=acc_test_xgb)

## Comparison of Models

In [None]:

# Collect the recorded scores into one comparison table (one row per stored model)
results = pd.DataFrame(
    {
        'ML Model': ML_Model,
        'Train Accuracy': acc_train,
        'Test Accuracy': acc_test,
    }
)
results

In [None]:
import pickle as pkl

# Persist the trained XGBoost model to disk. The `with` block closes the file
# handle as soon as the dump finishes, so the bytes are flushed and no open
# handle is left behind in the kernel.
with open("XGBoostClassifier.pickle.dat", "wb") as model_file:
    pkl.dump(xgb, model_file)