## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [32]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# importing Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    GradientBoostingClassifier,
    AdaBoostClassifier,
    RandomForestClassifier
)
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier

# importing eluvation matrecis
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

#### Import the CSV Data as Pandas DataFrame

In [6]:
df = pd.read_csv(r'..\Network_Data\phisingData.csv')

#### Show Top 5 Records

In [7]:
df.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


### dropping duplicates

In [16]:
df = df.drop_duplicates()

#### Preparing X and Y variables

In [17]:
X = df.drop(columns=['Result'],axis=1)
X.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,1,-1,-1,-1,-1,1,1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,1,-1,-1,0,-1,1,1,1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,1,-1,1,-1,1,0,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,1,-1,-1,1,-1,1,-1,1
4,1,0,-1,1,1,-1,1,1,-1,1,...,1,-1,1,-1,-1,0,-1,1,1,1


In [18]:
y = df['Result']
y=y.replace(-1,0)
y

0        0
1        0
2        0
3        0
4        1
        ..
11037    0
11045    1
11048    1
11049    0
11054    0
Name: Result, Length: 5849, dtype: int64

In [19]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [20]:
X_train.shape, X_test.shape

((4679, 30), (1170, 30))

#### Create an Evaluate Function to give all metrics after model Training

In [23]:
def evaluate_model(true, predicted):
    f1_scores = f1_score(true, predicted)
    recall = recall_score(true, predicted)
    precision = precision_score(true, predicted)
    confusion_mat = confusion_matrix(true, predicted)
    return f1_scores, recall, precision, confusion_mat

In [33]:
models = {
                "Random Forest": RandomForestClassifier(verbose=1),
                "Decision Tree": DecisionTreeClassifier(),
                "Gradient Boosting": GradientBoostingClassifier(verbose=1),
                "Logistic Regression": LogisticRegression(verbose=1),
                "AdaBoost": AdaBoostClassifier(),
                "KNeighbors Classifier": KNeighborsClassifier(),
                "Xgboost": XGBClassifier(),
                "xg_rf_classifier":XGBRFClassifier(),
                "catboost": CatBoostClassifier(),
            }


model_list = []
f1_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_f1 , model_train_recall, model_train_precision, model_train_matrix = evaluate_model(y_train, y_train_pred)

    model_test_f1 , model_test_recall, model_test_precision, model_test_matrix= evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- f1 score: {:.4f}".format(model_train_f1))
    print("- recall score: {:.4f}".format(model_train_recall))
    print("- precision Score: {:.4f}".format(model_train_precision))
    print("- confusion matrix :\n {}".format(model_train_matrix))

    print('----------------------------------')
    
    print("- f1 score: {:.4f}".format(model_test_f1))
    print("- recall score: {:.4f}".format(model_test_recall))
    print("- precision Score: {:.4f}".format(model_test_precision))
    print("- confusion matrix :\n {}".format(model_test_matrix))
    f1_list.append(model_test_f1)
    
    print('='*35)
    print('\n')

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


Random Forest
Model performance for Training set
- f1 score: 0.9908
- recall score: 0.9899
- precision Score: 0.9917
- confusion matrix :
 [[2380   19]
 [  23 2257]]
----------------------------------
- f1 score: 0.9485
- recall score: 0.9545
- precision Score: 0.9425
- confusion matrix :
 [[588  32]
 [ 25 525]]


Decision Tree
Model performance for Training set
- f1 score: 0.9907
- recall score: 0.9816
- precision Score: 1.0000
- confusion matrix :
 [[2399    0]
 [  42 2238]]
----------------------------------
- f1 score: 0.9228
- recall score: 0.9127
- precision Score: 0.9331
- confusion matrix :
 [[584  36]
 [ 48 502]]


      Iter       Train Loss   Remaining Time 
         1           1.2489            1.29s
         2           1.1370            1.32s
         3           1.0440            1.01s
         4           0.9659            1.07s
         5           0.8996            1.05s
         6           0.8432            0.87s
         7           0.7947            0.96s
       

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


AdaBoost
Model performance for Training set
- f1 score: 0.9310
- recall score: 0.9443
- precision Score: 0.9181
- confusion matrix :
 [[2207  192]
 [ 127 2153]]
----------------------------------
- f1 score: 0.9267
- recall score: 0.9309
- precision Score: 0.9225
- confusion matrix :
 [[577  43]
 [ 38 512]]


KNeighbors Classifier
Model performance for Training set
- f1 score: 0.9464
- recall score: 0.9404
- precision Score: 0.9525
- confusion matrix :
 [[2292  107]
 [ 136 2144]]
----------------------------------
- f1 score: 0.9094
- recall score: 0.9036
- precision Score: 0.9153
- confusion matrix :
 [[574  46]
 [ 53 497]]


Xgboost
Model performance for Training set
- f1 score: 0.9855
- recall score: 0.9855
- precision Score: 0.9855
- confusion matrix :
 [[2366   33]
 [  33 2247]]
----------------------------------
- f1 score: 0.9583
- recall score: 0.9600
- precision Score: 0.9565
- confusion matrix :
 [[596  24]
 [ 22 528]]


xg_rf_classifier
Model performance for Training set
- f

## Results

In [34]:
pd.DataFrame(list(zip(model_list, f1_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
6,Xgboost,0.958258
8,catboost,0.955656
0,Random Forest,0.948509
2,Gradient Boosting,0.941389
3,Logistic Regression,0.927484
4,AdaBoost,0.926697
1,Decision Tree,0.922794
5,KNeighbors Classifier,0.909424
7,xg_rf_classifier,0.908127


In [35]:
classi_model = XGBClassifier(fit_intercept=True)
classi_model = classi_model.fit(X_train, y_train)
y_pred = classi_model.predict(X_test)
score = f1_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 95.83
