### Mount Google Drive and import necessary libraries

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from pprint import pprint
from sklearn.naive_bayes import CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

##Data Preprocessing

In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/dataset_updt.csv')
dataset.head(5)

Unnamed: 0,slno,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,port,HTTPS_token,Request_URL,URL_of_Anchor,Links_in_tags,SFH,Submitting_to_email,Abnormal_URL,Redirect,on_mouseover,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,-1,1,-1,1,-1,-1,-1,0,1,1,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,1,1,-1,1,0,-1,-1,1,1,0,1,1,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,1,1,-1,1,0,-1,-1,-1,-1,0,1,1,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,1,1,-1,-1,0,0,-1,1,1,0,1,1,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,1,1,1,1,0,0,-1,1,1,0,-1,1,-1,1,-1,-1,0,-1,1,1,1,1


In [None]:
dataset['Result'].value_counts()

 1    6157
-1    4898
Name: Result, dtype: int64

In [None]:
reduced_df = dataset[['having_IP_Address', 'URL_Length', 'Shortining_Service',
       'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix',
       'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length',
       'Favicon', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor',
       'Links_in_tags', 'SFH', 'Submitting_to_email', 'Redirect', 'on_mouseover', 'RightClick', 'Iframe',
       'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank',
       'Statistical_report', 'Result']]

In [None]:
def convertEncodingToPositive(dataframe):

  mapping = {-1: 2, 0: 0, 1: 1}

  col_map = {}

  for col in dataframe:
    col_map[col] = mapping

  for i in range(dataframe.shape[0]):
    for j in range(dataframe.shape[1]):
      dataframe.loc[i][j] = mapping[dataframe.loc[i][j]]

In [None]:
convertEncodingToPositive(reduced_df)

In [None]:
print(reduced_df.head())

   having_IP_Address  URL_Length  ...  Statistical_report  Result
0                  2           1  ...                   2       2
1                  1           1  ...                   1       2
2                  1           0  ...                   2       2
3                  1           0  ...                   1       2
4                  1           0  ...                   1       1

[5 rows x 26 columns]


In [None]:
X_train = reduced_df[['having_IP_Address', 'URL_Length', 'Shortining_Service',
       'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix',
       'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length',
       'Favicon', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor',
       'Links_in_tags', 'SFH', 'Submitting_to_email', 'Redirect', 'on_mouseover', 'RightClick', 'Iframe',
       'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank',
       'Statistical_report']]
y_train = reduced_df['Result']
X_train,X_test,y_train,y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=4, stratify= y_train)

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)

In [None]:
X_train_red = X_train.to_numpy()
X_test_red = X_test.to_numpy()
y_train_red = y_train.to_numpy()
y_test_red = y_test.to_numpy()

In [None]:

for train, test in kf.split(X_train_red):
  print(X_train_red[train].shape, y_train_red[train].shape, X_train_red[test].shape, y_train_red[test].shape)

(7075, 25) (7075,) (1769, 25) (1769,)
(7075, 25) (7075,) (1769, 25) (1769,)
(7075, 25) (7075,) (1769, 25) (1769,)
(7075, 25) (7075,) (1769, 25) (1769,)
(7076, 25) (7076,) (1768, 25) (1768,)


In [None]:
import pickle
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
X_train_red_onehot = encoder.fit_transform(X_train_red)
X_test_red_onehot = encoder.transform(X_test_red)
pickle.dump(encoder, open("One_Hot_Encoder", 'wb'))

##Model Training


In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

for train,test in kf.split(X_test_red_onehot):
  model.fit(X_train_red_onehot[train], y_train_red[train])
  print(metrics.classification_report(y_test_red[test], model.predict(X_test_red_onehot[test])))
  print("##################################################################################")


              precision    recall  f1-score   support

           1       0.95      0.95      0.95       259
           2       0.92      0.93      0.93       184

    accuracy                           0.94       443
   macro avg       0.94      0.94      0.94       443
weighted avg       0.94      0.94      0.94       443

##################################################################################
              precision    recall  f1-score   support

           1       0.90      0.94      0.92       228
           2       0.94      0.89      0.91       214

    accuracy                           0.92       442
   macro avg       0.92      0.92      0.92       442
weighted avg       0.92      0.92      0.92       442

##################################################################################
              precision    recall  f1-score   support

           1       0.93      0.95      0.94       255
           2       0.92      0.90      0.91       187

    accuracy    

In [None]:
#Random Forest with Grid searchCv
model_rf = RandomForestClassifier(max_features= 'log2' ,n_estimators=200)
for train,test in kf.split(X_test_red):
  model_rf.fit(X_train_red_onehot[train], y_train_red[train])
  print(metrics.classification_report(y_test_red[test], model_rf.predict(X_test_red_onehot[test])))
  print("##################################################################################")


              precision    recall  f1-score   support

           1       0.96      0.97      0.96       259
           2       0.96      0.94      0.95       184

    accuracy                           0.96       443
   macro avg       0.96      0.95      0.96       443
weighted avg       0.96      0.96      0.96       443

##################################################################################
              precision    recall  f1-score   support

           1       0.94      0.96      0.95       228
           2       0.95      0.93      0.94       214

    accuracy                           0.95       442
   macro avg       0.95      0.95      0.95       442
weighted avg       0.95      0.95      0.95       442

##################################################################################
              precision    recall  f1-score   support

           1       0.95      0.96      0.96       255
           2       0.95      0.94      0.94       187

    accuracy    

In [None]:
#XGBoost


xgb_model = xgb.XGBClassifier(silent=False, 
                        scale_pos_weight=1,
                        learning_rate=0.01,  
                        colsample_bytree = 0.4,
                        subsample = 0.8,
                        objective='binary:logistic', 
                        n_estimators=1000, 
                        reg_alpha = 0.3,
                        max_depth=4, 
                        gamma=10)

for train,test in kf.split(X_test_red):
  xgb_model.fit(X_train_red_onehot[train], y_train_red[train])
  print(metrics.classification_report(y_test_red[test], xgb_model.predict(X_test_red_onehot[test])))
  print("##################################################################################")


# parameters = {'nthread':[4],
#               'objective':['binary:logistic'],
#               'learning_rate': [0.05], 
#               'max_depth': [6,7,8],
#               'min_child_weight': [11],
#               'silent': [1],
#               'subsample': [0.8],
#               'colsample_bytree': [0.7],
#               'n_estimators': [5],
#               'missing':[-999],
#               'seed': [1337]}


# clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
#                    cv=StratifiedKFold(n_splits=5, shuffle=True), 
#                    scoring='roc_auc',
#                    verbose=2, refit=True)

# clf.fit(X_train_red_onehot, y_train_red)

              precision    recall  f1-score   support

           1       0.95      0.91      0.93       259
           2       0.88      0.93      0.91       184

    accuracy                           0.92       443
   macro avg       0.92      0.92      0.92       443
weighted avg       0.92      0.92      0.92       443

##################################################################################
              precision    recall  f1-score   support

           1       0.88      0.93      0.91       228
           2       0.93      0.87      0.90       214

    accuracy                           0.90       442
   macro avg       0.90      0.90      0.90       442
weighted avg       0.90      0.90      0.90       442

##################################################################################
              precision    recall  f1-score   support

           1       0.95      0.93      0.94       255
           2       0.91      0.93      0.92       187

    accuracy    

In [None]:
#SVM classifier

model_svm = SVC(C=10, gamma=0.1, kernel='rbf', probability=True)

for train,test in kf.split(X_test_red):
  model_svm.fit(X_train_red_onehot[train], y_train_red[train])
  print(metrics.classification_report(y_test_red[test], model_svm.predict(X_test_red_onehot[test])))
  print("##################################################################################")

              precision    recall  f1-score   support

           1       0.96      0.96      0.96       259
           2       0.94      0.95      0.94       184

    accuracy                           0.95       443
   macro avg       0.95      0.95      0.95       443
weighted avg       0.95      0.95      0.95       443

##################################################################################
              precision    recall  f1-score   support

           1       0.94      0.96      0.95       228
           2       0.96      0.93      0.95       214

    accuracy                           0.95       442
   macro avg       0.95      0.95      0.95       442
weighted avg       0.95      0.95      0.95       442

##################################################################################
              precision    recall  f1-score   support

           1       0.95      0.96      0.96       255
           2       0.95      0.93      0.94       187

    accuracy    

In [None]:
#Saving SVM Model 
svm_clf = SVC(kernel='rbf', gamma = 0.1, C = 10, probability=True)
svm_clf = svm_clf.fit(X_train_red_onehot, y_train_red)
tp, fn, fp, tn = metrics.confusion_matrix(y_test_red, svm_clf.predict(X_test_red_onehot)).ravel()
accuracy = (tn+tp)/(fp+fn+tp+tn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
fpr = fp/(fp+tn)
f1 = 2*precision*recall/(precision + recall)
pickle.dump(svm_clf, open("SVM_Final_Model", 'wb'))
print(accuracy)



0.9633649932157394


In [None]:
filename_dt = 'SVM_Final_Model'
loaded_model = pickle.load(open(filename_dt, 'rb'))
tp, fn, fp, tn = metrics.confusion_matrix(y_test_red, loaded_model.predict(X_test_red_onehot)).ravel()
accuracy = (tn+tp)/(fp+fn+tp+tn)
recall = tp/(tp+fn)
fpr = fp/(fp+tn)
print(accuracy)
print(recall)
print(fpr)

0.9633649932157394
0.9756295694557271
0.05204081632653061
