In [1]:
# Description of this notebook

# In this notebook we train well-known classification algorithms (this is a classification problem),
# evaluate them on held-out test data using several evaluation metrics, and save the best model in a pickle file.

In [2]:
# importing necessary modules for this notebook

import pandas as pd
from sklearn.preprocessing import scale #for scaling
from sklearn.model_selection import train_test_split # for train and test data split
from sklearn.linear_model import LogisticRegression # for Logistic Regression Classification model
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score,roc_curve # for evaluation of classification models
from sklearn.tree import DecisionTreeClassifier # for Decision Tree Classifier model
from sklearn.ensemble import RandomForestClassifier # for Random Forest Classifier model
from sklearn.tree import DecisionTreeClassifier # for Decision Tree Classifier model
from sklearn.neighbors import KNeighborsClassifier # for K Neighbor Classifier model
from sklearn.naive_bayes import GaussianNB # for Naive Bayes Classification model
from sklearn.svm import SVC # for svm Classification model
from sklearn.model_selection import GridSearchCV # for hyper parameter tuning using grid search method
from imblearn.over_sampling import SMOTE # for data balancing using smote method

# 2. Loading The Preprocessed Data

In [3]:
# Load the preprocessed dataset from "Preprocessed_Dataset.csv".
# The path is relative, so the CSV has to be in the notebook's working directory.

data=pd.read_csv("Preprocessed_Dataset.csv")

# Dataset Features

data.shape

(10000, 9)

In [4]:
data.head()

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_http,TinyURL,Prefix_Suffix,Ouput
0,0,0,1,1,0,0,0,0,0
1,0,0,1,1,1,0,0,0,0
2,0,0,1,1,0,0,0,0,0
3,0,0,1,3,0,0,0,0,0
4,0,0,1,3,0,0,0,0,0


# 3. X and y split

In [5]:
# Split the data into the independent variables (X) and the target variable (y).

# The target column is spelled "Ouput" (sic) in the dataset header; the lookup below
# must use that exact spelling.
y=data['Ouput']
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Ouput, dtype: int64

In [6]:
# Feature matrix: every column of `data` except the target column "Ouput".

X = data.drop(columns='Ouput')
X.head()

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_http,TinyURL,Prefix_Suffix
0,0,0,1,1,0,0,0,0
1,0,0,1,1,1,0,0,0
2,0,0,1,1,0,0,0,0
3,0,0,1,3,0,0,0,0
4,0,0,1,3,0,0,0,0


# 4. Scaling the X values

In [7]:
# The values in every column lie in a similar, small range, so there is no need for scaling

# 5. Test and Train Split

In [8]:
# Split the data into train and test sets in a 7:3 ratio respectively.
# random_state is fixed so the same rows land in each split on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [9]:
X_train.head()

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_http,TinyURL,Prefix_Suffix
7681,0,0,1,4,0,0,0,0
9031,0,0,0,0,0,0,0,1
3691,0,0,1,1,0,0,0,0
202,0,0,1,1,0,0,0,0
5625,0,1,1,6,0,0,0,0


In [10]:
X_test.head()

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_http,TinyURL,Prefix_Suffix
9394,0,0,1,5,0,0,0,0
898,0,0,1,6,0,0,0,0
2398,0,0,1,3,0,0,0,0
5906,0,0,1,3,0,0,0,0
2343,0,0,1,2,0,0,0,0


In [11]:
y_train.head()

7681    1
9031    1
3691    0
202     0
5625    1
Name: Ouput, dtype: int64

In [12]:
y_test.head()

9394    1
898     0
2398    0
5906    1
2343    0
Name: Ouput, dtype: int64

# 6. Balancing the data

In [14]:
y_train.value_counts()

# Balance the majority and minority class of X

1    3511
0    3489
Name: Ouput, dtype: int64

In [15]:
# Balancing the train data using the SMOTE method (synthetic minority over-sampling).
# Only the training split is resampled; X_test / y_test are left untouched so the
# evaluation is done on real, un-synthesised rows.
# random_state is fixed so that the synthetic samples -- and therefore every model
# fitted on them below -- are reproducible after "Restart Kernel -> Run All".

smote=SMOTE(random_state=0)

X_train_smote,y_train_smote=smote.fit_resample(X_train,y_train)

In [17]:
y_train_smote.value_counts()

1    3511
0    3511
Name: Ouput, dtype: int64

# 7. Training and Testing the model

In [18]:
# The algorithms used in this model are

#  1.logistic regression
#  2.decision tree model
#  3.Random forest model
#  4.KNN
#  5.Naive Bayes
#  6.svm


## 7.1. Logistic Regression Algorithm 

In [20]:
#initializing the model

lr=LogisticRegression()


In [21]:
# Hyper-parameter search for logistic regression (GridSearchCV with its default CV settings).
# NOTE(review): random_state is a seed, not a real hyper-parameter. Per the scikit-learn docs
# LogisticRegression only uses it for the sag / saga / liblinear solvers, so with the default
# solver all three candidates should fit the same model -- TODO confirm and tune e.g. C instead.

parameters = {"random_state": [0, 1, 42]}

gslr = GridSearchCV(estimator=lr, param_grid=parameters).fit(X_train_smote, y_train_smote)

# Mean cross-validated score of the best candidate.
gslr.best_score_

0.7917956828988858

In [22]:
gslr.best_params_ # finding the best parameters for this model

{'random_state': 0}

In [23]:
lr1=LogisticRegression(random_state=0)

In [24]:
# Fit on the SMOTE-balanced training set, then predict on that same set.
# These in-sample predictions are used for the training-accuracy figure in section 8.1.

lr1_train_predict = lr1.fit(X_train_smote, y_train_smote).predict(X_train_smote)
lr1_train_predict


array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
# Testing the model: predict on the held-out test split (X_test is not resampled by SMOTE).

lr1_test_predict=lr1.predict(X_test)
lr1_test_predict


array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

## 7.2. Decision Tree Classifier

In [26]:
#initializing the model

dt=DecisionTreeClassifier()

In [27]:
# Hyper-parameter search for the decision tree (GridSearchCV with its default CV settings).
# NOTE(review): per the scikit-learn docs "log_loss" is only accepted from release 1.1 and is
# equivalent to "entropy", so those two candidates look redundant -- TODO confirm.
# NOTE(review): random_state is a seed rather than a hyper-parameter; consider fixing it on the
# estimator and searching e.g. max_depth / min_samples_leaf instead.

parameters = {
    "criterion": ["gini", "entropy", "log_loss"],
    "random_state": [0, 1, 42],
}

gsdt = GridSearchCV(estimator=dt, param_grid=parameters).fit(X_train_smote, y_train_smote)

# Mean cross-validated score of the best parameter combination.
gsdt.best_score_

0.7970644117975079

In [28]:
gsdt.best_params_ # finding the best parameters for this model

{'criterion': 'gini', 'random_state': 0}

In [32]:
dt1=DecisionTreeClassifier(criterion='gini',random_state=0)

In [33]:
# Fit on the SMOTE-balanced training set, then predict on that same set.
# These in-sample predictions are used for the training-accuracy figure in section 8.1.

dt1_train_predict = dt1.fit(X_train_smote, y_train_smote).predict(X_train_smote)
dt1_train_predict


array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [34]:
# Testing the model: predict on the held-out test split (X_test is not resampled by SMOTE).

dt1_test_predict=dt1.predict(X_test)
dt1_test_predict


array([1, 0, 0, ..., 1, 0, 0], dtype=int64)

## 7.3. Random Forest Classifier

In [35]:
#initializing the model

rf=RandomForestClassifier()

In [36]:
# Hyper-parameter search for the random forest (GridSearchCV with its default CV settings).
# NOTE(review): per the scikit-learn docs "log_loss" is only accepted from release 1.1 and is
# equivalent to "entropy", so those two candidates look redundant -- TODO confirm.
# NOTE(review): random_state is a seed rather than a hyper-parameter; consider fixing it on the
# estimator and searching e.g. n_estimators / max_depth instead.

parameters = {
    "criterion": ["gini", "entropy", "log_loss"],
    "random_state": [0, 1, 42],
}

gsrf = GridSearchCV(estimator=rf, param_grid=parameters).fit(X_train_smote, y_train_smote)

# Mean cross-validated score of the best parameter combination.
gsrf.best_score_

0.7973491093064047

In [37]:
gsrf.best_params_ # finding the best parameters for this model

{'criterion': 'gini', 'random_state': 1}

In [40]:
rf1=RandomForestClassifier(criterion='gini',random_state=1)

In [41]:
# Fit on the SMOTE-balanced training set, then predict on that same set.
# These in-sample predictions are used for the training-accuracy figure in section 8.1.

rf1_train_predict = rf1.fit(X_train_smote, y_train_smote).predict(X_train_smote)
rf1_train_predict

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [42]:
# Testing the model: predict on the held-out test split (X_test is not resampled by SMOTE).

rf1_test_predict=rf1.predict(X_test)
rf1_test_predict


array([1, 0, 0, ..., 1, 0, 0], dtype=int64)

## 7.4 K Nearest Neighbors Classifier

In [43]:
#initializing the model

knn=KNeighborsClassifier()

In [81]:
# Hyper-parameter search for K nearest neighbours (GridSearchCV with its default CV settings).
# Only the neighbourhood size is searched.
# NOTE(review): if the selected value sits on the edge of this grid, widen the range and re-run.

parameters = {"n_neighbors": [5, 7, 9]}

gsknn = GridSearchCV(estimator=knn, param_grid=parameters).fit(X_train_smote, y_train_smote)

# Mean cross-validated score of the best candidate.
gsknn.best_score_

0.7644642151047845

In [82]:
gsknn.best_params_ # finding the best parameters for this model

{'n_neighbors': 9}

In [83]:
knn1=KNeighborsClassifier(n_neighbors=9)

In [84]:
# Fit on the SMOTE-balanced training set, then predict on that same set.
# These in-sample predictions are used for the training-accuracy figure in section 8.1.

knn1_train_predict = knn1.fit(X_train_smote, y_train_smote).predict(X_train_smote)
knn1_train_predict

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [85]:
# Testing the model: predict on the held-out test split (X_test is not resampled by SMOTE).

knn1_test_predict=knn1.predict(X_test)
knn1_test_predict

array([1, 1, 0, ..., 1, 0, 0], dtype=int64)

## 7.5. Naive Bayes Algorithm

In [49]:
#initializing the model

nb1=GaussianNB()

In [50]:
# Fit on the SMOTE-balanced training set, then predict on that same set.
# These in-sample predictions are used for the training-accuracy figure in section 8.1.
# (No grid search is run for Gaussian Naive Bayes; nb1 uses the default settings.)

nb1_train_predict = nb1.fit(X_train_smote, y_train_smote).predict(X_train_smote)
nb1_train_predict

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [51]:
# Testing the model: predict on the held-out test split (X_test is not resampled by SMOTE).

nb1_test_predict=nb1.predict(X_test)
nb1_test_predict

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

## 7.6. Support Vector Machine

In [52]:
#initializing the model

svc=SVC()

In [53]:
# Hyper-parameter search for the support vector classifier (GridSearchCV, default CV settings).
# NOTE(review): random_state is a seed rather than a hyper-parameter, and SVC presumably ignores
# it unless probability=True -- TODO confirm against the scikit-learn docs and consider
# searching C / gamma instead.

parameters = {
    "kernel": ['linear', 'rbf'],
    "random_state": [0, 1, 42],
}

gssvc = GridSearchCV(estimator=svc, param_grid=parameters).fit(X_train_smote, y_train_smote)

# Mean cross-validated score of the best parameter combination.
gssvc.best_score_

0.7916542466364531

In [54]:
gssvc.best_params_ # finding the best parameters for this model

{'kernel': 'linear', 'random_state': 0}

In [55]:
svc1=SVC(kernel='linear', random_state= 0)

In [56]:
# Fit on the SMOTE-balanced training set, then predict on that same set.
# These in-sample predictions are used for the training-accuracy figure in section 8.1.

svc1_train_predict = svc1.fit(X_train_smote, y_train_smote).predict(X_train_smote)
svc1_train_predict

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [57]:
# Testing the model: predict on the held-out test split (X_test is not resampled by SMOTE).

svc1_test_predict=svc1.predict(X_test)
svc1_test_predict

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

## 8. Evaluating the performance of the models

In [58]:
# The models are evaluated using
# 1. accuracy score
# 2. confusion matrix
# 3. classification report

## 8.1. Accuracy score

In [60]:
# logistic regression Algorithm: accuracy on the data it was fitted on vs. on held-out data

print("1. logistic regression Algorithm\n")
# On Training data (the SMOTE-resampled training set, i.e. in-sample predictions)
print("On Training data: ",accuracy_score(y_train_smote,lr1_train_predict))

# On Testing Data (the held-out test split)
print("On Testing Data: ",accuracy_score(y_test,lr1_test_predict))

1. logistic regression Algorithm

On Training data:  0.791085160922814
On Testing Data:  0.796


In [61]:
# Decision Tree Classifier: accuracy on the data it was fitted on vs. on held-out data

print("2. Decision Tree Classifier\n")
# On Training data (the SMOTE-resampled training set, i.e. in-sample predictions)
print("On Training data: ",accuracy_score(y_train_smote,dt1_train_predict))

# On Testing Data (the held-out test split)
print("On Testing Data: ",accuracy_score(y_test,dt1_test_predict))

2. Decision Tree Classifier

On Training data:  0.7990600968385075
On Testing Data:  0.804


In [62]:
# Random Forest Classifier: accuracy on the data it was fitted on vs. on held-out data

print("3. Random Forest Classifier\n")
# On Training data (the SMOTE-resampled training set, i.e. in-sample predictions)
print("On Training data: ",accuracy_score(y_train_smote,rf1_train_predict))

# On Testing Data (the held-out test split)
print("On Testing Data: ",accuracy_score(y_test,rf1_test_predict))

3. Random Forest Classifier

On Training data:  0.7990600968385075
On Testing Data:  0.8046666666666666


In [86]:
# K Nearest Neighbors: accuracy on the data it was fitted on vs. on held-out data

print("4. K Nearest Neighbors\n")
# On Training data (the SMOTE-resampled training set, i.e. in-sample predictions)
print("On Training data: ",accuracy_score(y_train_smote,knn1_train_predict))

# On Testing Data (the held-out test split)
print("On Testing Data: ",accuracy_score(y_test,knn1_test_predict))

4. K Nearest Neighbors

On Training data:  0.7937909427513529
On Testing Data:  0.7913333333333333


In [64]:
# Naive Bayes: accuracy on the data it was fitted on vs. on held-out data

print("5. Naive Bayes\n")
# On Training data (the SMOTE-resampled training set, i.e. in-sample predictions)
print("On Training data: ",accuracy_score(y_train_smote,nb1_train_predict))

# On Testing Data (the held-out test split)
print("On Testing Data: ",accuracy_score(y_test,nb1_test_predict))

5. Naive Bayes

On Training data:  0.7919396183423526
On Testing Data:  0.7926666666666666


In [65]:
# Support Vector Machine: accuracy on the data it was fitted on vs. on held-out data

print("6. Support Vector Machine\n")
# On Training data (the SMOTE-resampled training set, i.e. in-sample predictions)
print("On Training data: ",accuracy_score(y_train_smote,svc1_train_predict))

# On Testing Data (the held-out test split)
print("On Testing Data: ",accuracy_score(y_test,svc1_test_predict))

6. Support Vector Machine

On Training data:  0.7919396183423526
On Testing Data:  0.7926666666666666


In [66]:
# conclusion from Accuracy score

# For every model the training and testing accuracy differ by less than one percentage point,
# so there is no sign of overfitting or underfitting (all six models score roughly 79-80% on both sets)

## 8.2. confusion matrix

In [67]:
# logistic regression Algorithm
# Confusion matrix on the test split: rows are the true classes, columns the predicted classes (0, 1).

print("1. logistic regression Algorithm")
confusion_matrix(y_test,lr1_test_predict)

1. logistic regression Algorithm


array([[1454,   57],
       [ 555,  934]], dtype=int64)

In [68]:
# Decision Tree Classifier
# Confusion matrix on the test split: rows are the true classes, columns the predicted classes (0, 1).

print("2. Decision Tree Classifier")
confusion_matrix(y_test,dt1_test_predict)

2. Decision Tree Classifier


array([[1347,  164],
       [ 424, 1065]], dtype=int64)

In [69]:
# Random Forest Classifier
# Confusion matrix on the test split: rows are the true classes, columns the predicted classes (0, 1).

print("3. Random Forest Classifier")
confusion_matrix(y_test,rf1_test_predict)

3. Random Forest Classifier


array([[1347,  164],
       [ 422, 1067]], dtype=int64)

In [87]:
# K Nearest Neighbors
# Confusion matrix on the test split: rows are the true classes, columns the predicted classes (0, 1).

print("4. K Nearest Neighbors")
confusion_matrix(y_test,knn1_test_predict)

4. K Nearest Neighbors


array([[1244,  267],
       [ 359, 1130]], dtype=int64)

In [71]:
# Naive Bayes
# Confusion matrix on the test split: rows are the true classes, columns the predicted classes (0, 1).

print("5. Naive Bayes")
confusion_matrix(y_test,nb1_test_predict)

5. Naive Bayes


array([[1499,   12],
       [ 610,  879]], dtype=int64)

In [89]:
# Support Vector Machine
# Confusion matrix on the test split: rows are the true classes, columns the predicted classes (0, 1).

print("6. Support Vector Machine")
confusion_matrix(y_test,svc1_test_predict)

6. Support Vector Machine


array([[1499,   12],
       [ 610,  879]], dtype=int64)

In [90]:
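# conclusion from confusion matrix

# K Nearest neighbors algorithm is performing well for this dataset: it has the fewest false negatives
# (359 class-1 samples predicted as class 0) and the most balanced errors, at the cost of the most false positives (267)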

## 8.3. Classification Report

In [91]:
# logistic regression Algorithm
# Per-class precision, recall, f1-score and support on the test split.

print("1. logistic regression Algorithm")
print(classification_report(y_test,lr1_test_predict))

1. logistic regression Algorithm
              precision    recall  f1-score   support

           0       0.72      0.96      0.83      1511
           1       0.94      0.63      0.75      1489

    accuracy                           0.80      3000
   macro avg       0.83      0.79      0.79      3000
weighted avg       0.83      0.80      0.79      3000



In [92]:
# Decision Tree Classifier
# Per-class precision, recall, f1-score and support on the test split.

print("2. Decision Tree Classifier")
print(classification_report(y_test,dt1_test_predict))

2. Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.76      0.89      0.82      1511
           1       0.87      0.72      0.78      1489

    accuracy                           0.80      3000
   macro avg       0.81      0.80      0.80      3000
weighted avg       0.81      0.80      0.80      3000



In [93]:
# Random Forest Classifier
# Per-class precision, recall, f1-score and support on the test split.

print("3.  Random Forest Classifier")
print(classification_report(y_test,rf1_test_predict))

3.  Random Forest Classifier
              precision    recall  f1-score   support

           0       0.76      0.89      0.82      1511
           1       0.87      0.72      0.78      1489

    accuracy                           0.80      3000
   macro avg       0.81      0.80      0.80      3000
weighted avg       0.81      0.80      0.80      3000



In [94]:
# K Nearest Neighbors
# Per-class precision, recall, f1-score and support on the test split.

print("4. K Nearest Neighbors")
print(classification_report(y_test,knn1_test_predict))

4. K Nearest Neighbors
              precision    recall  f1-score   support

           0       0.78      0.82      0.80      1511
           1       0.81      0.76      0.78      1489

    accuracy                           0.79      3000
   macro avg       0.79      0.79      0.79      3000
weighted avg       0.79      0.79      0.79      3000



In [95]:
# Naive Bayes
# Per-class precision, recall, f1-score and support on the test split.

print("5. Naive Bayes")
print(classification_report(y_test,nb1_test_predict))

5. Naive Bayes
              precision    recall  f1-score   support

           0       0.71      0.99      0.83      1511
           1       0.99      0.59      0.74      1489

    accuracy                           0.79      3000
   macro avg       0.85      0.79      0.78      3000
weighted avg       0.85      0.79      0.78      3000



In [96]:
# Support Vector Machine
# Per-class precision, recall, f1-score and support on the test split.

print("6. Support Vector Machine")
print(classification_report(y_test,svc1_test_predict))

6. Support Vector Machine
              precision    recall  f1-score   support

           0       0.71      0.99      0.83      1511
           1       0.99      0.59      0.74      1489

    accuracy                           0.79      3000
   macro avg       0.85      0.79      0.78      3000
weighted avg       0.85      0.79      0.78      3000



In [97]:
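# conclusion from Classification Report

# K Nearest neighbors algorithm is performing well for this dataset: it has the highest recall for class 1 (0.76)
# and the most balanced precision/recall across the two classes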

In [98]:
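# conclusion from Evaluating the performance of above mentioned six models

# K Nearest neighbors algorithm is performing well for this dataset based on the confusion matrix and the
# classification report (fewest false negatives, highest class-1 recall).
# Note that on accuracy score alone Random Forest (0.8047) and Decision Tree (0.804) score slightly higher
# than K Nearest neighbors (0.7913).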

# 9. Selecting and saving the model

In [102]:
# From the performance of each model "K Nearest neighbors algorithm" gives the best performance
# so this algorithm is used to predict the result for our project
# NOTE(review): in the results above KNN has the lowest test accuracy of the six models (0.7913)
# but the highest class-1 recall; state explicitly which metric drives this choice.

# Saving the model

# NOTE(review): Parallel and delayed are not used in the visible cells, and these imports would
# conventionally live in the import cell at the top of the notebook.
from joblib import Parallel, delayed
import joblib

# Save the fitted knn1 estimator to 'Selected_Model.pkl' (path relative to the working directory).
# joblib files are pickle-based, so only load this file back from a trusted source.
joblib.dump(knn1, 'Selected_Model.pkl')

['Selected_Model.pkl']