# <font style="color:#008fff;">Machine Learning Modeling</font>
<hr>

In [10]:
import pandas as pd
import numpy as np
import time
import os
import sklearn
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
import sys
import random
import pickle

# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# To make this notebook's output stable across runs, seed every global RNG in use:
# Python's `random` module and NumPy's legacy global generator (the one scikit-learn
# falls back to whenever an estimator is created with random_state=None).
random.seed(42)
np.random.seed(42)

## <font style="color:#008fff;">Reading in preprocessed dataset</font>

In [2]:
def loadDataset(file_name):
    """Read the CSV file at `file_name` and return its contents as a DataFrame.

    Parameters
    ----------
    file_name : path or file-like object accepted by `pd.read_csv`.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using `pd.read_csv` defaults.
    """
    return pd.read_csv(file_name)

# Time how long it takes to read the preprocessed training data from disk.
start_time = time.time()
df_train_preprocessed = loadDataset("Dataset/preprocessed_data.csv")
elapsed_seconds = time.time() - start_time
print("***Elapsed time to read csv files --- %s seconds ---***" % elapsed_seconds)

***Elapsed time to read csv files --- 0.04900360107421875 seconds ---***


In [3]:
# Preview the first ten rows of the preprocessed frame.
df_train_preprocessed.iloc[:10]

Unnamed: 0,url_vect,is_gov_tld,who_is,https,profanity_score_prob,url_len_scaled,js_len_scaled,js_obf_len_scaled,label
0,0.044,0,0,0,0.90178,0.152542,0.842993,0.663632,1
1,0.005,0,1,1,0.001813,0.59322,0.097764,0.0,0
2,0.033,0,0,0,0.965517,0.864407,0.727081,0.471829,1
3,0.046,0,1,1,0.049674,0.271186,0.038052,0.0,0
4,0.046,0,0,0,0.955261,0.525424,0.600632,0.421721,1
5,0.017,0,1,1,0.006906,0.745763,0.155719,0.0,0
6,0.046,0,0,0,0.925228,0.186441,0.492097,0.240813,1
7,0.524,0,0,0,0.989416,0.237288,0.393045,0.0,1
8,0.046,0,0,0,0.896943,0.254237,0.71549,0.319687,1
9,0.046,0,0,1,0.007933,0.271186,0.128205,0.0,0


### Split our dataset into X_train and y_train

In [9]:
# Everything except the `label` column is a descriptive feature; `label` is the target.
X_train = df_train_preprocessed.drop(columns='label')
y_train = df_train_preprocessed.loc[:, 'label']

In [25]:
# Sanity check: the feature matrix and the target vector must have the same number of rows.
(X_train.shape, y_train.shape)

((54506, 8), (54506,))

## <font style="color:#008fff;">Feature Selection</font>

### Feature selection using Sklearn's chi-squared:

In [28]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the 5 descriptive features with the highest chi-squared score against the label.
chi2_features = SelectKBest(score_func=chi2, k=5)
chi2_features.fit(X_train, y_train)
X_kbest_features = chi2_features.transform(X_train)

In [32]:
# Report how many features exist before and after the chi-squared selection.
print(
    f'The original preprocessed dataset has {X_train.shape[1]} features',
    f'After feature selection using chi-squared, we decide to use the top {X_kbest_features.shape[1]} features',
    sep='\n',
)

The original preprocessed dataset has 8 features
After feature selection using chi-squared, we decide to use the top 5 features


In [45]:
# Positional indices of the columns the selector kept, mapped back to their column names.
selected_features = chi2_features.get_support(indices=True)
selected_columns = X_train.columns[selected_features]
print(f'The top 5 features selected: {list(selected_columns)}')

The top 5 features selected: ['who_is', 'https', 'profanity_score_prob', 'js_len_scaled', 'js_obf_len_scaled']


**Other potential feature selection methods to experiment with in the future: Mean Absolute Difference, Fisher Score, or a different method implemented from scratch.**

## <font style="color:#008fff;">ML Modeling: K-Nearest Neighbors</font>

### Building KNN Model for the FULL feature set (X_train):

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier trained on every preprocessed feature.
# Plain NumPy arrays are passed, so the fitted model carries no column names.
knn_full = KNeighborsClassifier(n_neighbors=3)
knn_full.fit(X_train.to_numpy(), y_train.to_numpy())

In [14]:
# Save the model to disk
knn_full_filename = 'Models/knn_full_features.sav'
# Create the output directory on first run so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(knn_full_filename), exist_ok=True)
# The context manager flushes and closes the file even if pickling raises.
with open(knn_full_filename, 'wb') as model_file:
    pickle.dump(knn_full, model_file)

In [16]:
# Load the model from disk.
# NOTE: pickle.load can execute arbitrary code; only unpickle files produced by this notebook.
# The context manager closes the file handle as soon as the model is read.
with open(knn_full_filename, 'rb') as model_file:
    knn_full = pickle.load(model_file)

### Building KNN Model for the feature set after FEATURE SELECTION (X_kbest_features):

In [50]:
# 3-nearest-neighbours classifier trained only on the chi-squared-selected columns.
knn_reduced = KNeighborsClassifier(n_neighbors=3)
knn_reduced.fit(X_kbest_features, y_train.to_numpy())

In [51]:
# Save the model to disk
knn_reduced_filename = 'Models/knn_reduced_features.sav'
# Create the output directory on first run so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(knn_reduced_filename), exist_ok=True)
# The context manager flushes and closes the file even if pickling raises.
with open(knn_reduced_filename, 'wb') as model_file:
    pickle.dump(knn_reduced, model_file)

In [52]:
# Load the reduced-feature model from disk.
# Read from knn_reduced_filename: loading knn_full_filename here would silently replace
# the reduced-feature model with the one trained on the full feature set.
# NOTE: pickle.load can execute arbitrary code; only unpickle files produced by this notebook.
with open(knn_reduced_filename, 'rb') as model_file:
    knn_reduced = pickle.load(model_file)

## <font style="color:#008fff;">(Gaussian) Naive Bayes</font>

### Building Naive Bayes Model for the FULL feature set:

In [54]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, trained on every preprocessed feature.
gnb_full = GaussianNB()
gnb_full.fit(X_train.to_numpy(), y_train.to_numpy())

In [55]:
# Save the model to disk
gnb_full_filename = 'Models/gnb_full_features.sav'
# Create the output directory on first run so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(gnb_full_filename), exist_ok=True)
# The context manager flushes and closes the file even if pickling raises.
with open(gnb_full_filename, 'wb') as model_file:
    pickle.dump(gnb_full, model_file)

In [56]:
# Load the model from disk.
# NOTE: pickle.load can execute arbitrary code; only unpickle files produced by this notebook.
# The context manager closes the file handle as soon as the model is read.
with open(gnb_full_filename, 'rb') as model_file:
    gnb_full = pickle.load(model_file)