In [66]:
#importing libraries
import pandas as pd
import numpy as np
import os 
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score


In [67]:
#loading in dataset
filename = os.path.join(os.getcwd(), "file.csv")
df = pd.read_csv(filename, header=0)

In [68]:
#viewing dataset
df.head(10)

Unnamed: 0.1,Unnamed: 0,url_length,content_length,mal,charsets,server,special_chars
0,0,16,-1.0,1,-1,-1,3
1,1,35,-1.0,0,-1,-1,5
2,2,31,-1.0,0,-1,-1,5
3,3,88,-1.0,1,-1,-1,18
4,4,235,-1.0,1,-1,-1,14
5,5,118,-1.0,0,-1,-1,24
6,6,45,-1.0,0,-1,-1,10
7,7,46,-1.0,0,-1,-1,6
8,8,44,-1.0,1,-1,-1,7
9,9,45,-1.0,0,-1,-1,7


In [69]:
#null values?
df.isnull().sum()

Unnamed: 0         0
url_length         0
content_length    71
mal                0
charsets           0
server            31
special_chars      0
dtype: int64

In [70]:
#duplicates?
df.duplicated().sum()

0

In [71]:
#data types
df.dtypes

Unnamed: 0          int64
url_length          int64
content_length    float64
mal                 int64
charsets           object
server             object
special_chars       int64
dtype: object

In [72]:
df.shape

(1800, 7)

In [73]:
#unique values
df.nunique()

Unnamed: 0        1800
url_length         177
content_length      50
mal                  2
charsets             8
server              25
special_chars       42
dtype: int64

In [74]:
#filling null values
df['content_length'] = df['content_length'].fillna(0)
df['server'] = df['server'].fillna(0)

In [75]:
#sanity check
df.isnull().sum()

Unnamed: 0        0
url_length        0
content_length    0
mal               0
charsets          0
server            0
special_chars     0
dtype: int64

In [76]:
#dropping columns and one hot encoding
#Try normailzation and tokenization for URL instead of just droppping(note to self)
df = pd.get_dummies(df, columns=['server'], drop_first=True)
df = pd.get_dummies(df, columns=['charsets'], drop_first=True)

In [77]:
#sanity check
df.dtypes

Unnamed: 0                         int64
url_length                         int64
content_length                   float64
mal                                int64
special_chars                      int64
server_-1                           bool
server_ATS                          bool
server_AmazonS3                     bool
server_Apache                       bool
server_Apache/2                     bool
server_Apache/2.4.52 (Ubuntu)       bool
server_Apache/2.4.56 (Debian)       bool
server_Apache/2.4.6                 bool
server_GSE                          bool
server_LiteSpeed                    bool
server_MerlinCDN                    bool
server_Microsoft-IIS/8.5            bool
server_Universe                     bool
server_cloudflare                   bool
server_ddos-guard                   bool
server_gogogadgeto-server           bool
server_gunicorn/0.17.2              bool
server_nginx                        bool
server_nginx/1.14.0 (Ubuntu)        bool
server_nginx/1.1

In [78]:
#creating X and y; now train test split
y = df['mal'] 
X = df.drop(columns = 'mal', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1234)

In [79]:
# Creating model
rf_model = RandomForestClassifier()

# Fitting the model 
rf_model.fit(X_train, y_train)

# Predicting values for test dataset
y_pred = rf_model.predict(X_test)

# Evaluating the model
rf_rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Mean squared error: %f" % rf_rmse)

rf_r2 = r2_score(y_test, y_pred)
print("R squared: %f" % rf_r2)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred)
print('F1 Score: %f' % f1)

# predicted probabilities
y_pred_proba = rf_model.predict_proba(X_test)

# only true values
y_pred_proba = y_pred_proba[:,1]

# Compute auc score
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print('AUC Score: %f' % auc)


report = classification_report(y_test, y_pred, zero_division=1)
print(report)

Mean squared error: 0.445138
R squared: 0.042680
Accuracy Score: 0.801852
Precision Score: 0.714286
Recall Score: 0.537975
F1 Score: 0.613718
AUC Score: 0.831500
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       382
           1       0.71      0.54      0.61       158

    accuracy                           0.80       540
   macro avg       0.77      0.72      0.74       540
weighted avg       0.79      0.80      0.79       540

