In [None]:
import numpy as np
import pandas as pd
import math
import seaborn as sns
import plotly.express as px
import scipy.stats
from matplotlib import pyplot as plt
from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D

import pickle

# Torch
import torch
from torchvision import datasets, transforms
from torch.utils.data import Dataset

#Sklearn
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

import sys
sys.path.append('../../src')
from score_model import *

Settings

In [None]:
def _three_decimals(value):
    """Render a float with exactly three decimal places for DataFrame display."""
    return '%.3f' % value


# Pandas display configuration: 3-decimal floats, every column shown,
# at most 20 rows printed per frame.
_DISPLAY_OPTIONS = {
    'display.float_format': _three_decimals,
    'display.max_columns': None,
    'display.max_rows': 20,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

# Seed NumPy's global RNG so the shuffle-based train/validation split is reproducible.
np.random.seed(1)

### Loading dataset

In [None]:
# Forward slashes keep the relative path portable: the previous '..\data\external\...'
# spelling relied on '\d' and '\e' being *invalid* escape sequences (a SyntaxWarning on
# recent Python versions) and could only resolve on Windows.
# NOTE(review): decimal=',' tells pandas that ',' is the decimal mark. A later cell has to
# convert object-typed columns to float, which suggests the file may actually use '.' as
# the decimal separator -- confirm against the CSV before changing it.
ds_original = pd.read_csv('../data/external/dataset_phishing.csv', header=0, delimiter=',', decimal=',')

print("Original dataset dimensions:", ds_original.shape)

We will try to predict the status of a URL (phishing or legitimate).

In [None]:
# Target vector: the "status" column. Feature matrix: every remaining column.
y = ds_original["status"]
x = ds_original.drop(columns=["status"])

n_samples, n_features = x.shape
print("Number of samples:", n_samples)
print("Number of features:", n_features)

## Data preprocessing

### - Data cleaning

Searching for empty or unknown values in the dataset

In [None]:
# Total count of missing cells, reported through both pandas spellings.
total_null = x.isnull().sum().sum()
total_na = x.isna().sum().sum()
print("Null values:", total_null)
print("NA values:", total_na)

# Per-column count of the -1 placeholder, keeping only the columns where it occurs.
minus_one_counts = (x == -1).sum()
print("\nColumns with values equal to -1:\n", minus_one_counts[minus_one_counts != 0])

Replacing unknown values with the mean of the known values

In [None]:
# Replace the -1 "unknown" placeholder in the two domain columns with the mean of the
# known (non -1) values of the same column.

# Cast to float first: the mean is generally fractional, and writing a float into an
# integer column through .loc is deprecated silent upcasting in recent pandas versions.
x = x.astype({"domain_age": float, "domain_registration_length": float})

# --- domain_age ---
domain_age_is_known = x["domain_age"] != -1
known_domain_age = x.loc[domain_age_is_known, "domain_age"]
mean_domain_age = known_domain_age.sum() / len(known_domain_age)
x.loc[~domain_age_is_known, "domain_age"] = mean_domain_age

# --- domain_registration_length ---
registration_length_is_known = x["domain_registration_length"] != -1
known_domain_registration_length = x.loc[registration_length_is_known, "domain_registration_length"]
mean_domain_registration_length = (
    known_domain_registration_length.sum() / len(known_domain_registration_length)
)
x.loc[~registration_length_is_known, "domain_registration_length"] = mean_domain_registration_length

In [None]:
# Sanity check: both counts should be 0 once the -1 placeholders have been imputed.
x[["domain_age", "domain_registration_length"]].eq(-1).sum()

The "url" feature is not useful, since all the information that can be extracted from it is already captured in the rest of the columns.

In [None]:
# errors="ignore" keeps the cell idempotent: re-running it after "url" has already been
# removed no longer raises a KeyError.
x = x.drop(columns="url", errors="ignore")

### - Data conversion

Since we are trying to predict the status ($Y$ vector), we need to convert the _Phishing_ and _Legitimate_ labels to $1$ and $0$.

In [None]:
# Always take the string labels from the untouched source frame so the cell can be re-run:
# copying them from `y` would capture the already-encoded 0/1 values on a second run.
status_labels = ds_original["status"].copy() # "string" labels
print(status_labels.describe())

# Explicit mapping phishing -> 1, legitimate -> 0. `map` yields a clean integer Series and
# avoids the silent downcasting that `replace` performs on object columns (deprecated in
# recent pandas versions).
y = status_labels.map({"phishing": 1, "legitimate": 0})

# Labels outside the mapping become NaN; fail here rather than inside a later model fit.
if y.isna().any():
    unexpected_labels = sorted(set(status_labels[y.isna()]))
    raise ValueError(f"Unexpected status labels: {unexpected_labels}")
y = y.astype(int)

print("\nY content:", set(y))
print("\nY shape:", y.shape)
print("\nX shape:", x.shape)

The $X$ columns that contain floating-point numbers are stored with the generic _object_ dtype, so they need to be converted to float.

In [None]:
# Boolean mask (indexed by column name) marking the columns stored with object dtype.
type_of_cols = x.dtypes == object

# Select those columns by name. The previous loop indexed the label-indexed mask with
# integer positions (type_of_cols[col]), which pandas has deprecated.
object_columns = list(x.columns[type_of_cols.to_numpy()])

# Convert all of them to float in one pass (a no-op when there are none).
x = x.astype({column: 'float' for column in object_columns})

### - Data normalization

First we have to determine which features are categorical.

In [None]:
# Features treated as categorical; they are excluded from standardization below.
# The order is significant: it fixes the column order of the categorical sub-frame.
categorical_features = [
    "ip", "http_in_path", "https_token", "punycode", "port",
    "tld_in_path", "tld_in_subdomain", "abnormal_subdomain", "prefix_suffix", "random_domain",
    "shortening_service", "path_extension", "domain_in_brand", "brand_in_subdomain", "brand_in_path",
    "suspecious_tld", "login_form", "external_favicon", "submit_email", "sfh",
    "iframe", "popup_window", "onmouseover", "right_clic", "empty_title",
    "domain_in_title", "domain_with_copyright", "whois_registered_domain", "dns_record", "google_index",
]

# Every remaining column, in its original order, counts as non-categorical.
_categorical_lookup = set(categorical_features)
non_categorical_features = [column for column in x.columns if column not in _categorical_lookup]

x_categorical = x[categorical_features]
x_non_categorical = x[non_categorical_features]

In [None]:
scaler = StandardScaler() # Standardize features by removing the mean and scaling to unit variance.
#scaler = MinMaxScaler()

# fit_transform returns a bare array; only the non-categorical columns are scaled.
x_non_categorical_standard = scaler.fit_transform(x_non_categorical)

# Rebuild the frame with the original column names AND the original row index. With the
# default RangeIndex, the concat below would misalign rows (and introduce NaNs) whenever
# x does not carry a plain 0..n-1 index.
df = pd.DataFrame(
    x_non_categorical_standard,
    columns=non_categorical_features,
    index=x_non_categorical.index,
)

# Scaled non-categorical columns first, untouched categorical columns after.
x_standard = pd.concat([df, x_categorical], axis=1)

## Feature selection

A good way to discard some columns is to drop those whose mean is very close to 0. We can do this only with the features that represent some kind of counter, because a near-zero mean there means that most of the samples don't register that feature.

In [None]:
# Candidate columns for removal: those whose mean is within 0.01 of zero.
columns_to_drop = [
    feature_name
    for feature_name in x.columns
    if abs(np.mean(x[feature_name])) <= 0.01
]

# Inspect the candidates' summary statistics.
x[columns_to_drop].describe()

Veient els màxims i mínims d'aquestes columnes queda clar que totes aquestes columnes indiquen un compte d'alguna cosa o són variables binàries i, en qualsevol cas, no donen suficient informació per ajudar en la classificació.

In [None]:
# The drop itself is currently disabled; uncomment both lines to remove the candidates
# from the raw and the standardized frames.
#x = x.drop(columns = columns_to_drop)
#x_standard = x_standard.drop(columns = columns_to_drop)

sample_count, feature_count = x.shape
print("Number of samples:", sample_count)
print("Number of features:", feature_count)

In [None]:
def split_data(x, y, train_ratio=0.8):
    """Randomly split samples into a training part and a validation part.

    The rows are shuffled with NumPy's global RNG (seed it beforehand for
    reproducibility); the first ``floor(n * train_ratio)`` shuffled rows form the
    training set and the remaining rows the validation set.

    Parameters
    ----------
    x : array-like with ``shape`` (NumPy array or pandas object)
        Samples, one per row.
    y : array-like (NumPy array or pandas object)
        Targets, one per sample, in the same order as ``x``.
    train_ratio : float, default 0.8
        Fraction of the samples assigned to the training set.

    Returns
    -------
    x_train, y_train, x_val, y_val
        Row subsets of ``x`` and ``y``, of the same container type as the inputs.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not contain the same number of samples.
    """
    def _take_rows(data, positions):
        # pandas objects must be indexed positionally: plain data[positions] is
        # label-based and silently depends on the index being 0..n-1.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return data[positions]

    n_samples = x.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"x and y must have the same number of samples, got {n_samples} and {len(y)}"
        )

    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    n_train = int(np.floor(n_samples * train_ratio))
    indices_train = indices[:n_train]
    indices_val = indices[n_train:]

    x_train = _take_rows(x, indices_train)
    y_train = _take_rows(y, indices_train)
    x_val = _take_rows(x, indices_val)
    y_val = _take_rows(y, indices_val)
    return x_train, y_train, x_val, y_val

# Split the standardized data into training and validation sets.
x_train, y_train, x_val, y_val = split_data(x_standard.values, y)

# One row per feature: column 0 holds the feature's position, column 1 its validation R^2.
feature_count = x_train.shape[1]
r2_table = np.zeros((feature_count, 2))

for i in range(feature_count):
    # Use attribute i alone, as a single-column matrix, for both sets.
    x_t = x_train[:, i].reshape(-1, 1)
    x_v = x_val[:, i].reshape(-1, 1)

    # Fit a one-feature linear regression on the training set...
    regr = LinearRegression()
    regr.fit(x_t, y_train)

    # ...and score it on the validation set.
    r2 = r2_score(y_val, regr.predict(x_v))

    r2_table[i] = (i, r2)

We create a reduced dataset with the best attributes (those with the highest validation $R^2$).

In [None]:
# Feature positions ordered from highest to lowest validation R^2.
descending_r2_order = np.argsort(r2_table[:, 1])[::-1]
best_atributes = r2_table[descending_r2_order, 0].astype('int')

# Names of the 15 best-scoring features (positions refer to x_standard's column order).
top_feature_names = x_standard.columns[best_atributes[:15]]

# Standardized and raw versions of the reduced dataset, each with the target appended
# as a final "status" column.
reduced_dataset_standard = x_standard[top_feature_names].assign(status=y.values)
reduced_dataset = x[top_feature_names].assign(status=y.values)

# Plain arrays for the models: features without the target, and the target as a column vector.
X = reduced_dataset.drop(columns='status').values
X_standard = reduced_dataset_standard.drop(columns='status').values

Y = reduced_dataset.filter(['status']).values

Relació entre variables no categoriques

In [None]:
# Labelled copy used only for the (currently disabled) pairplot: the string labels go in
# an extra "Status" column while `reduced_dataset` itself is left untouched. Previously the
# column was inserted in place and immediately dropped again, with one of the two drops
# discarding its result.
reduced_dataset_labeled = reduced_dataset.assign(Status=status_labels)
#relacio = sns.pairplot(reduced_dataset_labeled, hue = "Status", y_vars = ["status"], x_vars = list(reduced_dataset_labeled.columns[:-2]))
#relacio.fig.set_size_inches(15,2)
#relacio._legend.set_title("Status")

Correlació entre les variables independents i la dependent

In [None]:
# Pairwise correlation matrix of the reduced dataset (features plus target).
correlacio = reduced_dataset.corr()

# Keep only the "status" column and drop its last entry before plotting.
status_correlation = correlacio["status"].iloc[:-1].to_frame()

plt.figure()
ax = sns.heatmap(status_correlation, annot=True, linewidths=.5)

In [None]:
#correlacio = reduced_dataset.corr()

#plt.figure()
#ax = sns.heatmap(pd.DataFrame(correlacio), annot=True, linewidths=.5)

In [None]:
#relacio = sns.pairplot(reduced_dataset, hue = "status")

Utilities

In [None]:
def visualize_confusion_matrix(y_pred, y_real):
    """Show the confusion matrix of a set of predictions as an annotated heatmap.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels.
    y_real : array-like
        True class labels (plotted on the "Actual" axis).
    """
    matrix = confusion_matrix(y_real, y_pred)

    _, axes = plt.subplots(figsize=(10, 6))
    sns.heatmap(matrix, annot=True, fmt='g', ax=axes)
    axes.set_xlabel("Predicted")
    axes.set_ylabel("Actual")
    axes.set_title("Confusion Matrix")
    plt.show()

PCA Analysis

In [None]:
# Tensor copies of the standardized features and of the target column vector.
X_torch, Y_torch = torch.tensor(X_standard), torch.tensor(Y)

In [None]:
# pca_lowrank returns (U, S, V); only V is needed here.
_, _, V = torch.pca_lowrank(X_torch, q=None, center=False, niter=100)

# Project every sample onto the first two columns of V.
pca = X_torch @ V[:, :2]

# Scatter of the two components, coloured by the string status label.
pt = sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=status_labels)
pt.set_xlim(-5, 28)
pt.set_ylim(-13, 4);

"Legitimate" status data

In [None]:
# Same projection and axis limits, restricted to the samples with status == 0.
legitimate_mask = reduced_dataset_standard["status"] == 0
pt = sns.scatterplot(x=pca[legitimate_mask, 0], y=pca[legitimate_mask, 1])
pt.set_xlim(-5, 28)
pt.set_ylim(-13, 4);

"Phishing" status data

In [None]:
# Same projection and axis limits, restricted to the samples with status == 1.
phishing_mask = reduced_dataset_standard["status"] == 1
pt = sns.scatterplot(x=pca[phishing_mask, 0], y=pca[phishing_mask, 1])
pt.set_xlim(-5, 28)
pt.set_ylim(-13, 4);

3D PCA for visualization:

In [None]:
# Principal directions estimated on column-centered data (center=True).
V = torch.pca_lowrank(X_torch, q=None, center=True, niter=3)[2]

# Project the samples onto the first three directions.
# NOTE(review): X_torch is projected without subtracting the column means that
# center=True removed internally, so the cloud is shifted by a constant offset -- harmless
# for visualization, but confirm if the coordinates are reused.
# NOTE(review): this rebinds `pca`, so re-running the 2D plot cells afterwards would use
# this 3-component projection instead of the 2D one.
pca = torch.matmul(X_torch, V[:, :3])

# (Removed a leftover `df = px.data.iris()`: it was never used and overwrote the
# notebook's `df` holding the standardized features.)
fig = px.scatter_3d(x = pca[:, 0], y = pca[:, 1], z = pca[:, 2], color = status_labels)
fig.show()

#### Logistic regression

Loading model and testing

In [None]:
# Evaluate the saved model file on the standardized reduced dataset (features + "status").
# NOTE(review): score_torch_model is not defined in this notebook; presumably it comes
# from the star-import of score_model -- its exact behaviour and output should be
# confirmed there.
score_torch_model('../models/logiReg.pth', reduced_dataset_standard)

Visualizing the best model

In [None]:
X_torch = torch.tensor(X_standard).float()
Y_torch = torch.tensor(Y)

# Load the saved logistic model. The context manager closes the file handle, which the
# previous bare open() call leaked.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted model files.
with open('../models/logiReg.pth', 'rb') as model_file:
    model = pickle.load(model_file)

# Parameters from the linear model of the logistic regression.
parameters = [param.data for name, param in model.named_parameters()]

# The code below treats the first parameter as a (1, n_features) weight row and the second
# as a single-element bias. Stack them once (they do not change per sample) into a column
# vector [bias, w_1, ..., w_n].
params = parameters[0].numpy()
bias = parameters[1].numpy()
p = np.concatenate((bias.reshape(1, 1), params.transpose()))

# Evaluate each sample in the linear model and collect the model's prediction for it.
linear_model_values = []
preds = []
for sample in X_torch:
    # Sample as a column vector with a leading 1 that multiplies the bias. Deliberately NOT
    # named `x`: that name would overwrite the feature DataFrame `x` defined earlier in the
    # notebook and break any earlier cell that is re-run afterwards.
    sample_with_bias = np.concatenate((np.ones((1, 1)), sample.numpy().reshape(-1, 1)))

    linear_model_values.append(np.dot(sample_with_bias.T, p)[0][0])

    # Get the prediction for this sample.
    preds.append(model(sample).detach().numpy()[0])


# Labeling samples by class (predicted and actual), thresholding predictions at 0.5.
color_pred = np.array([1 if i >= 0.5 else 0 for i in preds])
color_real = Y.reshape(Y_torch.shape[0])

plt.figure()
plt.title("Actual labeling")
sns.scatterplot(x = linear_model_values, y=Y.T[0], hue=color_real)
plt.show()

plt.figure()
plt.title("Prediction")
sns.scatterplot(x=linear_model_values, y=preds, hue=color_real)
plt.show()

# Misclassified samples: predicted 1 but actually 0 (mal1), or predicted 0 but actually 1 (mal2).
mal1 = [(v >= 0.5 and Y.T[0][i] == 0) for i, v in enumerate(preds)]
mal2 = [(v < 0.5 and Y.T[0][i] == 1) for i, v in enumerate(preds)]
mal = np.logical_or(mal1, mal2)

# Third class (2) marks the wrong predictions so they can be drawn in their own colour.
color_pred_mal = color_pred.copy()
color_pred_mal[mal] = 2

plt.figure()
plt.title("Prediction, wrong values in red")
sns.scatterplot(x=linear_model_values, y=preds, hue=color_pred_mal, palette = ["blue", "orange", "red"])
plt.show()

visualize_confusion_matrix(color_pred, color_real)

### SVM

In [None]:
# Evaluate the saved model file (presumably the linear-kernel SVM, judging by its name)
# on the standardized reduced dataset.
# NOTE(review): score_sklearn_model is not defined in this notebook (presumably it comes
# from the star-import of score_model); the meaning of the third positional argument
# (True) should be confirmed there.
score_sklearn_model("../models/modelLinear.sav", reduced_dataset_standard, True)

In [None]:
# Evaluate the saved model file (presumably the RBF-kernel SVM, judging by its name).
# NOTE(review): unlike the other SVM cells, no third argument is passed here -- confirm
# that relying on the helper's default is intentional.
score_sklearn_model("../models/modelRbf.sav", reduced_dataset_standard)

In [None]:
# Evaluate the saved model file (presumably the sigmoid-kernel SVM, judging by its name)
# on the standardized reduced dataset.
score_sklearn_model("../models/modelSigmoid.sav", reduced_dataset_standard, True)

In [None]:
# Evaluate the saved model file (presumably a polynomial-kernel SVM of degree 1, judging
# by its name) on the standardized reduced dataset.
score_sklearn_model("../models/modelPoly1.sav", reduced_dataset_standard, True)

In [None]:
# Evaluate the saved model file (presumably a polynomial-kernel SVM of degree 2, judging
# by its name) on the standardized reduced dataset.
score_sklearn_model("../models/modelPoly2.sav", reduced_dataset_standard, True)

In [None]:
# Evaluate the saved model file (presumably a polynomial-kernel SVM of degree 3, judging
# by its name) on the standardized reduced dataset.
score_sklearn_model("../models/modelPoly3.sav", reduced_dataset_standard, True)