In [2]:
from sklearn.datasets import make_regression
import numpy as np
import pandas as pd
#matplotlib notebook
from matplotlib import pyplot as plt
import scipy.stats
import math
import seaborn as sns

# Pandas display configuration.
# Floats are rendered with exactly 3 decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

# Print at most 20 rows of a frame, but never hide any of its columns.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)

In [3]:
# Load the raw data. The file is comma-delimited, so the decimal mark must stay
# at pandas' default ('.'): passing decimal=',' clashes with the delimiter and
# makes pandas keep every fractional column as strings (dtype `object`)
# instead of parsing it as float.
ds_original = pd.read_csv('dataset_phishing.csv', header=0, delimiter=',')

print("Original dataset dimensions:", ds_original.shape)

# The raw URL string can be dropped: the information it carries is already
# reflected in the remaining features.
dataset = ds_original.drop(columns="url")

Original dataset dimensions: (11430, 89)


In [4]:
# The goal is to predict whether a URL is phishing or not, so the dependent
# variable (target) is the "status" column; every other column is a feature.
# The column is selected directly: slicing the whole frame first
# (`dataset[:]`) only adds a redundant intermediate object.
y = dataset["status"]
x = dataset.drop(columns="status")

print("Number of samples:", x.shape[0])
print("Number of features:", x.shape[1])

Number of samples: 11430
Number of features: 87


In [5]:
# Summarise the raw string labels. Reading them from `dataset` (rather than
# from `y`) keeps this cell idempotent: re-running it after `y` has been
# encoded still prints the same summary.
print(dataset["status"].describe())

# Encode the target as integers: 1 = phishing, 0 = legitimate.
# `map` with an explicit dictionary produces an integer Series directly,
# without relying on `replace` implicitly downcasting an object column, and
# turns any unexpected label into NaN so the check below can catch it.
y = dataset["status"].map({"phishing": 1, "legitimate": 0})
assert y.notna().all(), "unexpected label found in the 'status' column"

print("\nContingut de Y:", set(y))

print("\nDimensionalitat de X:", x.shape)

count        11430
unique           2
top       phishing
freq          5715
Name: status, dtype: object

Contingut de Y: {0, 1}

Dimensionalitat de X: (11430, 87)


Inicialment a $Y$ teníem les etiquetes "phishing" i "legitimate". Per poder fer regressió ens interessa convertir-les en un valor binari on 1 indica _phishing_ i 0 indica _legitimate_.
També veiem que les mostres estan repartides al 50% entre les dues categories (com s'indica a la descripció de Kaggle).

Les columnes de $X$ que tenen nombres decimals poden aparèixer amb el tipus genèric _object_ (pandas les guarda com a text quan no les ha pogut interpretar com a nombres en llegir el CSV). Això pot crear problemes més endavant, per tant és bona idea assegurar la conversió a tipus _float_.

In [6]:
# Cast every column that pandas stored with the generic `object` dtype to float.
# The columns are selected by label: looking up a label-indexed boolean Series
# by integer position (as `mask[col]` with `col` from `range(...)`) is
# deprecated in recent pandas versions.
for column_name in x.select_dtypes(include="object").columns:
    x[column_name] = x[column_name].astype("float")

Now $X$ contains only numeric values.

A simple way to discard some columns is to drop those whose mean is 0 or very close to 0. This only makes sense for features that represent some kind of counter, because a near-zero mean there means that most of the samples do not register that feature.

In [7]:
# Collect, in column order, the features whose mean does not exceed 0.01.
columns_to_drop = [
    feature for feature in x.columns if np.mean(x[feature]) <= 0.01
]

# Summary statistics of the selected columns (displayed as the cell output).
x[columns_to_drop].describe()

Unnamed: 0,nb_or,nb_tilde,nb_star,nb_comma,nb_dollar,nb_dslash,punycode,port,path_extension,nb_external_redirection,brand_in_subdomain,brand_in_path,ratio_nullHyperlinks,ratio_intRedirection,ratio_intErrors,submit_email,sfh,iframe,popup_window,onmouseover,right_clic
count,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0
mean,0.0,0.007,0.001,0.004,0.002,0.007,0.0,0.002,0.0,0.003,0.004,0.005,0.0,0.0,0.0,0.0,0.0,0.001,0.006,0.001,0.001
std,0.0,0.081,0.026,0.103,0.077,0.081,0.019,0.049,0.013,0.056,0.064,0.07,0.0,0.0,0.0,0.0,0.0,0.036,0.077,0.034,0.037
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,1.0,1.0,4.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0


In [8]:
# Remove the features listed in `columns_to_drop`.
# errors="ignore" keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
x = x.drop(columns=columns_to_drop, errors="ignore")
x.shape

(11430, 66)

Veient els màxims i mínims d'aquestes columnes queda clar que totes indiquen un compte d'alguna cosa o són variables binàries i, en qualsevol cas, no donen suficient informació per ajudar en la classificació.

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

def mse(v1, v2):
    """Return the mean squared error between two collections of values.

    Both inputs are converted to float NumPy arrays before subtracting, so
    plain Python lists are accepted and pandas Series are compared element by
    element in positional order (no index alignment takes place).

    Parameters
    ----------
    v1, v2 : array-like
        Values to compare; their shapes must be equal or broadcastable.

    Returns
    -------
    float
        Mean of the squared element-wise differences.
    """
    diff = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    return (diff ** 2).mean()


def regression(x, y):
    """Train a scikit-learn ``LinearRegression`` to predict ``y`` from ``x``.

    Parameters
    ----------
    x : array-like
        Training features, passed unchanged to ``LinearRegression.fit``.
    y : array-like
        Training targets, passed unchanged to ``LinearRegression.fit``.

    Returns
    -------
    LinearRegression
        The fitted estimator.
    """
    # ``fit`` returns the estimator itself, so the model can be created,
    # trained and returned in a single expression.
    return LinearRegression().fit(x, y)

In [42]:
def split_data(x, y, train_ratio=0.8, seed=None):
    """Randomly split samples into a training and a validation subset.

    Parameters
    ----------
    x : numpy.ndarray or pandas.DataFrame
        Samples, one per row.
    y : array-like or pandas.Series
        Targets, one per row of ``x``.
    train_ratio : float, default 0.8
        Fraction of the samples assigned to the training subset; must lie in
        the interval [0, 1]. The training size is ``floor(n * train_ratio)``.
    seed : int or None, default None
        When given, the shuffle uses a dedicated generator seeded with this
        value, so the split is reproducible. When ``None``, the global NumPy
        random state is used (controllable through ``np.random.seed``).

    Returns
    -------
    tuple
        ``(x_train, y_train, x_val, y_val)``. Each part keeps the container
        type of its input (pandas objects stay pandas objects).

    Raises
    ------
    ValueError
        If ``train_ratio`` is outside [0, 1] or ``x`` and ``y`` differ in
        length.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError("train_ratio must be between 0 and 1")

    n_samples = x.shape[0]
    if len(y) != n_samples:
        raise ValueError("x and y must contain the same number of samples")

    indices = np.arange(n_samples)
    # The np.random module and a Generator both expose an in-place `shuffle`.
    rng = np.random if seed is None else np.random.default_rng(seed)
    rng.shuffle(indices)

    n_train = int(np.floor(n_samples * train_ratio))
    indices_train = indices[:n_train]
    indices_val = indices[n_train:]

    def _take(values, positions):
        # Select rows by POSITION. For pandas objects plain `values[positions]`
        # is a label lookup, which only matches positions for a default index.
        if hasattr(values, "iloc"):
            return values.iloc[positions]
        return np.asarray(values)[positions]

    x_train = _take(x, indices_train)
    y_train = _take(y, indices_train)
    x_val = _take(x, indices_val)
    y_val = _take(y, indices_val)
    return x_train, y_train, x_val, y_val

# Split the data into training and validation subsets.
# The global NumPy random state is seeded first so that the random shuffle
# performed by split_data, and therefore every score below, is the same on
# each run of the notebook.
np.random.seed(42)
x_train, y_train, x_val, y_val = split_data(x.values, y)

# One row per feature: column 0 = position of the feature in x,
# column 1 = R^2 obtained on the validation subset.
n_features = x_train.shape[1]
r2_table = np.zeros((n_features, 2))

for i in range(n_features):
    # Select feature i as a 2-D column of shape (n_samples, 1).
    x_t = x_train[:, [i]]  # feature i, training subset
    x_v = x_val[:, [i]]    # feature i, validation subset

    regr = regression(x_t, y_train)
    y_pred = regr.predict(x_v)  # predict once and reuse for both metrics

    error = mse(y_val, y_pred)  # validation error (not stored in the table)
    r2 = r2_score(y_val, y_pred)

    r2_table[i, 0] = i
    r2_table[i, 1] = r2

# Rows ordered by ascending R^2 (displayed as the cell output).
r2_table[r2_table[:, 1].argsort()]

array([[ 4.70000000e+01, -7.20062871e-03],
       [ 5.00000000e+01, -5.66725060e-04],
       [ 2.60000000e+01, -3.77403876e-04],
       [ 1.40000000e+01, -3.29277670e-04],
       [ 5.90000000e+01, -9.39189974e-05],
       [ 3.00000000e+01,  1.70163172e-04],
       [ 6.20000000e+01,  6.30959778e-04],
       [ 3.10000000e+01,  6.60855559e-04],
       [ 9.00000000e+00,  8.12424876e-04],
       [ 4.90000000e+01,  1.00357019e-03],
       [ 1.00000000e+01,  1.19586644e-03],
       [ 2.80000000e+01,  1.25703022e-03],
       [ 3.30000000e+01,  3.31684249e-03],
       [ 1.70000000e+01,  4.04432560e-03],
       [ 2.10000000e+01,  6.64875989e-03],
       [ 4.00000000e+00,  8.45737091e-03],
       [ 1.20000000e+01,  9.78089529e-03],
       [ 4.60000000e+01,  9.79505699e-03],
       [ 2.40000000e+01,  9.92607061e-03],
       [ 2.30000000e+01,  1.10990818e-02],
       [ 5.40000000e+01,  1.12634907e-02],
       [ 1.30000000e+01,  1.14468303e-02],
       [ 4.10000000e+01,  1.14701592e-02],
       [ 2.