Processing the data

In [2]:
from src.utils.split_data import train_val_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np

In [3]:
#Load the data from the csv file
raw_df = pd.read_csv('../../../../../data/raw/urls/Phishing.csv')

In [4]:
#Split the data into train, validation and test
train_set, val_set, test_set = train_val_test_split(raw_df)

In [5]:
#Split the inputs and targets
X_train = train_set.drop("URL_Type_obf_Type", axis=1)
y_train = train_set["URL_Type_obf_Type"].copy()

X_val = val_set.drop("URL_Type_obf_Type", axis=1)
y_val = val_set["URL_Type_obf_Type"].copy()

X_test = test_set.drop("URL_Type_obf_Type", axis=1)
y_test = test_set["URL_Type_obf_Type"].copy()

In [6]:
# Drop the feacture with infinite values
X_train = X_train.drop("argPathRatio", axis=1)
X_val = X_val.drop("argPathRatio", axis=1)
X_test = X_test.drop("argPathRatio", axis=1)

In [7]:
#Null values are padded
imputer = SimpleImputer(strategy='median')
X_train_padded = imputer.fit_transform(X_train)
X_val_padded = imputer.fit_transform(X_val)
X_test_padded = imputer.fit_transform(X_test)

In [8]:
# Transform the result to a Pandas DataFrame
X_train_padded = pd.DataFrame(X_train_padded, columns=X_train.columns, index=y_train.index)
X_val_padded = pd.DataFrame(X_val_padded, columns=X_val.columns, index=y_val.index)
X_test_padded = pd.DataFrame(X_test_padded, columns=X_test.columns, index=y_test.index)

In [9]:
#A random forest is trained for the selection of characteristics
clf_rnd = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
clf_rnd.fit(X_train_padded, y_train)

In [27]:
# Evaluate model
y_train_pred = clf_rnd.predict(X_train_padded)
y_val_pred = clf_rnd.predict(X_val_padded)
y_test_pred = clf_rnd.predict(X_test_padded)

print("F1 score train:", f1_score(y_train_pred, y_train, pos_label='phishing'))
print("F1 score val:", f1_score(y_val_pred, y_val, pos_label='phishing'))
print("F1 score test:", f1_score(y_test_pred, y_test, pos_label='phishing'))

F1 score train: 1.0
F1 score val: 0.9811570247933884
F1 score test: 0.9824677472709228


In [11]:
#Get the most important feactures
feature_importances = {name: score for name, score in zip(list(raw_df), clf_rnd.feature_importances_)}
feature_importances_sorted = pd.Series(feature_importances).sort_values(ascending=False)
feature_importances_sorted.head()

domainUrlRatio     0.102543
tld                0.076584
domainlength       0.067253
SymbolCount_URL    0.064566
pathDomainRatio    0.055492
dtype: float64

In [12]:
#The most 10 important feacture
columns = list(feature_importances_sorted.head(10).index)
columns

['domainUrlRatio',
 'tld',
 'domainlength',
 'SymbolCount_URL',
 'pathDomainRatio',
 'isPortEighty',
 'domain_token_count',
 'pathurlRatio',
 'Query_LetterCount',
 'subDirLen']

In [13]:
#Reduce dataset to the most important feactures
X_train_reduced = X_train_padded[columns].copy()
X_val_reduced = X_val_padded[columns].copy()
X_test_reduced = X_test_padded[columns].copy()

In [84]:
# X_test_reduced['isPortEighty'].value_counts()
# raw_df['domain_token_count'].value_counts()
# raw_df['tld'].value_counts()
# raw_df['Query_LetterCount'].describe()
# X_test_reduced['subDirLen'].describe()
# raw_df.describe()
# raw_df['tld'][raw_df['URL_Type_obf_Type'] == 'phishing'].value_counts()

In [23]:
# The dataset is scaled
scaler = RobustScaler()
X_train_reduced_scaled = scaler.fit_transform(X_train_reduced.copy())
X_val_reduced_scaled = scaler.fit_transform(X_val_reduced.copy())
X_test_reduced_scaled = scaler.fit_transform(X_test_reduced.copy())

# Transform the result to a Pandas DataFrame
X_train_reduced_prep = pd.DataFrame(X_train_reduced_scaled, columns=X_train_reduced.columns, index=y_train.index)
X_val_reduced_prep = pd.DataFrame(X_val_reduced_scaled, columns=X_val_reduced.columns, index=y_val.index)
X_test_reduced_prep = pd.DataFrame(X_test_reduced_scaled, columns=X_test_reduced.columns, index=y_test.index)

In [24]:
#Transform the targets to numerics values
y_train_num = y_train.factorize()[0]
y_val_num = y_val.factorize()[0]
y_test_num = y_test.factorize()[0]

In [26]:
#Save the data
np.savez('../../../../../data/processed/urls/phishing/scaled_train_data', inputs=X_train_reduced_prep, targets=y_train_num)
np.savez('../../../../../data/processed/urls/phishing/scaled_validation_data', inputs=X_val_reduced_prep, targets=y_val_num)
np.savez('../../../../../data/processed/urls/phishing/scaled_test_data', inputs=X_test_reduced_prep, targets=y_test_num)

np.savez('../../../../../data/processed/urls/phishing/train_data', inputs=X_train_reduced, targets=y_train_num)
np.savez('../../../../../data/processed/urls/phishing/validation_data', inputs=X_val_reduced, targets=y_val_num)
np.savez('../../../../../data/processed/urls/phishing/test_data', inputs=X_test_reduced, targets=y_test_num)