Processing the data

In [1]:
from src.utils.split_data import train_val_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np

In [2]:
# Read the raw URL dataset from CSV (path is relative to the notebook's working directory)
raw_df = pd.read_csv(filepath_or_buffer='../../../../../data/raw/urls/Spam.csv')

In [3]:
# Partition the raw frame into train, validation and test subsets.
# NOTE(review): split ratios, shuffling and seeding are decided inside the
# project helper train_val_test_split, which is not visible here — confirm it
# is seeded if the splits must be reproducible across runs.
train_set, val_set, test_set = train_val_test_split(raw_df)

In [4]:
# Separate the feature columns from the "URL_Type_obf_Type" label in every split.
# The label Series are copied so they are independent of the source frames.
X_train, y_train = train_set.drop(columns=["URL_Type_obf_Type"]), train_set["URL_Type_obf_Type"].copy()

X_val, y_val = val_set.drop(columns=["URL_Type_obf_Type"]), val_set["URL_Type_obf_Type"].copy()

X_test, y_test = test_set.drop(columns=["URL_Type_obf_Type"]), test_set["URL_Type_obf_Type"].copy()

In [5]:
# Fill missing values with per-column medians.
# The medians are learned from the training split only and then reused for the
# validation and test splits: re-fitting on held-out data would leak their
# statistics into preprocessing and impute each split with different values.
imputer = SimpleImputer(strategy='median')
X_train_padded = imputer.fit_transform(X_train)
X_val_padded = imputer.transform(X_val)
X_test_padded = imputer.transform(X_test)

In [6]:
# Wrap the imputed arrays back into DataFrames, restoring the original column
# names and the row index of the matching target Series.
X_train_padded, X_val_padded, X_test_padded = (
    pd.DataFrame(values, columns=features.columns, index=target.index)
    for values, features, target in (
        (X_train_padded, X_train, y_train),
        (X_val_padded, X_val, y_val),
        (X_test_padded, X_test, y_test),
    )
)

In [7]:
# Train a random forest on the imputed training data; it is used to rank the
# features for selection. random_state is fixed so the fit is repeatable.
clf_rnd = RandomForestClassifier(
    n_estimators=50,
    n_jobs=-1,
    random_state=42,
)
clf_rnd.fit(X_train_padded, y_train)

In [9]:
# Evaluate the fitted forest on every split.
y_train_pred = clf_rnd.predict(X_train_padded)
y_val_pred = clf_rnd.predict(X_val_padded)
y_test_pred = clf_rnd.predict(X_test_padded)

# f1_score takes (y_true, y_pred) in that order; 'spam' is scored as the positive class.
print("F1 score train:", f1_score(y_train, y_train_pred, pos_label='spam'))
print("F1 score val:", f1_score(y_val, y_val_pred, pos_label='spam'))
print("F1 score test:", f1_score(y_test, y_test_pred, pos_label='spam'))

F1 score train: 1.0
F1 score val: 0.9977908689248897
F1 score test: 0.9974025974025974


In [10]:
# Rank the features by the importance the forest assigned to them.
# The names come from the frame the model was fitted on, so every score is
# paired with its own column. Taking them from raw_df would also include the
# target column and only line up if that column happens to be the last one.
feature_importances = dict(zip(X_train_padded.columns, clf_rnd.feature_importances_))
feature_importances_sorted = pd.Series(feature_importances).sort_values(ascending=False)
feature_importances_sorted

tld                        0.145202
SymbolCount_Domain         0.123768
NumberofDotsinURL          0.099386
domain_token_count         0.092425
CharacterContinuityRate    0.073285
                             ...   
dld_filename               0.000006
dld_domain                 0.000000
executable                 0.000000
isPortEighty               0.000000
ISIpAddressInDomainName    0.000000
Length: 79, dtype: float64

In [12]:
# Names of the 14 highest-ranked features, most important first
columns = feature_importances_sorted.index[:14].tolist()
columns

['tld',
 'SymbolCount_Domain',
 'NumberofDotsinURL',
 'domain_token_count',
 'CharacterContinuityRate',
 'SymbolCount_URL',
 'ArgUrlRatio',
 'argPathRatio',
 'SymbolCount_FileName',
 'SymbolCount_Extension',
 'Extension_DigitCount',
 'Entropy_Extension',
 'SymbolCount_Afterpath',
 'domainlength']

In [18]:
# Restrict every split to the selected features, leaving out 'Entropy_Extension'.
# NOTE(review): the reason for excluding this column is not recorded in the
# notebook — confirm and document it.
column_to_exclude = 'Entropy_Extension'

X_train_reduced = X_train_padded[columns].drop(columns=[column_to_exclude])
X_val_reduced = X_val_padded[columns].drop(columns=[column_to_exclude])
X_test_reduced = X_test_padded[columns].drop(columns=[column_to_exclude])

In [33]:
# Ad-hoc inspection of individual columns of the raw data. Only the
# SymbolCount_Afterpath value counts are active; the remaining probes are
# left commented out for reference.
# raw_df['NumberRate_AfterPath'].value_counts()
# raw_df['NumberRate_AfterPath'][raw_df['URL_Type_obf_Type'] == 'benign'].value_counts()
# raw_df['NumberRate_AfterPath'].describe()
# raw_df['Entropy_URL'].value_counts()
# raw_df['LongestPathTokenLength'].describe()
# raw_df['URL_Letter_Count'].value_counts()
raw_df['SymbolCount_Afterpath'].value_counts()
# raw_df['argPathRatio'].describe()
# X_train_reduced


SymbolCount_Afterpath
-1     8024
 1     1954
 3     1292
 5      788
 7      709
 9      337
 11     306
 0      204
 8      177
 12     133
 2      131
 10     125
 6       93
 4       81
 13      53
 15      28
 14      26
 16       6
 18       6
 23       4
 20       1
 29       1
Name: count, dtype: int64

In [67]:
# Scale the selected features with RobustScaler.
# The scaler is fitted on the training split only; validation and test are
# transformed with those same statistics so all three splits share one scale
# and nothing from the held-out data leaks into preprocessing.
# The default copy=True leaves the input frames untouched, so no explicit
# copy is needed.
scaler = RobustScaler()
X_train_reduced_scaled = scaler.fit_transform(X_train_reduced)
X_val_reduced_scaled = scaler.transform(X_val_reduced)
X_test_reduced_scaled = scaler.transform(X_test_reduced)

# Wrap the scaled arrays back into DataFrames, keeping the column names and
# the row index of the matching target Series.
X_train_reduced_prep = pd.DataFrame(X_train_reduced_scaled, columns=X_train_reduced.columns, index=y_train.index)
X_val_reduced_prep = pd.DataFrame(X_val_reduced_scaled, columns=X_val_reduced.columns, index=y_val.index)
X_test_reduced_prep = pd.DataFrame(X_test_reduced_scaled, columns=X_test_reduced.columns, index=y_test.index)

In [68]:
# Encode the string labels as integers: 'spam' -> 1, 'benign' -> 0.
# Any other label value would come out of Series.map as NaN.
y_train_num, y_val_num, y_test_num = (
    labels.map({'spam': 1, 'benign': 0})
    for labels in (y_train, y_val, y_test)
)

In [71]:
# Persist every split as an .npz archive holding 'inputs' and 'targets' arrays.
# The scaled variants are written first, then the unscaled ones.
for out_path, split_inputs, split_targets in (
    ('../../../../../data/processed/urls/spam/scaled_train_data', X_train_reduced_prep, y_train_num),
    ('../../../../../data/processed/urls/spam/scaled_validation_data', X_val_reduced_prep, y_val_num),
    ('../../../../../data/processed/urls/spam/scaled_test_data', X_test_reduced_prep, y_test_num),
    ('../../../../../data/processed/urls/spam/train_data', X_train_reduced, y_train_num),
    ('../../../../../data/processed/urls/spam/validation_data', X_val_reduced, y_val_num),
    ('../../../../../data/processed/urls/spam/test_data', X_test_reduced, y_test_num),
):
    np.savez(out_path, inputs=split_inputs, targets=split_targets)