Processing the data

In [2]:
from src.utils.split_data import train_val_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np

In [3]:
#Load the data from the csv file
raw_df = pd.read_csv('../../../../../data/raw/urls/Malware.csv')

In [4]:
#Split the data into train, validation and test
train_set, val_set, test_set = train_val_test_split(raw_df)

In [5]:
#Split the inputs and targets
X_train = train_set.drop("URL_Type_obf_Type", axis=1)
y_train = train_set["URL_Type_obf_Type"].copy()

X_val = val_set.drop("URL_Type_obf_Type", axis=1)
y_val = val_set["URL_Type_obf_Type"].copy()

X_test = test_set.drop("URL_Type_obf_Type", axis=1)
y_test = test_set["URL_Type_obf_Type"].copy()

In [6]:
# Drop the feacture with infinite values
X_train = X_train.drop("argPathRatio", axis=1)
X_val = X_val.drop("argPathRatio", axis=1)
X_test = X_test.drop("argPathRatio", axis=1)

In [7]:
#Null values are padded
imputer = SimpleImputer(strategy='median')
X_train_padded = imputer.fit_transform(X_train)
X_val_padded = imputer.fit_transform(X_val)
X_test_padded = imputer.fit_transform(X_test)

In [8]:
# Transform the result to a Pandas DataFrame
X_train_padded = pd.DataFrame(X_train_padded, columns=X_train.columns, index=y_train.index)
X_val_padded = pd.DataFrame(X_val_padded, columns=X_val.columns, index=y_val.index)
X_test_padded = pd.DataFrame(X_test_padded, columns=X_test.columns, index=y_test.index)

In [9]:
#A random forest is trained for the selection of characteristics
clf_rnd = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
clf_rnd.fit(X_train_padded, y_train)

In [10]:
# Evaluate model
y_train_pred = clf_rnd.predict(X_train_padded)
y_val_pred = clf_rnd.predict(X_val_padded)
y_test_pred = clf_rnd.predict(X_test_padded)

print("F1 score train:", f1_score(y_train_pred, y_train, pos_label='malware'))
print("F1 score val:", f1_score(y_val_pred, y_val, pos_label='malware'))
print("F1 score test:", f1_score(y_test_pred, y_test, pos_label='malware'))

F1 score train: 1.0
F1 score val: 0.9926144756277696
F1 score test: 0.9944913698127067


In [11]:
#Get the most important feactures
feature_importances = {name: score for name, score in zip(list(raw_df), clf_rnd.feature_importances_)}
feature_importances_sorted = pd.Series(feature_importances).sort_values(ascending=False)
feature_importances_sorted

NumberRate_AfterPath    0.079855
isPortEighty            0.065570
domainlength            0.056257
SymbolCount_URL         0.053563
domain_token_count      0.042301
                          ...   
dld_getArg              0.000158
dld_filename            0.000113
executable              0.000026
dld_domain              0.000000
NumberofDotsinURL       0.000000
Length: 78, dtype: float64

In [12]:
#The most 15 important feacture
columns = list(feature_importances_sorted.head(15).index)
columns

['NumberRate_AfterPath',
 'isPortEighty',
 'domainlength',
 'SymbolCount_URL',
 'domain_token_count',
 'tld',
 'argDomanRatio',
 'ISIpAddressInDomainName',
 'delimeter_Domain',
 'Entropy_URL',
 'urlLen',
 'LongestPathTokenLength',
 'URL_Letter_Count',
 'avgdomaintokenlen',
 'pathDomainRatio']

In [13]:
#Reduce dataset to the most important feactures
X_train_reduced = X_train_padded[columns].copy()
X_val_reduced = X_val_padded[columns].copy()
X_test_reduced = X_test_padded[columns].copy()

columns_to_exclude = ['delimeter_Domain', 'Entropy_URL']

X_train_reduced = X_train_reduced.drop(columns_to_exclude, axis=1)
X_val_reduced = X_val_reduced.drop(columns_to_exclude, axis=1)
X_test_reduced = X_test_reduced.drop(columns_to_exclude, axis=1)

In [14]:
# raw_df['NumberRate_AfterPath'].value_counts()
# raw_df['NumberRate_AfterPath'][raw_df['URL_Type_obf_Type'] == 'benign'].value_counts()
# raw_df['NumberRate_AfterPath'].describe()
# raw_df['Entropy_URL'].value_counts()
# raw_df['LongestPathTokenLength'].describe()
# raw_df['URL_Letter_Count'].value_counts()
# raw_df['avgdomaintokenlen'].value_counts()
X_train_reduced

Unnamed: 0,NumberRate_AfterPath,isPortEighty,domainlength,SymbolCount_URL,domain_token_count,tld,argDomanRatio,ISIpAddressInDomainName,urlLen,LongestPathTokenLength,URL_Letter_Count,avgdomaintokenlen,pathDomainRatio
5559,-1.000000,-1.0,15.0,10.0,2.0,2.0,0.133333,-1.0,56.0,6.0,39.0,7.000000,2.266667
2835,-1.000000,-1.0,9.0,5.0,2.0,2.0,0.222222,-1.0,103.0,76.0,63.0,4.000000,9.666667
4700,-1.000000,-1.0,13.0,5.0,2.0,2.0,0.153846,-1.0,105.0,67.0,82.0,6.000000,6.538462
2445,0.086957,-1.0,15.0,7.0,2.0,2.0,1.533333,-1.0,55.0,32.0,36.0,7.000000,2.200000
2233,-1.000000,-1.0,9.0,8.0,2.0,2.0,0.222222,-1.0,73.0,30.0,51.0,4.000000,6.333334
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,-1.000000,-1.0,14.0,9.0,2.0,2.0,0.142857,-1.0,120.0,77.0,92.0,6.500000,7.071429
13418,0.000000,-1.0,13.0,11.0,3.0,3.0,1.384615,-1.0,66.0,22.0,50.0,3.666667,3.538461
5390,-1.000000,-1.0,9.0,8.0,2.0,2.0,0.222222,-1.0,57.0,9.0,46.0,4.000000,4.555555
860,-1.000000,-1.0,7.0,5.0,2.0,2.0,0.285714,-1.0,96.0,65.0,71.0,3.000000,11.714286


In [15]:
# The dataset is scaled
scaler = RobustScaler()
X_train_reduced_scaled = scaler.fit_transform(X_train_reduced.copy())
X_val_reduced_scaled = scaler.fit_transform(X_val_reduced.copy())
X_test_reduced_scaled = scaler.fit_transform(X_test_reduced.copy())

# Transform the result to a Pandas DataFrame
X_train_reduced_prep = pd.DataFrame(X_train_reduced_scaled, columns=X_train_reduced.columns, index=y_train.index)
X_val_reduced_prep = pd.DataFrame(X_val_reduced_scaled, columns=X_val_reduced.columns, index=y_val.index)
X_test_reduced_prep = pd.DataFrame(X_test_reduced_scaled, columns=X_test_reduced.columns, index=y_test.index)

In [16]:
#Transform the targets to numerics values
y_train_num = y_train.map({'malware': 1, 'benign': 0})
y_val_num = y_val.map({'malware': 1, 'benign': 0})
y_test_num = y_test.map({'malware': 1, 'benign': 0})

In [17]:
#Save the data
np.savez('../../../../../data/processed/urls/malware/scaled_train_data', inputs=X_train_reduced_prep, targets=y_train_num)
np.savez('../../../../../data/processed/urls/malware/scaled_validation_data', inputs=X_val_reduced_prep, targets=y_val_num)
np.savez('../../../../../data/processed/urls/malware/scaled_test_data', inputs=X_test_reduced_prep, targets=y_test_num)

np.savez('../../../../../data/processed/urls/malware/train_data', inputs=X_train_reduced, targets=y_train_num)
np.savez('../../../../../data/processed/urls/malware/validation_data', inputs=X_val_reduced, targets=y_val_num)
np.savez('../../../../../data/processed/urls/malware/test_data', inputs=X_test_reduced, targets=y_test_num)