In [11]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [12]:
df_mal = pd.read_csv("C:\\Users\\amogh\\OneDrive\\Documents\\GitHub\\InnovationProject\\data\\raw\\malware.csv")
df_benign= pd.read_csv("C:\\Users\\amogh\\OneDrive\\Documents\\GitHub\\InnovationProject\\data\\raw\\benign.csv")

In [13]:
df_mal["label"] = 1   # malware
df_benign["label"] = 0  # benign


In [14]:
df = pd.concat([df_mal, df_benign])


In [15]:
combined = df.sample(frac=1, random_state=42).reset_index(drop=True)
combined.sample(5)

Unnamed: 0,type,hash,malice,generic,trojan,ransomware,worm,backdoor,spyware,rootkit,encrypter,downloader,label
10743,0,b06008d06459302d9da4a6ae512bc678ecdcb20e8e5574...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3812,0,7230968208169f32697afe22a58dd045687f5c64c5c20b...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2334,0,9aeb66b7f94af52e74ed568e697a5a577ebf8ad58e55fe...,0.924528,0.418182,0.472727,0.0,0.018182,0.0,0.0,0.0,0.090909,0.0,1
8690,0,922fc59597bf515d542b5f928fceb674d3aba5c071d840...,0.956522,0.19802,0.386139,0.29703,0.0,0.0,0.0,0.0,0.118812,0.0,1
2275,0,580f72b131455f2e2ca1194d14692fe1f9899fa023dc9e...,0.890511,0.527027,0.472973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [16]:
#now to find out of any of the numeric columns will need scaling 
#Step1: seperte the categorical from the numeric columns
num_col=[]
for col in df.columns:
    if df[col].dtype in ['int64','float64']:
        num_col.append(col)
print(num_col)

['type', 'malice', 'generic', 'trojan', 'ransomware', 'worm', 'backdoor', 'spyware', 'rootkit', 'encrypter', 'downloader', 'label']


In [18]:
#Step2: Deciding which features need scaling
rows=[]
for col in num_col:
    minimum=df[col].min()
    maximum=df[col].max()
    rng=maximum - minimum
    mean=df[col].mean()
    std=df[col].std(ddof=0)
    if mean!=0:
        cv= std/mean #tells you how much the values are spread out from the average 
    else:
        if std !=0:
            cv = float("infinite") # the spread would be infinite if mean is 0 but variation still exists
        else:
            cv=0.0 

    uni=df[col].nunique()#cpunts the unique valus

    #check if binary
    if uni==2 and set(df[col].unique()).issubset({0,1}):
        is_binary=True
    else:
        is_binary=False

    #check if constant
    if uni==1:
        is_constant=True
    else:
        is_constant=False
    
    #appending the results for each column
    rows.append((col,rng,std,cv,is_binary,is_constant))


In [19]:
#Step3: Makinf a stat table for all the columns
stat= pd.DataFrame(rows,columns=["Feature_Name",'range','Standard_deviation','Control_Variance','Is_Binary','Is_constant'])
print(stat.head(10))

  Feature_Name     range  Standard_deviation  Control_Variance  Is_Binary  \
0         type  1.000000            0.371461          2.247046       True   
1       malice  0.981132            0.262522          0.329403      False   
2      generic  0.916667            0.139817          0.372926      False   
3       trojan  0.760870            0.153713          0.379204      False   
4   ransomware  0.307692            0.018392          4.019294      False   
5         worm  0.590000            0.018242          2.320262      False   
6     backdoor  0.290323            0.031946          2.984910      False   
7      spyware  0.021277            0.002168          7.863711      False   
8      rootkit  0.307692            0.025180          4.503935      False   
9    encrypter  0.312500            0.062841          0.959914      False   

   Is_constant  
0        False  
1        False  
2        False  
3        False  
4        False  
5        False  
6        False  
7        False  

In [20]:
#Step 4: Fing the median of the standard deviation for columns that are not binary and not constant
valid_values=[]
for i in range(len(stat)):
    is_bina= bool(stat.loc[i,"Is_Binary"])
    is_const= bool(stat.loc[i,"Is_constant"])
    current_std= float(stat.loc[i,"Standard_deviation"])
    if (is_bina is False) and (is_const is False):
        valid_values.append(current_std)
median_std= float(np.median(valid_values))
print("Median STD for non constant and non binary features:",median_std)
#this tells us a baseline for the std for the columns which will tell us which features are unusually small or large

Median STD for non constant and non binary features: 0.04739339451481228


In [21]:
recommend_scale_list=[]

for i in range(len(stat)):
    feat_name= stat.loc[i,"Feature_Name"]
    feat_range= stat.loc[i,"range"]
    feat_std= stat.loc[i,"Standard_deviation"]
    feat_cv= stat.loc[i,"Control_Variance"]
    is_bin= stat.loc[i,"Is_Binary"]
    is_consta= stat.loc[i,"Is_constant"]
    rec= False
    #code to find if it is recomended to scale it or not
    if(is_bin == False) and (is_consta == False):
        range_is_big= (feat_range>10000)
        std_is_big= (feat_std>100* median_std)
        if isinstance(feat_cv,float) or isinstance(feat_cv,int):
            cv_is_big= (feat_cv>1.5)
        else:
            cv_is_big=False
    
        if range_is_big or std_is_big or cv_is_big:
            rec=True
    recommend_scale_list.append(rec)

In [22]:
#adding the result to the created stats dataframe
stat["recommended_scale"]= recommend_scale_list
print(stat[["Feature_Name","recommended_scale"]].head(20))

   Feature_Name  recommended_scale
0          type              False
1        malice              False
2       generic              False
3        trojan              False
4    ransomware               True
5          worm               True
6      backdoor               True
7       spyware               True
8       rootkit               True
9     encrypter              False
10   downloader               True
11        label              False


In [24]:
#Now this tells me what numerical columns require standard scaling so i will apply standard scaling to these columns
sclalable= stat.loc[stat["recommended_scale"],"Feature_Name"].tolist()
scaler=StandardScaler()
df[sclalable]= scaler.fit_transform(df[sclalable])
print(df.sample(5))

      type                                               hash    malice  \
3313     0  f5ad44fd014d6bb2ab08463d114d0ea3f6149abc542e00...  0.924528   
9420     1  bbb7f497a39ba38e527591f7fed577f648f5832b7dbc36...  0.684211   
7445     0  21093c7239b97eb3c354847ac3450d5358d398f8168f16...  0.850000   
1289     0  9cf3a137622d89d8279a9bccefa765670fe31f4cbfb9be...  0.924528   
4254     0  330b8404aaaa3ced315b9dd233bec33b2631c97d2ce3cc...  0.954128   

       generic    trojan  ransomware      worm  backdoor   spyware   rootkit  \
3313  0.319149  0.574468     -0.2488  0.735397 -0.335018 -0.127166 -0.222028   
9420  0.571429  0.400000     -0.2488 -0.430986 -0.335018 -0.127166 -0.222028   
7445  0.483871  0.516129     -0.2488 -0.430986 -0.335018 -0.127166 -0.222028   
1289  0.333333  0.537037     -0.2488  0.584200 -0.335018 -0.127166 -0.222028   
4254  0.367347  0.510204     -0.2488  0.687790 -0.335018 -0.127166 -0.222028   

      encrypter  downloader  label  
3313   0.085106   -0.510465    

In [None]:
#Dropping hash column as it doesnt help in prediction
df=df.drop(columns=['hash'])


Unnamed: 0,type,malice,generic,trojan,ransomware,worm,backdoor,spyware,rootkit,encrypter,downloader,label
0,0,0.883721,0.428571,0.428571,1.304633,0.352157,-0.335018,-0.127166,-0.222028,0.085714,-0.299107,1
1,0,0.900763,0.527027,0.378378,1.220664,-0.430986,0.087992,-0.127166,-0.222028,0.040541,-0.310532,1
2,0,0.925373,0.178571,0.5,8.165629,0.221633,-0.335018,-0.127166,-0.222028,0.047619,1.07472,1
3,0,0.863636,0.453125,0.546875,-0.2488,-0.430986,-0.335018,-0.127166,-0.222028,0.0,-0.510465,1
4,0,0.903704,0.561644,0.438356,-0.2488,-0.430986,-0.335018,-0.127166,-0.222028,0.0,-0.510465,1


In [26]:

df.sample(5)

Unnamed: 0,type,malice,generic,trojan,ransomware,worm,backdoor,spyware,rootkit,encrypter,downloader,label
1299,0,0.893204,0.358491,0.471698,-0.2488,0.603354,-0.335018,-0.127166,-0.222028,0.09434,0.326991,1
4183,0,0.934579,0.333333,0.518519,-0.2488,0.5842,-0.335018,-0.127166,-0.222028,0.12963,-0.510465,1
508,0,0.0,0.0,0.0,-0.2488,-0.430986,-0.335018,-0.127166,-0.222028,0.0,-0.510465,0
4283,0,0.941176,0.405063,0.455696,0.43943,0.262938,-0.335018,-0.127166,-0.222028,0.113924,-0.510465,1
7425,0,0.917293,0.457143,0.314286,-0.2488,-0.430986,0.112164,-0.127166,-0.222028,0.214286,-0.510465,1


In [28]:
df.to_csv("Cleaned_combined_malware_dataset.csv")