# Information Gain
for reducing number of features in dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('scdataset_edited.csv') 
df.head()

Unnamed: 0.1,Unnamed: 0,label,bytecode_len,Weight bytecode_character_6,Weight bytecode_character_0,Weight bytecode_character_8,Weight bytecode_character_4,Weight bytecode_character_5,Weight bytecode_character_2,Weight bytecode_character_1,...,Weight bytecode_character_P,bytecode_character_k,bytecode_character_P,Weight bytecode_character_g,bytecode_character_g,Weight bytecode_character_I,Weight bytecode_character_m,bytecode_character_I,bytecode_character_m,Weight bytecode_character_x
0,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,117,1,8370,0.059498,0.151732,0.036201,0.013381,0.055078,0.039904,0.069534,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,120,1,2586,0.069606,0.176334,0.036736,0.030936,0.064192,0.020495,0.080046,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,193,1,7002,0.068695,0.18409,0.048272,0.017566,0.069123,0.043559,0.088546,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Function to calculate entropy
def entropy(y):
    value_counts = np.bincount(y)
    probabilities = value_counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities + 1e-9))  

In [4]:
# Function to calculate information gain
def information_gain(data, feature, label):
    # Calculate total entropy before the split
    total_entropy = entropy(data[label])
    
    # Calculate values and their probabilities for the feature
    values = data[feature].unique()
    weighted_entropy = 0
    
    for value in values:
        subset = data[data[feature] == value]
        subset_entropy = entropy(subset[label])
        weighted_entropy += (len(subset) / len(data)) * subset_entropy
    
    # Information Gain is total entropy minus weighted entropy
    return total_entropy - weighted_entropy

In [5]:
# Step 2: Calculate Information Gain for each feature
label_column = 'label'  
features = df.columns.drop(label_column)

info_gains = {feature: information_gain(df, feature, label_column) for feature in features}

In [6]:
# Step 3: Sort features by Information Gain and select features above a threshold
threshold = 0.1  # Set your threshold for selecting features
selected_features = [feature for feature, gain in info_gains.items() if gain > threshold]

In [7]:
IG_df = df[[label_column] + selected_features]  

In [8]:
print(IG_df)

       label  Unnamed: 0  bytecode_len  Weight bytecode_character_6  \
0          1           1             0                     0.000000   
1          1           2             0                     0.000000   
2          1         117          8370                     0.059498   
3          1         120          2586                     0.069606   
4          1         193          7002                     0.068695   
...      ...         ...           ...                          ...   
36666      0       36590          8372                     0.059603   
36667      0       36608           240                     0.075000   
36668      0       36610          1074                     0.081006   
36669      0       36646          1074                     0.082868   
36670      0       36659          6774                     0.064068   

       Weight bytecode_character_0  Weight bytecode_character_8  \
0                         0.000000                     0.000000   
1            

In [9]:
IG_df.shape

(36671, 76)

In [10]:
IG_df['label'].value_counts()

label
0    26915
1     9756
Name: count, dtype: int64

In [11]:
x = IG_df.drop('label', axis=1)
y = IG_df['label']

# PCA

In [12]:
from sklearn.decomposition import PCA
pca = PCA(n_components=70)  
X_pca = pca.fit_transform(x)

In [13]:
pca_df = pd.DataFrame(data=X_pca, columns=x.columns[:70]) 
pca_df.insert(0, 'label', y.values)

In [14]:
pca_df.to_csv('scdataset_igthenpca.csv', index=False)

# SMOTE function
increasing datas with label 1 (vulnerable  datas) from 10000 to 26000

In [15]:
import pandas as pd
from imblearn.over_sampling import SMOTE

In [16]:
df_igthenpca=pd.read_csv('scdataset_igthenpca.csv') 

In [17]:
X = df_igthenpca.drop(columns=['label'])  
y = df_igthenpca['label']   

In [18]:
desired_count = 26000  
current_count = sum(y == 1)  
sampling_strategy = {1: desired_count}  

In [19]:
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [20]:
augmented_df = pd.DataFrame(X_resampled, columns=X.columns)
augmented_df['label'] = y_resampled

In [21]:
augmented_df = augmented_df[['label'] + [col for col in augmented_df.columns if col != 'label']]

In [22]:
augmented_df.to_csv('scdataset_igthenpca_augmented.csv', index=False)

In [23]:
smote_df = pd.read_csv('./scdataset_igthenpca_augmented.csv')
smote_df.head()

Unnamed: 0.1,label,Unnamed: 0,bytecode_len,Weight bytecode_character_6,Weight bytecode_character_0,Weight bytecode_character_8,Weight bytecode_character_4,Weight bytecode_character_5,Weight bytecode_character_2,Weight bytecode_character_1,...,Opcode weight DUP5,Opcode weight DUP6,Opcode weight DUP7,Opcode weight DUP8,Opcode weight DUP9,Opcode weight SWAP1,Opcode weight SWAP2,Opcode weight SWAP3,Opcode weight SWAP4,Opcode weight LOG1
0,1,-18584.541122,-3768.024353,-139.440572,-87.3025,12.654108,-1.993456,-0.025376,0.006328,-0.00111,...,-6e-06,2e-06,-1.2e-05,7e-06,4.6e-05,6e-06,-2e-06,4.092177e-07,-4.6e-05,3e-06
1,1,-18583.757936,-3768.362461,-153.432506,-88.008081,12.670094,-1.991639,-0.025707,0.006324,-0.001117,...,-7e-06,3e-06,-1.3e-05,7e-06,4.6e-05,7e-06,-3e-06,2.718765e-07,-4.6e-05,3e-06
2,1,-17919.360735,5450.035355,1095.660653,1282.119285,102.029856,0.037189,0.095841,0.041233,0.009434,...,-0.000127,0.000148,8e-06,3e-06,4e-06,-4.8e-05,0.000103,7.515686e-05,-6.2e-05,7.8e-05
3,1,-18289.524987,-971.859776,915.011112,123.200352,-6.563664,0.785214,0.189752,0.030554,-0.03311,...,0.000321,9e-06,-0.000353,0.000243,0.001821,-0.000328,0.001025,0.001108894,0.000193,0.001667
4,1,-17951.709027,3753.622539,427.4518,278.449355,-19.401829,0.466487,0.092458,0.021495,0.006596,...,0.000395,-0.000592,2.1e-05,0.000605,-0.000448,-0.000108,0.000144,0.0003200167,1.4e-05,0.00015


In [24]:
print(smote_df)

       label    Unnamed: 0  bytecode_len  Weight bytecode_character_6  \
0          1 -18584.541122  -3768.024353                  -139.440572   
1          1 -18583.757936  -3768.362461                  -153.432506   
2          1 -17919.360735   5450.035355                  1095.660653   
3          1 -18289.524987   -971.859776                   915.011112   
4          1 -17951.709027   3753.622539                   427.451800   
...      ...           ...           ...                          ...   
52910      1  15152.323216  -3977.201862                  -622.266277   
52911      1  13528.703087  -5347.820464                   436.060097   
52912      1  14564.367943  -5278.885470                   672.199065   
52913      1  15489.574926  -3994.678260                  -567.860779   
52914      1  16100.423206  13157.346995                  -244.214698   

       Weight bytecode_character_0  Weight bytecode_character_8  \
0                       -87.302500                    12

In [25]:
smote_df.shape

(52915, 71)

In [26]:
smote_df['label'].value_counts()

label
0    26915
1    26000
Name: count, dtype: int64