# Partitioning

We will use two different datasets:
1. Contagio dataset: `Contagio` folder
2. CIC Evasive PDF Malware 2022: `CIC` folder

Load the libraries:

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [3]:
# Locations of the input CSV files: four Contagio result tables plus the CIC table.
clean_9000, clean_109 = (
    './Contagio/result_clean_pdf_9000_files.csv',
    './Contagio/result_clean_pdf_109_embed_video.csv',
)
malw_173, malw_10982 = (
    './Contagio/result_malware_pdf_cve_sorted_173_files.csv',
    './Contagio/result_malware_pdf_pre_04-2011_10982_files.csv',
)
cic_dataset = './CIC/PDFMalware2022.csv'

# Read each file into its own DataFrame (comma-separated), in the order listed below.
df_clean_9000, df_clean_109, df_malw_173, df_malw_10982, df_cic = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (clean_9000, clean_109, malw_173, malw_10982, cic_dataset)
)

Data coming from the Contagio dataset need to be put together and the class label has to be added. Then we split the dataset into training and test sets.

In [4]:
# Stack the two clean tables and label every row with malware=False.
df_clean = pd.concat([df_clean_9000, df_clean_109], axis=0, ignore_index=True)
df_clean["malware"] = np.full(len(df_clean), False, dtype=bool)

# Stack the two malware tables and label every row with malware=True.
df_malw = pd.concat([df_malw_10982, df_malw_173], axis=0, ignore_index=True)
df_malw["malware"] = np.full(len(df_malw), True, dtype=bool)

# Merge both classes into one frame and shorten the '/Colors > 2^24' header to '/Colors'.
df_contagio = (
    pd.concat([df_clean, df_malw], axis=0, ignore_index=True)
    .rename(columns={'/Colors > 2^24': '/Colors'})
)
# Features are every column except the last; the label is the last column,
# kept as a one-column frame (expected to be 'malware' -- it is also the stratification key).
X_tot, y_tot = df_contagio.iloc[:, :-1], df_contagio.iloc[:, -1:]

# Shuffled 80/20 split, stratified on the class label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_tot,
    y_tot,
    test_size=0.2,
    shuffle=True,
    stratify=df_contagio['malware'],
    random_state=0,
)

# Persist the splits as .npy files; label frames are flattened to 1-D arrays first.
np.save('./Contagio/X_train.npy', X_train)
np.save('./Contagio/X_test.npy', X_test)
np.save('./Contagio/y_train.npy', y_train.to_numpy().ravel())
np.save('./Contagio/y_test.npy', y_test.to_numpy().ravel())
np.save('./Contagio/X_tot.npy', X_tot)

# Keep the complete labelled frame as well (pickled DataFrame).
df_contagio.to_pickle('./Contagio/df_tot.pandas')

Data coming from the CIC dataset are modified to be fully compatible with the data coming from the Contagio dataset: only the features produced by PDFiD are kept.

In [5]:
# Clean the CIC frame so it can be used like the Contagio one:
# a boolean 'malware' label plus integer feature columns.
df_cic.rename(columns={'Class':'malware'}, inplace=True)
# Turn the textual class into booleans (any other value is left untouched here).
df_cic.loc[df_cic['malware'] == 'Malicious', 'malware'] = True
df_cic.loc[df_cic['malware'] == 'Benign', 'malware'] = False
# Remove the columns that are not kept for the Contagio-compatible feature set.
df_cic = df_cic.drop(columns=['Fine name','pdfsize','metadata size','pages','xref Length','title characters','isEncrypted','embedded files','images','text','header'])
# Drop rows with NaN values
df_cic = df_cic.dropna()
# Drop rows with a negative '/Colors' value
df_cic = df_cic.drop(df_cic[df_cic['/Colors'] < 0].index)
# Replace nA(nB) into nB -> e.g. 1(1) with 1, 55(3) with 3
# Raw string: '\(' inside a normal string literal is an invalid escape sequence
# (DeprecationWarning on older Pythons, SyntaxWarning from Python 3.12).
pattern = re.compile(r"[0-9]+\([0-9]+\)")


def _unwrap_parenthesised(value):
    """Return the number inside the parentheses of an 'A(B)' value as int; pass anything else through."""
    text = str(value)
    if pattern.match(text):
        # Take what follows the first '(' and strip the closing ')'.
        return int(text.split("(")[1].strip(")"))
    return value


for col in df_cic.columns:
    df_cic[col] = df_cic[col].apply(_unwrap_parenthesised)
# Discard duplicate rows (currently disabled)
# df_cic = df_cic.drop_duplicates(subset=df_cic.columns.difference(['malware']))
# Convert all columns to Int (except 'malware' column, which becomes bool)
for col in df_cic.columns:
    if col == 'malware':
        df_cic[col] = df_cic[col].astype(bool)
    else:
        df_cic[col] = df_cic[col].astype(int)
# Build train and test sets.
# Features are every column except the last; the label is the last column,
# kept as a one-column frame (expected to be 'malware' -- it is also the stratification key).
X_tot, y_tot = df_cic.iloc[:, :-1], df_cic.iloc[:, -1:]

# Shuffled 80/20 split, stratified on the class label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_tot,
    y_tot,
    test_size=0.2,
    shuffle=True,
    stratify=df_cic['malware'],
    random_state=0,
)

# Persist the splits as .npy files; label frames are flattened to 1-D arrays first.
np.save('./CIC/X_train.npy', X_train)
np.save('./CIC/X_test.npy', X_test)
np.save('./CIC/y_train.npy', y_train.to_numpy().ravel())
np.save('./CIC/y_test.npy', y_test.to_numpy().ravel())
np.save('./CIC/X_tot.npy', X_tot)

# Keep the complete cleaned frame as well (pickled DataFrame).
df_cic.to_pickle('./CIC/df_tot.pandas')