In [None]:
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
import matplotlib.pyplot as plt

# Data
This data was obtained from https://www.kaggle.com/c/malware-detection<br>
To evaluate this workbook you will need access to the data. Log in to or create an account with Kaggle (or log in via Google) to access the link. Accept the terms and conditions of the competition and click the `Download All` button in the `Data` tab.<br>
Place the zip file in the root folder of this repository. (The next cell will extract it into the correct location)
<br><br>
__Data Description__ _from Kaggle_<br>
The raw data here was obtained from the malware security partner of Meraz'18 - Annual Techno Cultural festival of IIT Bhilai, the said raw data constituted malware and legitimate files.

Malware represents software which is specifically designed to disrupt, damage, or gain unauthorised access to a computer system. Legitimate files are software that don't behave like malware and are useful and harmless to the users.

Statistical analysis was done on these files which mainly constituted the extraction of PE information and calculation of entropy of different sections of these files.

More data might be made public as the competition progresses to incorporate newly discovered zero day viruses to check the robustness of your algorithm. Doing this will also help you get an experience of the pressure under which these anti-malware software giants like Max Secure Software work to provide uninterrupted protection.

In [None]:
# Extract the Kaggle archive into ../data unless that folder already exists.
data_dir = Path("../data")
if not data_dir.exists():
    z_path = Path("../malware-detection.zip")
    # Verify the archive is present *before* creating the folder; otherwise a
    # missing zip would leave an empty ../data behind and every later run
    # would skip extraction because the folder already exists.
    if not z_path.is_file():
        raise FileNotFoundError(
            f"Expected the Kaggle archive at {z_path.resolve()}; "
            "download it and place it in the repository root."
        )
    data_dir.mkdir()
    # The context manager closes the archive handle even if extraction fails.
    with ZipFile(file=z_path) as archive:
        archive.extractall(path=data_dir)

In [None]:
# Explicit per-column dtypes for the CSV load (this helps the dataframe load
# faster). Columns are grouped by target type rather than listed in file order.
dtypes = {
    # Text columns
    **dict.fromkeys(['md5', 'Machine'], str),
    # Floating-point columns
    **dict.fromkeys(
        [
            'MajorLinkerVersion',
            'ImageBase',
            'SectionsMeanEntropy',
            'SectionsMinEntropy',
            'SectionsMaxEntropy',
            'SectionsMeanRawsize',
            'SectionsMeanVirtualsize',
            'SectionsMinVirtualsize',
            'ResourcesMeanEntropy',
            'ResourcesMinEntropy',
            'ResourcesMaxEntropy',
            'ResourcesMeanSize',
            'ResourcesMinSize',
            'Unnamed: 57',
        ],
        float,
    ),
    # Integer columns
    **dict.fromkeys(
        [
            'ID',
            'SizeOfOptionalHeader',
            'Characteristics',
            'MinorLinkerVersion',
            'SizeOfCode',
            'SizeOfInitializedData',
            'SizeOfUninitializedData',
            'AddressOfEntryPoint',
            'BaseOfCode',
            'BaseOfData',
            'SectionAlignment',
            'FileAlignment',
            'MajorOperatingSystemVersion',
            'MinorOperatingSystemVersion',
            'MajorImageVersion',
            'MinorImageVersion',
            'MajorSubsystemVersion',
            'MinorSubsystemVersion',
            'SizeOfImage',
            'SizeOfHeaders',
            'CheckSum',
            'Subsystem',
            'DllCharacteristics',
            'SizeOfStackReserve',
            'SizeOfStackCommit',
            'SizeOfHeapReserve',
            'SizeOfHeapCommit',
            'LoaderFlags',
            'NumberOfRvaAndSizes',
            'SectionsNb',
            'SectionsMinRawsize',
            'SectionMaxRawsize',
            'SectionMaxVirtualsize',
            'ImportsNbDLL',
            'ImportsNb',
            'ImportsNbOrdinal',
            'ExportNb',
            'ResourcesNb',
            'ResourcesMaxSize',
            'LoadConfigurationSize',
            'VersionInformationSize',
            'legitimate',
        ],
        int,
    ),
}

In [None]:
# Load the extracted Kaggle CSV using the explicit column dtypes in `dtypes`.
kdata = pd.read_csv(
    filepath_or_buffer=data_dir / "Kaggle-data.csv",
    dtype=dtypes,
)

In [None]:
# Extract the input data and output data from the dataset.
# The positional slice drops the first three columns and the last two
# (presumably ID / md5 / Machine and legitimate / 'Unnamed: 57', going by the
# dtypes listing - verify against the CSV header).
# `.copy()` makes X an independent frame, so later clean-up of X never operates
# on a slice of `kdata` (avoids SettingWithCopyWarning).
X = kdata.iloc[:, 3:-2].copy()
Y = kdata["legitimate"]

In [None]:
# Replace any `N/A`s with `0`. This prevents errors in training.
# Rebinding to the returned frame (rather than using inplace=True) avoids
# mutating a frame derived from a slice of `kdata`, which can raise
# SettingWithCopyWarning, and keeps this cell safe to re-run.
X = X.fillna(0)

In [None]:
X  # Preview of the full feature matrix (the train/test split has not happened yet)

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Random forest classifier: verbose=1 prints training progress, n_jobs=4 builds
# trees on four workers. random_state is fixed (same seed as the train/test
# split) so the accuracy, feature importances and confusion counts are
# reproducible across Restart & Run All.
model = RFC(verbose=1, n_jobs=4, random_state=42)

In [None]:
# Train the forest on the training portion of the split.
model.fit(X=X_train, y=y_train)

### Validation
Testing out the model on unseen data

In [None]:
# Evaluate the fitted model on the held-out rows.
model.score(X=X_test, y=y_test)

### Feature Importance
This plot shows the top 10 most important features that the model used to determine legitimacy

In [None]:
# Pair every feature name with its importance from the fitted forest and
# rank the pairs from most to least important.
feature_importances = sorted(
    zip(X.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
# Split the ten highest-ranked pairs into parallel tuples for plotting.
label, score = zip(*feature_importances[:10])

# Explicit Axes plus title/axis labels so the figure stands on its own.
fig, ax = plt.subplots()
ax.bar(label, score)
ax.set_title(f"Top {len(label)} feature importances")
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
# Feature names are long; rotate them so they do not overlap.
ax.tick_params(axis="x", labelrotation=90, labelsize=10)
plt.show()

In [None]:
# Predicted labels for the held-out rows.
y_predict = model.predict(X=X_test)

In [None]:
# Confusion-matrix counts, with label 1 ("safe") treated as the positive class.
predicted_unsafe = y_predict == 0
actually_unsafe = y_test == 0

true_positives = (y_predict * y_test).sum()
true_negatives = (predicted_unsafe * actually_unsafe).sum()
false_positives = (y_predict * actually_unsafe).sum()
false_negatives = (predicted_unsafe * y_test).sum()

print(f"True Positives: {true_positives} (correctly labeled safe)")
print(f"True Negatives: {true_negatives} (correctly labeled unsafe)")
print(f"False Positives: {false_positives} (incorrectly labeled safe)")
print(f"False Negatives: {false_negatives} (incorrectly labeled unsafe)")