# Importing the required modules

In [52]:
# Module used for handling the file system.
import os

# Modules used for data handling and linear algebra.
import numpy as np
import pandas as pd

# Modules used for python utilities
from collections import Counter
import random as r

# Modules used for data visualisation.
import matplotlib.pyplot as plt
import matplotlib.colors as clr
import seaborn as sns
sns.set_style("whitegrid")

# Modules used for data preprocessing.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Module used for test-train split
from sklearn.model_selection import train_test_split

# Module used for cross validation
from sklearn.model_selection import cross_val_score

# Modules used for model building.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import tree

# Module used for model evaluation.
from sklearn import metrics


# Dataset Preprocessing

In [3]:
class DatasetStructurer:
    
    def __init__(self, dataset_dir : str) -> None:
        self.PARENT = dataset_dir
        self.nt = None
        self.cnt = None
        self.numerical = {"discrete":[], "continuous":[]}
        self.categorical = []
        self.labels = []
        
    def combine_all_csvs(self):
        all_traces = [pd.read_csv(os.path.join(self.PARENT, file)) for file in os.listdir(self.PARENT)]
        self.nt = pd.concat(all_traces, ignore_index=True)
        files = os.listdir(self.PARENT)
        self.labels = [[files[i].split("_")[0]]*len(all_traces[i]) for i in range(len(files))]
        temp = []
        for i in self.labels:
            temp += i
        self.nt["type"] = temp


## Initializing the path for the data

In [4]:
ds = DatasetStructurer("../data/network-data/")

## Combine all the CSV files into a single file

In [5]:
ds.combine_all_csvs()

## Glimpse into the unprocessed dataset

In [6]:
ds.nt.head()

Unnamed: 0,time,proto,data_len,ip_src,ip_dst,src_port,dst_port,type
0,1551439000.0,17,58,192.168.1.149,192.168.1.1,52835,53,bulk
1,1551439000.0,17,230,192.168.1.1,192.168.1.149,53,52835,bulk
2,1551439000.0,6,603,192.168.1.149,80.249.99.148,51850,80,bulk
3,1551439000.0,6,1448,80.249.99.148,192.168.1.149,80,51850,bulk
4,1551439000.0,6,1448,80.249.99.148,192.168.1.149,80,51850,bulk


## Segreggating columns

In [7]:
categorical = ["proto","ip_src","ip_dst", "src_port", "dst_port", "type"]
ds.nt  = ds.nt.drop(["time"],axis=1)
numerical = list(set(ds.nt.columns).difference(categorical))

## Standardization of the numerical features

In [8]:
scaler = StandardScaler()
scaler.fit(ds.nt[numerical])
ds.nt[numerical] = scaler.transform(ds.nt[numerical])

## Glimpse into the dataset after Standardization

In [9]:
ds.nt.head()

Unnamed: 0,proto,data_len,ip_src,ip_dst,src_port,dst_port,type
0,17,-6.704513,192.168.1.149,192.168.1.1,52835,53,bulk
1,17,-5.852251,192.168.1.1,192.168.1.149,53,52835,bulk
2,6,-4.004032,192.168.1.149,80.249.99.148,51850,80,bulk
3,6,0.182952,80.249.99.148,192.168.1.149,80,51850,bulk
4,6,0.182952,80.249.99.148,192.168.1.149,80,51850,bulk


## Label Encoding for the categorical features

In [10]:
for i in categorical:
    enc = LabelEncoder()
    enc.fit(ds.nt[i])
    ds.nt[i] = enc.transform(ds.nt[i])

## Glimpse into the dataset post Label Encoding

In [11]:
ds.nt.head()

Unnamed: 0,proto,data_len,ip_src,ip_dst,src_port,dst_port,type
0,1,-6.704513,536,502,4009,2,0
1,1,-5.852251,529,504,2,4037,0
2,0,-4.004032,536,1159,3774,5,0
3,0,0.182952,1216,504,5,3802,0
4,0,0.182952,1216,504,5,3802,0


## Test-Train Split

In [12]:
X_train,X_test,y_train,y_test = train_test_split(ds.nt,
    ds.nt["type"],
    test_size = 0.1,
    train_size=0.90,
    random_state = 0
)

In [13]:
temp = X_train.pop("type")
temp = X_test.pop("type")

# Exploring tree based models with 90-10 test-train split

## DecisionTreeClassifier

### Initializing the classifier

In [46]:
clf = DecisionTreeClassifier()

### Fitting the model

In [47]:
clf = clf.fit(X_train,y_train)

### Predictions over the test dataset

In [48]:
y_pred = clf.predict(X_test)

### Accuracy score for the classifier

In [49]:
print("Accuracy on Test Dataset :",metrics.accuracy_score(y_test, y_pred))
print("Accuracy on Train Dataset :",metrics.accuracy_score(y_train, clf.predict(X_train)))

Accuracy on Test Dataset : 0.9997062442259961
Accuracy on Train Dataset : 0.999999842321109


### Parameters of the classifier

In [59]:
clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

### Classification report for the classifier's predictions

In [50]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    586800
           1       0.83      0.82      0.83       186
           2       1.00      1.00      1.00      7508
           3       1.00      1.00      1.00     95690
           4       0.99      0.99      0.99     14483

    accuracy                           1.00    704667
   macro avg       0.96      0.96      0.96    704667
weighted avg       1.00      1.00      1.00    704667



We can observe that the imbalance in the dataset makes the model more biased to all classes except class 1 due to a less value of support.

### 5-fold cross-validation

In [51]:
scores = cross_val_score(clf, ds.nt.drop(["type"], axis=1), ds.nt["type"], cv=5)
print("5-fold cross validation :", np.mean(scores))

5-fold cross validation : 0.8419264702334578


## RandomForestClassifier

### Initializing the RandomForestClassifier

In [29]:
clf = RandomForestClassifier(n_estimators=10)

### Fitting the classifier

In [30]:
clf.fit(X_train,y_train)

RandomForestClassifier(n_estimators=10)

### Accuracy on train dataset

In [31]:
y_pred_train = clf.predict(X_train)
metrics.accuracy_score(y_train,y_pred_train)

0.9999847051475693

### Accuracy on test dataset

In [32]:
y_pred_test = clf.predict(X_test)
metrics.accuracy_score(y_test,y_pred_test)

0.999743141086499

### Classification report for classifier's predictions

In [33]:
print(metrics.classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    586800
           1       0.85      0.81      0.83       186
           2       1.00      1.00      1.00      7508
           3       1.00      1.00      1.00     95690
           4       0.99      1.00      0.99     14483

    accuracy                           1.00    704667
   macro avg       0.97      0.96      0.96    704667
weighted avg       1.00      1.00      1.00    704667



- We can infer that the results for the class 1 have improved by the Random Forest model compared to its base model, Decision Tree model. 
- Other classes are unaffected.

### 5-fold cross validation

In [39]:
scores = cross_val_score(clf, ds.nt.drop(["type"], axis=1), ds.nt["type"], cv=5)
print("5-fold cross validation :", np.mean(scores))

5-fold cross validation : 0.8549927838255517


## ExtraTreesClassifier

### Initializing the classifier

In [40]:
clf = ExtraTreesClassifier(n_estimators=10)

### Fitting the classifier

In [41]:
clf.fit(X_train, y_train)

ExtraTreesClassifier(n_estimators=10)

### Accuracy on train dataset

In [42]:
y_pred_train = clf.predict(X_train)
metrics.accuracy_score(y_train,y_pred_train)

0.999999842321109

### Accuracy on test dataset

In [43]:
y_pred_test = clf.predict(X_test)
metrics.accuracy_score(y_test,y_pred_test)

0.9997530748566344

### Classification report for the classifier's predictions

In [44]:
print(metrics.classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    586800
           1       0.88      0.84      0.86       186
           2       1.00      1.00      1.00      7508
           3       1.00      1.00      1.00     95690
           4       0.99      1.00      0.99     14483

    accuracy                           1.00    704667
   macro avg       0.97      0.97      0.97    704667
weighted avg       1.00      1.00      1.00    704667



- We can infer that the results for the class 1 have improved by the Extra Trees model compared to the previous models.
- Other classes are unaffected.

### 5-fold cross-validation

In [45]:
scores = cross_val_score(clf, ds.nt.drop(["type"], axis=1), ds.nt["type"], cv=5)
print("5-fold cross validation :", np.mean(scores))

5-fold cross validation : 0.8185778530852161


# Conclusions

- For the available dataset we have performed Standardization and Label Encoding for the numerical and categorical features respectively. This preprocessing ensures that the dataset has uniformity across every feature.
- We explore Machine Learning models like Decision Trees and Ensemble models like Random Forest and Extra Trees. We observe that the base Decision Tree model give greater than 99% on both test and train datasets. However, since we have a large number of entries for each class in the dataset, Cross Validation becomes very important.
- Below is the table of results for each tree-based classifier:
<table>
  <tr>
    <th> Model </th>
    <th> Category </th>
    <th> Type of model </th>
    <th> Accuracy on train dataset </th>
    <th> Accuracy on test dataset </th>
    <th> Accuracy on 5-fold cross validation </th>
  </tr>
  <tr>
    <td> Decision Tree </td>
    <td> Tree-based </td>
    <td> Base Model </td>
    <td> 99.999% </td>
    <td> 99.970% </td>
    <td> 84.192% </td>
  <tr>
  <tr>
    <td> Random Forest </td>
    <td> Tree-based </td>
    <td> Ensemble Model </td>
    <td> 99.999% </td>
    <td> 99.974% </td>
    <td> 85.499% </td>
  <tr>
  <tr>
    <td> Extra Trees </td>
    <td> Tree-based </td>
    <td> Ensemble Model </td>
    <td> 99.999% </td>
    <td> 99.975% </td>
    <td> 81.857% </td>
  </tr>
</table>

- Based on the results in the above table, we shall choose the Random Forest Classifier because its cross validation accuracy is significantly higher that others. However, we can observe that there is a significant amount of overfitting in the model because there is a large different between the accuracies of before and after cross validation. This could also indicate that we shall use a higher value of K for the dataset.