# CIC Darknet 2020

We will be using the CIC-Darknet2020 dataset from the Canadian Institute for Cybersecurity (CIC). Our goal is to work with the data to categorize darknet traffic. 

Steps we will take include:
1. Load data
2. Analyze data
   1. Cleaning the data
   2. Data Analysis
3. Visualize data
4. Split data into train-test set
5. Train and test a handful of models
6. Select the best model
7. Deploy the best model


# Importing Dataset


In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import drive

To use the files in Google Colab, each person needs to follow these steps.

1. Go to 'Shared with me/Cyber Research/Darknet' in your google drive
2. Right click the 'Experiments' Folder
3. Select 'Add shortcut to Drive'

This way, the code down below will work for you too!

In [None]:
# Print the current working directory
!pwd
# Mounting your personal drive so we can use the csv file in our 'Code' folder
drive.mount('/content/drive')
# Change into the 'Code' folder on the mounted drive and list its contents
%cd '/content/drive/My Drive/Code'
%ls

In [None]:
input = pd.read_csv('Darknet.CSV')

In [None]:
df = input

# Analyzing

In [None]:
# First look at the data: display the first five rows
df.head(5)

In [None]:
# Here we can see the dimensions of the data as a (rows, columns) tuple
df.shape

In [None]:
# Showing general summary statistics for the data
df.describe()

In [None]:
# Let's see what we will be categorizing data into:
# the number of rows for each value of the 'Label' column
print(df.groupby('Label').size())

In [None]:
# The same per-value row count, this time for the 'Label1' column
print(df.groupby('Label1').size())

# Cleaning


Some values in Label1 are spelled inconsistently, so here we fix that by lower-casing them. After that, we want to relabel the classifying columns so they are easier to understand. 

In [None]:
# Lower-case every Label1 value so the same category is always spelled the same way
df['Label1'] = df.Label1.str.lower()
# Display the distinct categories that remain
df['Label1'].unique()

If a column has only one unique value, we don't need to keep that. Here is how we removed those.

In [None]:
def DropSingleValueColumns():
    """Drop, in place, every column of the notebook-level ``df`` that holds a single value.

    A column is removed when it has at most one distinct value (missing
    values count as a value). The shape of ``df`` is printed before and
    after the columns are removed.
    """
    max_distinct = 1
    print(df.shape)
    # Decide which columns carry no information first, then remove them together
    constant_columns = [
        name for name in df.columns
        if len(df[name].unique()) <= max_distinct
    ]
    df.drop(columns=constant_columns, inplace=True)
    print(df.shape)

DropSingleValueColumns()

Next, we removed duplicated rows.

In [None]:
# Remove duplicated rows in place, then show the new dimensions
df.drop_duplicates(inplace=True)
print(df.shape)

Infinite values are extreme outliers, so we want to remove them along with NULL values.


In [None]:
# Treat +inf / -inf as missing, then discard every row that has any missing value
df = df.replace([np.inf, -np.inf], np.nan).dropna()

print(df.shape)

## X and y Separation

We need different dataframes for our data and labels

In [None]:
# Binary darknet flag: rows labelled 'VPN' or 'Tor' get 1, every other label gets 0

df['Label_dark'] = df['Label'].isin(['VPN', 'Tor']).astype('int64')

In [None]:
# Keep the three label columns together as the target frame
y = df[['Label', 'Label1', 'Label_dark']]

In [None]:
# Features are every non-object column of df, which leaves the string columns out.
# Label_dark is the target, so it is removed from the features as well.

X = df.select_dtypes(exclude=object).drop(columns='Label_dark')

The data has classifications for Non-Tor or Tor combined with Non-VPN or VPN. When trying to classify what sort of traffic is in darknet, we want to combine VPN and Tor as "Darknet" with the rest being "Non-Darknet".

# Balancing

Here we can see our data is unbalanced. We can apply SMOTE (Synthetic Minority Over-sampling Technique) to overcome this. 

In [None]:
# Remove the limits on how many columns and rows pandas displays
pd.options.display.max_columns=None
pd.options.display.max_rows=None

In [None]:
# from imblearn.over_sampling import SMOTE

In [None]:
# sm = SMOTE(random_state=42)

# x_sm, y_sm = sm.fit_resample(X, y)

# print(f'''Shape of X before SMOTE: {X.shape}
# Shape of X after SMOTE: {x_sm.shape}''')

# print('\nBalance of positive and negative classes (%):')
# #y_sm.value_counts(normalize=True) * 100
# unique, counts = np.unique(y_sm, return_counts=True)

# dict(zip(unique, counts))

# Visualization 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
%matplotlib inline

In [None]:
# Creating pandas series with the number of rows for each Label_dark value

label3_count = df.Label_dark.value_counts()

# Bar chart of those counts: one bar per Label_dark value
plt.figure(figsize=(6,6))
sns.set_style('whitegrid')
plt.title('Where Traffic Happened')
sns.barplot(x=label3_count.index, y=label3_count)
plt.ylabel('Count')
plt.xlabel('Type of Traffic')

In [None]:
# Creating pandas series with the number of rows for each Label1 category
label1_count = df.Label1.value_counts()
# NOTE(review): bare expression in the middle of the cell; presumably meant to display the counts
label1_count

# Bar chart of those counts: one bar per Label1 category
plt.figure(figsize=(12,8))
sns.set_style('whitegrid')
plt.title('Categories Where Traffic Went')
sns.barplot(x=label1_count.index, y=label1_count)
plt.ylabel('Count')
plt.xlabel('Type of Traffic')

In [None]:
# Group plot of traffic depending on darknet or non-darknet:
# a count plot of Label1, split into one column per Label_dark value

sns.set_style('whitegrid')
g = sns.catplot(x='Label1', col='Label_dark',
                data=df, kind='count', 
                height=6, aspect = 1.4)
# Label the axes, title each panel with its Label_dark value,
# fix the y-axis range to 0-33000 and remove the left spine
(g.set_axis_labels("Traffic Type", "Count")
  .set_titles("{col_name}")
  .set(ylim=(0,33000))
  .despine(left=True))  

# Feature Selection

## Correlation Heatmap

In [None]:
# Correlation

# Correlate the numeric columns only: df still holds string columns
# (e.g. Label and Label1), which DataFrame.corr() rejects on pandas >= 2.0
correlation = df.select_dtypes(include='number').corr()
plt.figure(figsize =(30,30))
# Heatmap of the pairwise correlations, colour scale capped at 1
sns.heatmap(correlation, vmax=1, square=True, cmap='YlGnBu')

## $\chi^2$ Test

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import chi2

In [None]:
# Use the feature frame X as the chi-squared input and Label_dark as the target

chi_Y = y['Label_dark']

# chi2 needs non-negative inputs, so rescale every feature to the [0, 1] range
scaler = MinMaxScaler()
chi_X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# chi2 returns a pair: (test statistics, p-values)
chi_scores = chi2(chi_X, chi_Y)

# Keep the p-value of each feature, ordered from lowest to highest
p_vals = pd.Series(chi_scores[1], index=chi_X.columns).sort_values(ascending=True)

print("\t X^2 p-values")
# Display the sorted p-values
p_vals

In [None]:
# Visualize the p-values in p_vals as a horizontal bar chart, one bar per feature
plt.figure(figsize=(6, 14))
plt.title('Chi Squared Values', fontsize=16)
plt.xlabel('P-Value', fontsize=14)
plt.ylabel('Feature', fontsize=14)
# Vertical reference line at a p-value of 0.5
plt.axvline(x=0.5,linewidth=2, color='k')
sns.set_theme(style="white")
sns.barplot(x=p_vals, y=p_vals.index)

## Pearson Correlation

In [None]:
from scipy.stats import pearsonr

In [None]:
# Work on copies so the original X and y stay untouched
pears_X = X.copy(deep=True)
pears_Y = y['Label_dark'].copy(deep=True)

# Pearson r between each feature and the target, keyed by feature name
pears_dict = {
    feature: pearsonr(pears_X[feature], pears_Y)[0]
    for feature in pears_X.columns
}

# One-column frame of absolute r-values, largest first
pears_X = (
    pd.DataFrame.from_dict(pears_dict, orient='index', columns=['r-value'])
    .abs()
    .sort_values('r-value', ascending=False)
)
print('\t Pearson r-Values')
r_vals = pears_X.squeeze()
r_vals

In [None]:
# Horizontal bar chart of the absolute Pearson r-values, one bar per feature
plt.figure(figsize=(6,14))
plt.title("Absolute Values of the Pearson Correlation Results", fontsize=16)

sns.barplot(x=pears_X['r-value'], y=pears_X.index)
plt.xlabel('r-value', fontsize=14)
plt.ylabel('Feature' ,fontsize=14)

# Vertical reference line at an r-value of 0.1
plt.axvline(x=0.1,linewidth=2, color='k')

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn import tree
# from sklearn.ensemble import RandomForestClassifier
# from random import sample

In [None]:
# # Here, we are showing how each feature's importance to the random forest

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)
# importance = sorted(zip(map(lambda x: round(x, 4),
#                             rf.feature_importances_), X_train), reverse=True)

In [None]:
# # Now, we plot the importance to see which columns we want to keep

# labels = [row[1] for row in importance]
# info = [row[0] for row in importance]
# plt.figure(figsize=(8,14))
# plt.title('Feature Importance')
# sns.barplot(x = info, y= labels)

## Selecting Columns

In [None]:
# temp = df.copy(deep=True)

In [None]:
# top = 40

# top_features = [x for x in p_vals.index[:top] if x in  r_vals.index[:top]]
# print(len(top_features))

# X = temp[top_features]

# Feature Distribution

In [None]:
# data_col = np.transpose(X.columns).to_list()
# df['Label_dark'] = df['Label_dark'].apply(lambda x: 'Darknet' if x == 1 else 'Non-Darknet')

In [None]:
# for col in data_col:
#   g = sns.displot(data=df, x=col, col='Label_dark',bins=30, height=4, aspect=1.4)
#   g.set_titles('{col_name}')
#   g.tight_layout()

# Train vs. Test

In [None]:
# Save the feature frame and the label frame as CSV files in the Drive 'Code' folder,
# leaving the index out of the files
X.to_csv(r'/content/drive/My Drive/Code/cleaned_X.csv', index=False)
y.to_csv(r'/content/drive/My Drive/Code/cleaned_y.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)


In [None]:
# Keep only the binary Label_dark column as the target for both splits
y_train, y_test = y_train['Label_dark'], y_test['Label_dark']

Build and Evaluate Models

In [None]:
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
def runExperiments(X_train, Y_train):
    """Cross-validate a fixed set of baseline classifiers and print their accuracy.

    Every model is scored with shuffled 10-fold cross-validation on the
    given training data, and one summary line per model is printed in the
    form ``name: mean (std)``.

    Parameters
    ----------
    X_train
        Feature matrix handed to ``cross_val_score``.
    Y_train
        Target labels for the rows of ``X_train``.

    Returns
    -------
    results : list
        Per-fold accuracy arrays, one per model, in evaluation order.
    accuracies : list
        ``(mean_accuracy, name)`` tuples, one per model.
    classifiers : list
        Short model names, in evaluation order.
    """
    random_seed = 100
    scoring = 'accuracy'
    # multi_class is left at its default: the original passed 'auto', which is
    # already the default, and recent scikit-learn releases deprecate the argument
    models = [
        ('LR', LogisticRegression(solver='liblinear')),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC(gamma='auto')),
        ('AB', AdaBoostClassifier(n_estimators=100)),
        ('DT', DecisionTreeClassifier()),
        ('RF', RandomForestClassifier(n_estimators=10)),
    ]

    # shuffle=True is required for random_state to have any effect; current
    # scikit-learn raises ValueError when a seed is given with shuffle=False.
    # The splitter does not depend on the model, so it is built once.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=random_seed)

    # evaluate each model in turn using 10-fold cross-validation
    results = []
    classifiers = []
    accuracies = []
    for classifier, model in models:
        cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
        accuracies.append((cv_results.mean(), classifier))
        results.append(cv_results)
        classifiers.append(classifier)
        print(f"{classifier}: {cv_results.mean():f} ({cv_results.std():f})")
    return results, accuracies, classifiers

In [None]:
#results, accuracies, classifiers = runExperiments(X_train, y_train)


# Classification

## Steps
1. Initialize the classifier
2. Train the classifier
3. Predict the target
4. Evaluate, find error

In [None]:
# Running tallies for the results chart: each model's cell appends its
# test accuracy to `accuracy` and its display name to `tests`
accuracy = []
tests = []

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

lr = LogisticRegression(random_state=0, solver='sag')
lr.fit(X_train, y_train)

lr_pred = lr.predict(X_test)

In [None]:
accuracy.append(metrics.accuracy_score(y_test, lr_pred))
tests.append('Logistical Regression')

print('Accuracy of training data', lr.score(X_train, y_train))
print('Accuracy of testing data ', metrics.accuracy_score(y_test, lr_pred), end='\n\n')
print(metrics.confusion_matrix(y_test, lr_pred), end='\n\n')
print(metrics.classification_report(y_test, lr_pred))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train , y_train)

rf_pred = rf.predict(X_test)  

In [None]:
accuracy.append(metrics.accuracy_score(y_test, rf_pred))
tests.append('Random Forest')

print('Accuracy of training data', rf.score(X_train, y_train))
print('Accuracy of testing data ', metrics.accuracy_score(y_test, rf_pred), end='\n\n')
print(metrics.confusion_matrix(y_test, rf_pred), end='\n\n')
print(metrics.classification_report(y_test, rf_pred))

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb.fit(X_train , y_train)

gb_pred = gb.predict(X_test)

In [None]:
accuracy.append(metrics.accuracy_score(y_test, gb_pred))
tests.append('Gradient Boosting')

print('Accuracy of training data', gb.score(X_train, y_train))
print('Accuracy of testing data ', metrics.accuracy_score(y_test, gb_pred), end='\n\n')
print(metrics.confusion_matrix(y_test, gb_pred), end='\n\n')
print(metrics.classification_report(y_test, gb_pred))

## Ada Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()
ada.fit(X_train , y_train)

ada_pred = ada.predict(X_test)

In [None]:
accuracy.append(metrics.accuracy_score(y_test, ada_pred))
tests.append('Ada Boost')

print('Accuracy of training data', ada.score(X_train, y_train))
print('Accuracy of testing data ', metrics.accuracy_score(y_test, ada_pred), end='\n\n')
print(metrics.confusion_matrix(y_test, ada_pred), end='\n\n')
print(metrics.classification_report(y_test, ada_pred))

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

knn_pred = knn.predict(X_test)

In [None]:
accuracy.append(metrics.accuracy_score(y_test, knn_pred))
tests.append('K-Nearest Neighbors')

print('Accuracy of training data', ada.score(X_train, y_train))
print('Accuracy of testing data ', metrics.accuracy_score(y_test, knn_pred), end='\n\n')
print(metrics.confusion_matrix(y_test, knn_pred), end='\n\n')
print(metrics.classification_report(y_test, knn_pred))

## SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

sdg = SGDClassifier()
sdg.fit(X_train, y_train)

sdg_pred = sdg.predict(X_test)

In [None]:
accuracy.append(metrics.accuracy_score(y_test, sdg_pred))
tests.append('SDG Classifier')

print('Accuracy of training data', sdg.score(X_train, y_train))
print('Accuracy of testing data ', metrics.accuracy_score(y_test, sdg_pred), end='\n\n')
print(metrics.confusion_matrix(y_test, sdg_pred), end='\n\n')
print(metrics.classification_report(y_test, sdg_pred))

# Results


In [None]:
# Bar chart of every model's test accuracy in percent; the figure widens with the number of models
plt.figure(figsize=(len(accuracy)*2,8))
sns.barplot(x=tests, y=[100*x for x in accuracy])
plt.title('Results', fontsize = 22)
plt.xlabel('Test', fontsize=14)
plt.ylabel('Accuracy (%)', fontsize=14)
# Write the rounded percentage just above each bar
for position, score in enumerate(accuracy):
  plt.text(position-.15, score*100+1, f"{round(score*100,2)}%")

Binary classification

Make Predictions using Random Forest(RF)




*   RF provides maximum accuracy result over all the classifiers
*   check the accuracy of the best model using validation set



In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
def testBestModel(model, X_validation, Y_validation):
    """Evaluate a fitted classifier on held-out data and print a report.

    Prints the accuracy (two decimals), the confusion matrix and the
    classification report for the model's predictions.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_validation
        Features of the held-out set.
    Y_validation
        True labels of the held-out set.

    Returns
    -------
    The predictions ``model`` made for ``X_validation``.
    """
    # Predict with the model that was passed in, not with a notebook-level
    # global, so the function evaluates whichever estimator the caller supplies
    Y_predictions = model.predict(X_validation)
    print("Accuracy = {:.2f}".format(accuracy_score(Y_validation, Y_predictions)))
    print('Confusion Matrix:')
    print(confusion_matrix(Y_validation, Y_predictions))
    print('Classification Report:')
    print(classification_report(Y_validation, Y_predictions))
    return Y_predictions

In [None]:
# Refit a 10-tree random forest on the training split and report how it does on the held-out split
rf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
y_predictions = testBestModel(rf, X_test, y_test)