IMPORTS

In [None]:
# Load the Pandas libraries with alias 'pd' 
import pandas as pd 
import numpy as np
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.feature_selection import VarianceThreshold
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import matplotlib.pyplot as plt

READING DATASET AND EXAMINING LABEL


In [None]:
# Read the whole CSV in one pass (low_memory=False disables pandas' chunked
# parsing) and keep the parsed result under `dataset`.
DATA_PATH = "c:/Darknet_all.csv"
dataset = pd.read_csv(DATA_PATH, low_memory=False)

# `df` is the working frame used by the rest of the notebook.
df = pd.DataFrame(dataset)
display(df)

# Class distribution of the target column, rarest class first.
label_counts = df['Label'].value_counts(ascending=True)
print(label_counts)



After examining the Label values, we found some duplicate classes, so we dropped the rows belonging to them.

In [None]:



# Label values the notebook treats as duplicate classes; their rows are discarded.
dup_values = ['Video-streaming', 'AUDIO-STREAMING', 'File-transfer']

# Boolean mask of the rows to discard, then keep everything else.
is_duplicate_class = df['Label'].isin(dup_values)
df = df[~is_duplicate_class]

# Remaining class distribution, rarest class first.
print(df['Label'].value_counts(ascending=True))




Missing data means values that are unavailable (NULL) or infinite. We remove every row that contains any missing value; this should not affect the model, as the dataset is large enough.

In [None]:


# Treat +/-infinity as missing so that a single dropna() removes both kinds of
# unusable rows. A new frame is built instead of mutating with inplace=True:
# `df` may itself be a filtered slice of the loaded data, and in-place edits on
# such a slice can trigger pandas' chained-assignment warnings.
df = (
    df.replace([np.inf, -np.inf], np.nan)  # infinity -> NaN
      .dropna()                            # drop rows containing any NaN
)
print("Dataset size after removal : ", df.shape)




Dropping unnecessary features such as the flow ID, and converting IP addresses to their binary value, stored as integers.

In [None]:
def ipv4_to_int(address):
    """Convert a dotted-quad IPv4 string to its integer value.

    Parameters
    ----------
    address : str or int
        Address such as "192.168.0.1". Integers are returned unchanged so that
        re-running this cell on already-converted columns is a no-op.

    Returns
    -------
    int
        (a << 24) + (b << 16) + (c << 8) + d for the address "a.b.c.d".

    Raises
    ------
    ValueError
        If one of the first four components is not a decimal integer.
    IndexError
        If the address has fewer than four dot-separated components.
    """
    if isinstance(address, (int, np.integer)):
        return int(address)
    parts = address.split('.')
    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])


# 'Flow ID' is dropped as an unnecessary feature. errors='ignore' keeps the
# cell re-runnable once the column is already gone.
df = df.drop(columns=['Flow ID'], errors='ignore')

# Assign each converted column back through the DataFrame rather than writing
# element by element into `.values`: that only works when pandas hands back a
# writable view of its internal buffer, which is not guaranteed (and is
# read-only under Copy-on-Write).
# int64 is requested explicitly because IPv4 values can exceed the signed
# 32-bit range.
for ip_column in ('Src IP', 'Dst IP'):
    df[ip_column] = df[ip_column].map(ipv4_to_int).astype('int64')

Removing zero-variance features.

In [None]:



# Every column except the last one is treated as a feature; the last column is
# excluded from the variance check.
df_nolabel = df.iloc[:, :-1].values

# Only the fitted per-feature variances are needed, so fit() suffices and no
# transformed copy of the feature matrix is built.
sel = VarianceThreshold(threshold=0)
sel.fit(df_nolabel)

# get_support() returns a boolean mask over the feature columns
# (True = feature is kept), so its negation selects the columns to remove.
arr = sel.get_support()
features_to_remove = df.columns[:-1][~arr].tolist()

print("Dataset size before removal ", df.shape, "\n")
print("0 variance features : ", features_to_remove, "\n")

# Build a new frame instead of dropping in place so `df` is rebound explicitly.
df = df.drop(columns=features_to_remove)
print("Dataset size after removal ", df.shape, "\n")

Feature scaling and splitting the dataset into training and testing subsets.

In [None]:
count = df.shape[1]
x = df.iloc[:, :-1].values        # feature matrix: every column but the last
y = df.iloc[:, count - 1].values  # target: the last column

# Split BEFORE fitting the scaler so the test rows never contribute to the
# mean/standard deviation used for standardization (avoids train/test leakage).
# random_state pins the shuffle so the split is the same on every run.
# NOTE(review): consider stratify=y if every class has at least two samples.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Keep `x` standardized too (with the train-fitted scaler) for any later cell
# that uses the full feature matrix.
x = scaler.transform(x)



Applying SMOTE-ENN resampling (SMOTE over-sampling followed by Edited Nearest Neighbours cleaning) to the training set to overcome class imbalance.

In [None]:

# Class counts of the training labels before resampling.
counter = Counter(y_train)
print('Before', counter)

# Only the training split is resampled; the test split is left as drawn.
# random_state makes the resampled training set the same on every run.
smtom = SMOTEENN(random_state=42)
X_train_smtom, y_train_smtom = smtom.fit_resample(X_train, y_train)

# Class counts after resampling.
counter = Counter(y_train_smtom)
print('After', counter)


In [None]:
# SVM
# Linear-kernel support vector classifier: trained on the resampled training
# set, then evaluated on the test split.
clf = svm.SVC(kernel='linear').fit(X_train_smtom, y_train_smtom)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
svm_report = classification_report(y_test, y_pred)
print(svm_report)