In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
import time

from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn import metrics

data = pd.read_csv('dataset_sdn.csv')

##### Here we see that the label contains boolean values: 0 - Benign, 1-Maliciuous 
data.label.unique()

data.label.value_counts()

#label_dict = dict(data.label.value_counts())
#sns.countplot(x=data.label)
# plt.show()

'''labels = ["Maliciuous",'Benign']
sizes = [dict(data.label.value_counts())[0], dict(data.label.value_counts())[1]]
plt.figure(figsize = (13,8))
plt.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.legend(["Maliciuous", "Benign"])
plt.title('The percentage of Benign and Maliciuos Requests in dataset')
plt.show()

#data.describe()

#data.isnull().sum()

#### Let's support which columns NUMERIC and which is OBJECT

numeric_df = data.select_dtypes(include=['int64', 'float64'])
object_df = data.select_dtypes(include=['object'])
numeric_cols = numeric_df.columns
object_cols = object_df.columns
print('Numeric Columns: ')
print(numeric_cols, '\n')
print('Object Columns: ')
print(object_cols, '\n')
print('Number of Numeric Features: ', len(numeric_cols))
print('Number of Object Features: ', len(object_cols))'''

# Assuming 'data' is the original dataframe
# Separate the data based on unique label values
df_label_0 = data[data['label'] == 0]
df_label_1 = data[data['label'] == 1]

# Resample to get equal number of rows for each label value
df_label_0_resampled = resample(df_label_0,
                                replace=False, # sample without replacement
                                n_samples=len(df_label_1), # sample same number of rows as label 1
                                random_state=42) # set random state for reproducibility

# Combine the resampled dataframes
balanced_data = pd.concat([df_label_0_resampled, df_label_1])

# Shuffle the rows to avoid any ordering bias
data = balanced_data.sample(frac=1, random_state=42)


data.label.value_counts()

df1 = data.copy()

df1 = df1.dropna()

df1.columns

#df1.info()

important_features = [
    'src',
    'pktcount',
    'dst',
    'byteperflow',
    'pktperflow',
    'pktrate',
    'tot_kbps',
    'rx_kbps',
    'flows',
    'bytecount',
    'dt',
    'Protocol',
    'dur',
    'tot_dur'
                      
                     ]


weights = [
    17.87,
    15.16,
    13.64,
    12.97,
    11.35,
    11.35,
    9.68,
    9.66,
    8.95,
    4.92,
    2.33,
    1.31,
    1.11,
    1.11
]

weighted_features = pd.DataFrame({'features':important_features,
                                 'weights':weights})
weighted_features

### But we dont need src, dst, dt, So, we will drop them
X = df1[important_features]
y = df1.label

X = X.drop(['src', 'dst', 'dt'], axis=1)

X.head()

abs(X.corr(numeric_only=True))

X = X.drop(['dur', "pktrate", "pktperflow"], axis=1)

X.columns

X = pd.get_dummies(X)

class Model:
    def __init__(self, data, y):
        self.data = data
        self.y = y
        X = preprocessing.StandardScaler().fit(self.data).transform(self.data)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, self.y, random_state=42, test_size=0.3)  
        

    def LogisticRegression(self):
        solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

        start_time = time.time()
        results_lr = []
        accuracy_list = []
        for solver in solvers:
            LR = LogisticRegression(C=0.03, solver=solver).fit(self.X_train, self.y_train)
            predicted_lr = LR.predict(self.X_test)
            accuracy_lr = accuracy_score(self.y_test, predicted_lr)
            #print("Accuracy: %.2f%%" % (accuracy_lr * 100.0))
            #print('################################################################')
            results_lr.append({'solver' : solver, 'accuracy': str(round(accuracy_lr * 100, 2)) + "%", 
                                  'Coefficients': {'W' : LR.coef_, 'b': LR.intercept_}})
            
            accuracy_list.append(accuracy_lr)
       
        solver_name = solvers[accuracy_list.index(max(accuracy_list))]
        LR = LogisticRegression(C=0.03, solver=solver_name).fit(self.X_train,self.y_train)
        predicted_lr = LR.predict(self.X_test)
        accuracy_lr = accuracy_score(self.y_test, predicted_lr)
        print("Accuracy: %.2f%%" % (accuracy_lr * 100.0), '\n')
        print("------------------------------------------------------------------------")
        print('Best solver is : ', solver_name)
        print("------------------------------------------------------------------------")
        print(classification_report(predicted_lr, self.y_test), '\n')
        print("------------------------------------------------------------------------")
        print("--- %s seconds --- time for LogisticRegression" % (time.time() - start_time))
    
        return LR


M = Model(X,y)
M.LogisticRegression()

Accuracy: 73.89% 

------------------------------------------------------------------------
Best solver is :  liblinear
------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.71      0.75      0.73     11587
           1       0.76      0.73      0.74     12756

    accuracy                           0.74     24343
   macro avg       0.74      0.74      0.74     24343
weighted avg       0.74      0.74      0.74     24343
 

------------------------------------------------------------------------
--- 0.9074513912200928 seconds --- time for LogisticRegression
