# LIBRARIES 

In [None]:
# Print a fixed banner string (presumably the notebook author's name).
print("Sayed Muqayyad Hussain")

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below, so removed APIs only surface as hard errors -- consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
import numpy as npy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as mat_plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# 

# IMPORTING DATASET

In [None]:
# Read the dataset from the Kaggle input directory into a DataFrame.
dataSet = pd.read_csv('/kaggle/input/nslkdd-with-labels/Original Test (with labels).csv') 
# The data is visualized per 'Difficulty Level' value further down.
# NOTE(review): this comment previously stated that the results are based on the
# original CICIDS2017 dataset and pointed to cell [21] for the sampled dataset.
# The path above looks like an NSL-KDD file instead -- confirm which dataset this
# notebook is meant to run on.

In [None]:
dataSet

In [None]:
print(dataSet.columns)

In [None]:
# Only when the frame has a 'Difficulty Level' column: count how many rows carry
# each distinct value of it and print the resulting table.
if 'Difficulty Level' in dataSet.columns:
    attack_counts = dataSet['Difficulty Level'].value_counts()
    print(attack_counts)




# 

# VISUALIZATION OF DATA 

In [None]:
dataSet['Difficulty Level'].unique()

In [None]:
# Count the rows for each distinct 'Difficulty Level' value. The wedge labels are
# taken from the same value_counts() result as the wedge sizes, so every wedge is
# guaranteed to carry the label it belongs to (a hand-written label list can
# silently drift out of order or length).
level_counts = dataSet['Difficulty Level'].value_counts()
values = list(level_counts)
difficulty_levels = list(level_counts.index)

# Draw on a 9x6 inch figure; pyplot is available in this notebook as `mat_plt`.
mat_plt.figure(figsize=(9, 6))

# Pie chart with a two-decimal percentage printed on every wedge.
mat_plt.pie(values, labels=difficulty_levels, autopct='%.2f%%', shadow=True)
mat_plt.title('Distribution of Different Types of Attacks')
mat_plt.show()

# 

# DATA PREPROCESSING

In [None]:
# Z-score standardisation: rescale every non-object column to zero mean and unit
# standard deviation so that the features share a comparable scale.
features = dataSet.dtypes[dataSet.dtypes != 'object'].index


def _standardise(column):
    """Return *column* centred on its mean and divided by its standard deviation."""
    return (column - column.mean()) / column.std()


dataSet[features] = dataSet[features].apply(_standardise)
# Whatever is still missing after scaling is replaced by the sentinel value -1.
dataSet = dataSet.fillna(-1)

In [None]:
# LabelEncoder maps each distinct category to an integer code; codes are assigned
# in the sorted order of the category values.
labelencoder = LabelEncoder()
# Encode the last column of the frame (assumed to hold the class label -- TODO
# confirm). The column is replaced by name instead of through `.iloc[:, -1] = ...`
# because recent pandas versions write `.iloc` column assignments in place, which
# would leave the integer codes stored in an object-dtype column.
label_column = dataSet.columns[-1]
dataSet[label_column] = labelencoder.fit_transform(dataSet[label_column])

In [None]:
#The count of attacks are found and assigned to respective attacks in numeric form
dataSet.Label.value_counts()

In [None]:
# Rows whose encoded label is 6, 1 or 4 (the minority classes) are kept in full;
# every other row goes into the majority set that gets down-sampled later.
minority_mask = dataSet['Label'].isin([6, 1, 4])
df_minor = dataSet[minority_mask]
df_major = dataSet.drop(df_minor.index)

In [None]:
# Feature frame of the majority set: everything except the 'Label' column.
X = df_major.drop(columns=['Label'])
# Labels are read from the last column, first as a column vector ...
y = df_major.iloc[:, -1].values.reshape(-1, 1)
print(y)
# ... and then flattened to one dimension.
y = y.ravel()
print(y)

In [None]:
# Cluster the majority-class samples with mini-batch k-means; a proportion of the
# rows of every cluster is drawn afterwards.
from sklearn.cluster import MiniBatchKMeans
k_means = MiniBatchKMeans(n_clusters=1000, random_state=0)
k_means.fit(X)

In [None]:
# Cluster id assigned by k-means to each fitted row, stored as a new 'klabel' column
# of the majority set.
klabel=k_means.labels_
df_major['klabel']=klabel

In [None]:
df_major['klabel'].value_counts()

In [None]:
# Move 'Label' to position 78 of the column order. When the frame has fewer
# columns than that, list.insert simply places it at the end.
cols = list(df_major)
cols.remove('Label')
cols.insert(78, 'Label')
df_major = df_major[cols]

In [None]:
df_major

In [None]:
def typicalSampling(group, frac=0.008, random_state=0):
    """Return a reproducible random subset of one cluster's rows.

    Args:
        group: DataFrame holding the rows of a single cluster, as passed in by
            ``DataFrame.groupby(...).apply``.
        frac: Fraction of the group's rows to keep; defaults to 0.008.
        random_state: Seed forwarded to ``DataFrame.sample`` so that repeated
            runs select the same rows.

    Returns:
        The sampled rows of ``group``.
    """
    return group.sample(frac=frac, random_state=random_state)

# Group the majority rows by their k-means cluster id and apply typicalSampling to
# every group; group_keys=False keeps the cluster id out of the result's index.
clusters = df_major.groupby('klabel', group_keys=False)
result = clusters.apply(typicalSampling)

In [None]:
result['Label'].value_counts()

In [None]:
result

In [None]:
# The cluster id was only needed for sampling, so drop it again.
result = result.drop(['klabel'], axis=1)
# Add all minority-class rows back to the sampled majority rows.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x; the
# original row index is kept, exactly as append did by default.
result = pd.concat([result, df_minor])

In [None]:
# Write the sampled data to CSV; index=0 is falsy, so the row index is not written.
result.to_csv('CICIDS2017_sample_km.csv',index=0)

In [None]:
# Read the sampled dataset back from the CSV file written by this notebook.
df=pd.read_csv('CICIDS2017_sample_km.csv')

In [None]:
# Feature matrix: every column except 'Label', as a NumPy array.
X = df.drop(columns=['Label']).values
# Label vector: the last column, flattened to one dimension.
y = df.iloc[:, -1].values.ravel()

# 

# SPLIT DATA FOR TRAIN AND TEST

In [None]:
# 80/20 train/test split, stratified on the labels and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

# 

# FEATURE ENGINEERING

In [None]:
from sklearn.feature_selection import mutual_info_classif
# Mutual information between every training feature and the class label.
# The estimator is randomised, so it is seeded with the same random_state=0 used by
# the other stochastic steps of this notebook to make the feature ranking
# repeatable between runs.
importances = mutual_info_classif(X_train, y_train, random_state=0)

In [None]:
# Calculate the sum of the importance scores.
# The feature names are taken from the same columns, in the same order, that the
# feature matrix X was built from (df without 'Label'), so each score is paired
# with the column it was computed for.
feature_names = df.drop(['Label'], axis=1).columns
f_list = sorted(
    zip((round(score, 4) for score in importances), feature_names),
    reverse=True,
)
# Total of the rounded scores; used afterwards to normalise the importances.
Sum = sum(score for score, _ in f_list)
# Feature names ordered from most to least important.
featureScaling = [name for _, name in f_list]

In [None]:
# Select the important features from top to bottom until the accumulated
# (normalised) importance reaches 90%.
# The feature names are taken from the same columns, in the same order, that the
# feature matrix X was built from (df without 'Label'), so each normalised score is
# paired with the column it was computed for.
feature_names = df.drop(['Label'], axis=1).columns
f_list2 = sorted(
    zip((round(share, 4) for share in importances / Sum), feature_names),
    reverse=True,
)
Sum2 = 0
featureScaling = []
for share, name in f_list2:
    Sum2 = Sum2 + share
    featureScaling.append(name)
    # Stop as soon as the selected features cover 90% of the total importance.
    if Sum2 >= 0.9:
        break

In [None]:
# Feature matrix restricted to the selected columns, as a NumPy array.
X_fs = df[featureScaling].values

In [None]:
X_fs.shape

In [None]:
# FCBF_module is a project-local module that is not visible here.
# NOTE(review): FCBFK is constructed with k = 20 -- presumably the number of
# features to keep; verify against FCBF_module.
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf = FCBFK(k = 20)
#fcbf.fit(X_fs, y)

In [None]:
# Fit the FCBFK selector on the pre-selected features and labels and keep its
# transformed output.
X_fss = fcbf.fit_transform(X_fs,y)

In [None]:
X_fss.shape

In [None]:
# Redo the 80/20 stratified split, this time on the FCBF-selected features; this
# replaces the X_train/X_test/y_train/y_test produced by the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X_fss,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
X_train.shape

In [None]:
pd.Series(y_train).value_counts()

In [None]:
from imblearn.over_sampling import SMOTE
# Over-sample classes 2 and 4 until each has 1000 training samples.
# `n_jobs` is no longer passed: it is deprecated on SMOTE in recent imbalanced-learn
# releases, and it only affected speed, not the generated samples.
# random_state=0 makes the synthetic samples repeatable, in line with the other
# seeded steps of this notebook.
smote = SMOTE(sampling_strategy={2: 1000, 4: 1000}, random_state=0)

In [None]:
# Apply the over-sampler to the training split only; the test split is left untouched.
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
pd.Series(y_train).value_counts()