In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# Each URL is decoded, downloaded, and unpacked into <directory> under the
# Kaggle input path.
# NOTE(review): the URL carries signed-request parameters (X-Goog-Date,
# X-Goog-Expires), so it presumably stops working once that window has
# passed — confirm and regenerate the link if the download fails.
DATA_SOURCE_MAPPING = 'cic-ddos2019-30gb-full-dataset-csv-files:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3952124%2F6878499%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240527%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240527T065747Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D61f35ab0e564f47f49921b32207f3e3dbe32f0755b804a95cd9a2e1afbf9128126b473f028a92f9286d7d7b829310ddae4b84f2d4ee9f970743217a65899a0f42604355db473a377afb3f9607c6e4270f91ec1076c1fa6311b878ac2e06f652c94860553ba3bf861b06a7573d8b5eefed71487e953b4e6d665ded29ed120c82e27929bc7124a60a3838ea5688e772e7d4779d70bb6fccb5c6fcc149c8c3eacbfec4c8019f633facb4f36a2b93480ddd2e9b3a1386656e44f309fdef85ac48ab3afc0c9a5dbe9b27320d4b0ba5d595adcf847c723ed89ba05891856b31bc730e813dd13a57b33de03f342e64ef8fee4b7c59830def7834590b40da4578795a008'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into a temporary file, show a simple
# progress bar while streaming, and unpack the archive under the input path.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>". Split on the first
    # colon only so an unencoded ':' in the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be missing (e.g. chunked responses) or zero; in
            # that case the byte counter is still shown but the bar stays empty
            # instead of crashing on int(None) or a division by zero.
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise risk reading a truncated archive.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the opened archive so the imported
                # `tarfile` module is not rebound.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTP-level failure; for signed URLs this usually means the link expired.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        # Any other network or filesystem failure; move on to the next source.
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the input directory, then print
# them one per line in the order os.walk reports them.
input_file_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [None]:
def _read_finite_rows(csv_path):
    """Load a CSV and keep only the rows without missing or infinite values.

    +inf / -inf are first mapped to NaN so that a single dropna() removes
    both kinds of unusable rows.
    """
    return pd.read_csv(csv_path).replace([np.inf, -np.inf], np.nan).dropna()


# Training data comes from the 01-12 folder, test data from the 03-11 folder.
syn_train_path = "/kaggle/input/cic-ddos2019-30gb-full-dataset-csv-files/01-12/DrDoS_UDP.csv"
syn_test_path = "/kaggle/input/cic-ddos2019-30gb-full-dataset-csv-files/03-11/UDP.csv"

syn_train = _read_finite_rows(syn_train_path)
syn_test = _read_finite_rows(syn_test_path)




In [None]:
# Show the distinct class labels found in each split (test first, then train).
for labelled_frame in (syn_test, syn_train):
    print(set(labelled_frame[' Label']))

In [None]:
# Name of the class-label column, and the columns that are left out of the
# feature matrix (the label itself plus address / identifier / timestamp fields).
# NOTE(review): labels are addressed by name here; this assumes ' Label' is the
# column that sits at positional index 87 in both CSVs — confirm against the header.
LABEL_COLUMN = ' Label'
NON_FEATURE_COLUMNS = [LABEL_COLUMN, ' Destination IP', ' Source IP', 'Flow ID', ' Timestamp', 'SimillarHTTP']


def encode_attack_label(frame, attack_label, label_column=LABEL_COLUMN):
    """Return the label column of ``frame`` as a 0/1 integer Series.

    Parameters:
        frame: DataFrame containing ``label_column``.
        attack_label: label value that is mapped to 1; every other value
            is mapped to 0.
        label_column: name of the column holding the class label.

    Returns:
        An int64 Series aligned with ``frame``. A column that already holds
        only 0/1 values is returned unchanged (as integers), so running the
        cell a second time does not turn every label into 0.
    """
    labels = frame[label_column]
    if labels.isin([0, 1]).all():
        return labels.astype('int64')
    # Vectorized comparison instead of a Python loop over every row.
    return labels.eq(attack_label).astype('int64')


# Training split: "DrDoS_UDP" rows are the positive class.
syn_train[LABEL_COLUMN] = encode_attack_label(syn_train, "DrDoS_UDP")
count_1 = int(syn_train[LABEL_COLUMN].sum())
count_0 = len(syn_train) - count_1
print(count_0)
print(count_1)

# Test split: "UDP" rows are the positive class.
syn_test[LABEL_COLUMN] = encode_attack_label(syn_test, "UDP")
count_1 = int(syn_test[LABEL_COLUMN].sum())
count_0 = len(syn_test) - count_1
print(count_0)
print(count_1)

# Feature matrices (everything except the non-feature columns) and targets.
syn_train_x = syn_train.drop(columns=NON_FEATURE_COLUMNS)
syn_train_y = syn_train[LABEL_COLUMN]

syn_test_x = syn_test.drop(columns=NON_FEATURE_COLUMNS)
syn_test_y = syn_test[LABEL_COLUMN]


In [None]:
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

# Fit a seeded random-forest regressor on the training split; out-of-bag
# scoring is switched on so a validation estimate is available after fitting.
forest_options = dict(n_estimators=100, random_state=42, oob_score=True)
regressor = RandomForestRegressor(**forest_options)
regressor.fit(syn_train_x, syn_train_y)

# Out-of-bag estimate computed during fitting.
oob_score = regressor.oob_score_
print(f'Out-of-Bag Score: {oob_score}')

# Score the test split and dump the raw importance vector.
predictions = regressor.predict(syn_test_x)
print(regressor.feature_importances_)

# Horizontal bar chart of the importances, smallest first; the tall figure
# leaves room for one bar per feature.
global_importances = pd.Series(
    regressor.feature_importances_, index=syn_train_x.columns
).sort_values(ascending=True)
importance_ax = global_importances.plot.barh(color='green', figsize=(8, 30))
importance_ax.set_xlabel("Importance")
importance_ax.set_ylabel("Feature")
importance_ax.set_title("Global Feature Importance - Built-in Method")

# Regression-style error metrics of the predictions against the test labels.
mse = mean_squared_error(syn_test_y, predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(syn_test_y, predictions)
print(f'R-squared: {r2}')


In [None]:
# Make sure both target vectors have an integer dtype before they are handed
# to the classifiers in the following cells.
syn_train_y = syn_train_y.astype('int')
syn_test_y = syn_test_y.astype('int')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# Entropy-based random forest. The seed is fixed so the reported metrics are
# reproducible across runs (same seed as the RandomForestRegressor cell).
classifier = RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=42)
classifier.fit(syn_train_x, syn_train_y)

# Predict the 0/1 label for every row of the test split.
predictions = classifier.predict(syn_test_x)

# Binary classification metrics on the test split.
acc = accuracy_score(syn_test_y, predictions)
print(f'Accuracy: {acc}')

precision = precision_score(syn_test_y, predictions)
print(f'Precision: {precision}')

recall = recall_score(syn_test_y, predictions)
print(f'Recall: {recall}')

f1 = f1_score(syn_test_y, predictions)
print(f'F1 Score: {f1}')


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Seeded logistic regression; the very high iteration cap gives the solver
# room to converge.
logreg_options = dict(random_state=0, max_iter=1000000)
clf = LogisticRegression(**logreg_options)
clf.fit(syn_train_x, syn_train_y)

# Predicted labels for the test split.
predictions = clf.predict(syn_test_x)

# Each metric is computed and reported in turn against the true test labels.
acc = accuracy_score(y_true=syn_test_y, y_pred=predictions)
print(f'Accuracy: {acc}')

precision = precision_score(y_true=syn_test_y, y_pred=predictions)
print(f'Precision: {precision}')

recall = recall_score(y_true=syn_test_y, y_pred=predictions)
print(f'Recall: {recall}')

f1 = f1_score(y_true=syn_test_y, y_pred=predictions)
print(f'F1 Score: {f1}')

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, trained on the training split.
gnb = GaussianNB()
gnb.fit(X=syn_train_x, y=syn_train_y)

# Predicted labels for the test split.
predictions = gnb.predict(X=syn_test_x)

# Compare the true test labels (syn_test_y) with the predicted labels.
acc = accuracy_score(y_true=syn_test_y, y_pred=predictions)
print(f'Accuracy: {acc}')

precision = precision_score(y_true=syn_test_y, y_pred=predictions)
print(f'Precision: {precision}')

recall = recall_score(y_true=syn_test_y, y_pred=predictions)
print(f'Recall: {recall}')

f1 = f1_score(y_true=syn_test_y, y_pred=predictions)
print(f'F1 Score: {f1}')