In [13]:
#Random Forest Stratified
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Directory with the original CIC CSV parts, resolved against the current working directory.
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir() returns entries in arbitrary order; sort the names so the files are
# read (and their sampled rows concatenated) in the same order on every run, which
# keeps the seeded train/test split below reproducible across machines.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
# CSV with the sampled dataset that is compared against the originals in this cell.
sampled_file = 'stratified_sample.csv'
# Number of rows pandas reads per chunk when streaming the original files.
chunksize = 500000

# Helper function for feature engineering
def feature_engineering(chunk):
    """Add the 'Packets_Duration_Ratio' column to ``chunk`` in place.

    The column is ``'Tot sum' / ('Duration' + 1e-6)`` and is only added when
    both input columns are present; otherwise the frame is left untouched.

    Args:
        chunk: DataFrame to extend (modified in place).

    Returns:
        The same DataFrame object that was passed in.
    """
    # Guard clause: nothing to compute unless both source columns exist.
    if 'Tot sum' not in chunk.columns or 'Duration' not in chunk.columns:
        return chunk

    # The 1e-6 offset keeps the denominator non-zero when Duration is 0.
    denominator = chunk['Duration'] + 1e-6
    chunk['Packets_Duration_Ratio'] = chunk['Tot sum'] / denominator
    return chunk

# Sampled side: load the sampled CSV, add the engineered ratio column,
# and tag every row with is_sampled = 1.
sampled_df = feature_engineering(pd.read_csv(sampled_file))
sampled_df['is_sampled'] = 1

# Original side: stream each CSV in chunks, add the same engineered column,
# tag rows with is_sampled = 0, and keep a 5% sample of every chunk.
original_chunks = []
for original_file in original_files:
    print(f"Processing file: {original_file}")
    csv_path = os.path.join(folder_path, original_file)
    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        chunk = feature_engineering(chunk)
        chunk['is_sampled'] = 0
        # NOTE(review): the seed is reset to 42 for every chunk — confirm that is intended.
        kept_rows = chunk.sample(frac=0.05, random_state=42)  # match the sampling percentage
        original_chunks.append(kept_rows)

print("Concatenating the original dataset")
original_df = pd.concat(original_chunks, ignore_index=True)

# Stack both sides into one frame, then discard every row that has a missing value.
print("Combining the datasets")
combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
combined_df = combined_df.dropna()

# Target is the origin flag; features are everything except the flag and
# the 'label' column (ignored if 'label' is absent).
print("Defining features")
y = combined_df['is_sampled']
X = combined_df.drop(columns=['is_sampled', 'label'], errors='ignore')

# 70/30 split that preserves the original/sampled proportions in both parts.
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Depth-capped, class-balanced forest. An unrestricted 100-tree forest
# (n_estimators=100, random_state=42 only) was used before and was replaced
# to reduce run time.
print("performing Random Forest")
forest_params = dict(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

# Score the held-out rows: overall accuracy plus a per-class report.
print("performing calculations")
predictions = clf.predict(X_test)
print("Calculating accuracy")
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")
print("Classification Report:")
report = classification_report(y_test, predictions, target_names=['Original', 'Sampled'])
print(report)


Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00013-363d1ba3-8

In [15]:
#Random Forest SMOTE
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Directory with the original CIC CSV parts, resolved against the current working directory.
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir() returns entries in arbitrary order; sort the names so the files are
# read (and their sampled rows concatenated) in the same order on every run, which
# keeps the seeded train/test split below reproducible across machines.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
# CSV with the sampled dataset that is compared against the originals in this cell.
sampled_file = 'smote_data.csv'
# Number of rows pandas reads per chunk when streaming the original files.
chunksize = 500000

# Helper function for feature engineering
def feature_engineering(chunk):
    """Derive 'Packets_Duration_Ratio' on ``chunk`` when its inputs exist.

    When the frame has both a 'Tot sum' and a 'Duration' column, a new column
    equal to ``'Tot sum' / ('Duration' + 1e-6)`` is written onto the frame
    itself. Frames missing either column pass through unchanged.

    Args:
        chunk: DataFrame to extend (modified in place).

    Returns:
        The DataFrame that was passed in (same object).
    """
    needed = ('Tot sum', 'Duration')
    if all(name in chunk.columns for name in needed):
        total = chunk['Tot sum']
        # Offset of 1e-6 so a Duration of 0 does not produce a zero denominator.
        duration = chunk['Duration'] + 1e-6
        chunk['Packets_Duration_Ratio'] = total / duration
    return chunk

# Sampled side: load the sampled CSV, add the engineered ratio column,
# and tag every row with is_sampled = 1.
print("Reading sampled file")
sampled_df = feature_engineering(pd.read_csv(sampled_file))
sampled_df['is_sampled'] = 1

# Original side: stream each CSV in chunks, add the same engineered column,
# tag rows with is_sampled = 0, and keep a 5% sample of every chunk.
original_chunks = []
for original_file in original_files:
    print(f"Processing file: {original_file}")
    source_path = os.path.join(folder_path, original_file)
    for chunk in pd.read_csv(source_path, chunksize=chunksize):
        chunk = feature_engineering(chunk)
        chunk['is_sampled'] = 0
        # NOTE(review): the seed is reset to 42 for every chunk — confirm that is intended.
        chunk_sample = chunk.sample(frac=0.05, random_state=42)  # match the sampling percentage
        original_chunks.append(chunk_sample)

print("Concatenating the original dataset")
original_df = pd.concat(original_chunks, ignore_index=True)

# Stack both sides into one frame, then discard every row that has a missing value.
print("Combining the datasets")
stacked_frames = [original_df, sampled_df]
combined_df = pd.concat(stacked_frames, ignore_index=True).dropna()

# Target is the origin flag; features are everything except the flag and
# the 'label' column (ignored if 'label' is absent).
print("Defining features")
non_feature_columns = ['is_sampled', 'label']
y = combined_df['is_sampled']
X = combined_df.drop(columns=non_feature_columns, errors='ignore')

# 70/30 split that preserves the original/sampled proportions in both parts.
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

# Depth-capped (max_depth=10), class-balanced forest of 100 trees with a fixed seed.
print("performing Random Forest")
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
clf.fit(X_train, y_train)

# Score the held-out rows: overall accuracy plus a per-class report.
print("performing calculations")
predictions = clf.predict(X_test)
print("Calculating accuracy")
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")
print("Classification Report:")
class_names = ['Original', 'Sampled']
print(classification_report(y_test, predictions, target_names=class_names))


Reading sampled file
Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: 

In [7]:
#Random Forest Data Diffusion
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Directory with the original CIC CSV parts, resolved against the current working directory.
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir() returns entries in arbitrary order; sort the names so the files are
# read (and their sampled rows concatenated) in the same order on every run, which
# keeps the seeded train/test split below reproducible across machines.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
# CSV with the sampled dataset that is compared against the originals in this cell.
sampled_file = 'data_diffusion_data.csv'
# Number of rows pandas reads per chunk when streaming the original files.
chunksize = 500000

# Helper function for feature engineering
def feature_engineering(chunk):
    """Attach the engineered 'Packets_Duration_Ratio' column, if possible.

    The ratio is ``'Tot sum' / ('Duration' + 1e-6)``. It is written directly
    onto ``chunk`` and only when both source columns are present.

    Args:
        chunk: DataFrame to extend (modified in place).

    Returns:
        ``chunk`` itself, with or without the extra column.
    """
    has_total = 'Tot sum' in chunk.columns
    has_inputs = has_total and 'Duration' in chunk.columns
    if has_inputs:
        # Adding 1e-6 avoids dividing by exactly zero for rows whose Duration is 0.
        safe_duration = chunk['Duration'] + 1e-6
        chunk['Packets_Duration_Ratio'] = chunk['Tot sum'] / safe_duration
    return chunk

# Sampled side: load the sampled CSV, add the engineered ratio column,
# and tag every row with is_sampled = 1.
print("Reading sampled file")
sampled_df = pd.read_csv(sampled_file)
sampled_df = feature_engineering(sampled_df)
sampled_df['is_sampled'] = 1

# Original side: stream each CSV in chunks, add the same engineered column,
# tag rows with is_sampled = 0, and keep a 5% sample of every chunk.
original_chunks = []
for original_file in original_files:
    print(f"Processing file: {original_file}")
    for chunk in pd.read_csv(os.path.join(folder_path, original_file), chunksize=chunksize):
        chunk = feature_engineering(chunk)
        chunk['is_sampled'] = 0
        # NOTE(review): the seed is reset to 42 for every chunk — confirm that is intended.
        original_chunks.append(chunk.sample(frac=0.05, random_state=42))  # match the sampling percentage

print("Concatenating the original dataset")
original_df = pd.concat(original_chunks, ignore_index=True)

# Stack both sides into one frame, then discard every row that has a missing value.
print("Combining the datasets")
combined_df = pd.concat([original_df, sampled_df], ignore_index=True).dropna()

# Target is the origin flag; features are everything except the flag and
# the 'label' column (ignored if 'label' is absent).
print("Defining features")
X = combined_df.drop(['is_sampled', 'label'], axis=1, errors='ignore')
y = combined_df['is_sampled']

# 70/30 split that preserves the original/sampled proportions in both parts.
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Use the same forest settings as the Stratified, SMOTE and Undersampled runs in
# this notebook (depth cap, balanced class weights, parallel fit). This cell
# previously trained an unrestricted forest, so its accuracy was not comparable
# with the other three.
print("performing Random Forest")
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced', n_jobs=-1)
clf.fit(X_train, y_train)

# Score the held-out rows: overall accuracy plus a per-class report.
print("performing calculations")
predictions = clf.predict(X_test)
print("Calculating accuracy")
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, predictions, target_names=['Original', 'Sampled']))


Reading sampled file
Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: 

In [9]:
#Random Forest Undersampled
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Directory with the original CIC CSV parts, resolved against the current working directory.
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir() returns entries in arbitrary order; sort the names so the files are
# read (and their sampled rows concatenated) in the same order on every run, which
# keeps the seeded train/test split below reproducible across machines.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
# CSV with the sampled dataset that is compared against the originals in this cell.
sampled_file = 'undersampled_data.csv'
# Number of rows pandas reads per chunk when streaming the original files.
chunksize = 500000

# Helper function for feature engineering
def feature_engineering(chunk):
    """Write 'Packets_Duration_Ratio' onto ``chunk`` when it can be computed.

    Requires both the 'Tot sum' and 'Duration' columns; the new column is
    ``'Tot sum' / ('Duration' + 1e-6)``. Without both columns the frame is
    returned exactly as received.

    Args:
        chunk: DataFrame to extend (modified in place).

    Returns:
        The same DataFrame object that was passed in.
    """
    if 'Tot sum' in chunk.columns:
        if 'Duration' in chunk.columns:
            # 1e-6 is added so a Duration of 0 never yields a zero divisor.
            divisor = chunk['Duration'] + 1e-6
            chunk['Packets_Duration_Ratio'] = chunk['Tot sum'].div(divisor)
    return chunk

# Sampled side: load the sampled CSV, add the engineered ratio column,
# and tag every row with is_sampled = 1.
print("Reading sampled file")
sampled_df = feature_engineering(pd.read_csv(sampled_file))
sampled_df['is_sampled'] = 1

# Original side: stream each CSV in chunks, add the same engineered column,
# tag rows with is_sampled = 0, and keep a 5% sample of every chunk.
original_chunks = []
for original_file in original_files:
    print(f"Processing file: {original_file}")
    file_path = os.path.join(folder_path, original_file)
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        chunk = feature_engineering(chunk)
        chunk['is_sampled'] = 0
        # NOTE(review): the seed is reset to 42 for every chunk — confirm that is intended.
        sampled_rows = chunk.sample(frac=0.05, random_state=42)  # match the sampling percentage
        original_chunks.append(sampled_rows)

print("Concatenating the original dataset")
original_df = pd.concat(original_chunks, ignore_index=True)

# Stack both sides into one frame, then discard every row that has a missing value.
print("Combining the datasets")
combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
combined_df = combined_df.dropna()

# Target is the origin flag; features are everything except the flag and
# the 'label' column (ignored if 'label' is absent).
print("Defining features")
y = combined_df['is_sampled']
X = combined_df.drop(['is_sampled', 'label'], axis='columns', errors='ignore')

# 70/30 split that preserves the original/sampled proportions in both parts.
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Depth-capped (max_depth=10), class-balanced forest of 100 trees with a fixed seed.
print("performing Random Forest")
rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'class_weight': 'balanced',
    'n_jobs': -1,
}
clf = RandomForestClassifier(**rf_settings)
clf.fit(X_train, y_train)

# Score the held-out rows: overall accuracy plus a per-class report.
print("performing calculations")
predictions = clf.predict(X_test)
print("Calculating accuracy")
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")
print("Classification Report:")
report_text = classification_report(y_test, predictions, target_names=['Original', 'Sampled'])
print(report_text)


Reading sampled file
Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: 

In [11]:
from IPython.display import display, Javascript

# JavaScript sent to the notebook front end; it removes the command-mode 'D,D' shortcut.
# NOTE(review): relies on a global `Jupyter` object existing in the front end — confirm
# for the notebook UI in use.
remove_dd_shortcut_js = """
Jupyter.keyboard_manager.command_shortcuts.remove_shortcut('D,D');
"""
display(Javascript(remove_dd_shortcut_js))
# Printed unconditionally; it does not verify that the script above succeeded.
print("disabled keyboard dd delete")

<IPython.core.display.Javascript object>

disabled keyboard dd delete
