In [1]:
import pandas as pd
import ipaddress
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [5]:
# Colab-only helper module for moving files between the browser and the runtime.
from google.colab import files

# Ask for file(s) to upload; the recorded output of this cell shows
# tvae_synthetic_train.csv being saved under the same name.
# NOTE(review): the other CSVs read further down are presumably provided the
# same way (or already present in the working directory) - confirm.
uploaded = files.upload()

Saving tvae_synthetic_train.csv to tvae_synthetic_train.csv


In [6]:
# Function to convert IP addresses to integers
def convert_ip_to_int(df, ip_column):
    """Return a copy of ``df`` with ``ip_column`` converted to integers.

    Each value of the column is parsed with ``ipaddress.IPv4Address`` and
    replaced by its integer form (e.g. ``'10.0.0.1'`` -> ``167772161``).

    The input frame is left untouched: the conversion is done on a copy so
    that the caller's original data (and anything already displayed from it)
    does not silently change.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the address column.
    ip_column : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``df`` except for the converted column.

    Raises
    ------
    KeyError
        If ``ip_column`` is not a column of ``df``.
    ipaddress.AddressValueError
        If a value cannot be parsed as an IPv4 address.
    """
    converted = df.copy()
    converted[ip_column] = converted[ip_column].apply(
        lambda address: int(ipaddress.IPv4Address(address))
    )
    return converted

In [7]:
# Read the real training set, the three synthetic training sets
# (CTGAN, Gaussian Copula, TVAE) and the held-out test set from CSV
real_train, ctgan_synthetic_train, gc_synthetic_train, tvae_synthetic_train, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'real_train.csv',
        'ctgan_synthetic_train.csv',
        'gaussian_copula_synthetic_train.csv',
        'tvae_synthetic_train.csv',
        'test.csv',
    )
)

# Turn the source addresses, then the destination addresses, of every
# dataset into integers
for ip_column in ('IPV4_SRC_ADDR', 'IPV4_DST_ADDR'):
    real_train = convert_ip_to_int(real_train, ip_column)
    ctgan_synthetic_train = convert_ip_to_int(ctgan_synthetic_train, ip_column)
    gc_synthetic_train = convert_ip_to_int(gc_synthetic_train, ip_column)
    tvae_synthetic_train = convert_ip_to_int(tvae_synthetic_train, ip_column)
    test = convert_ip_to_int(test, ip_column)

In [8]:
# Every column of the real training set except 'Label' and 'Attack' is used
# as a feature (original column order is kept)
feature_columns = list(
    filter(lambda name: name not in ['Label', 'Attack'], real_train.columns)
)

# Split each dataset into its feature matrix (X_*) and its 'Label' target (y_*)
X_real_train, y_real_train = real_train[feature_columns], real_train['Label']

X_ctgan_synthetic_train, y_ctgan_synthetic_train = (
    ctgan_synthetic_train[feature_columns],
    ctgan_synthetic_train['Label'],
)

X_gc_synthetic_train, y_gc_synthetic_train = (
    gc_synthetic_train[feature_columns],
    gc_synthetic_train['Label'],
)

X_tvae_synthetic_train, y_tvae_synthetic_train = (
    tvae_synthetic_train[feature_columns],
    tvae_synthetic_train['Label'],
)

X_test, y_test = test[feature_columns], test['Label']

In [9]:
# One identically configured random forest per training set
real_clf, ctgan_clf, gc_clf, tvae_clf = (
    RandomForestClassifier(random_state=42) for _ in range(4)
)

# Fit each forest on its own training set: the real data and the
# CTGAN, Gaussian Copula and TVAE synthetic data
real_clf.fit(X=X_real_train, y=y_real_train)
ctgan_clf.fit(X=X_ctgan_synthetic_train, y=y_ctgan_synthetic_train)
gc_clf.fit(X=X_gc_synthetic_train, y=y_gc_synthetic_train)
tvae_clf.fit(X=X_tvae_synthetic_train, y=y_tvae_synthetic_train)

In [11]:
# Predict the test-set labels with each of the four trained classifiers
real_predictions, ctgan_predictions, gc_predictions, tvae_predictions = (
    clf.predict(X_test) for clf in (real_clf, ctgan_clf, gc_clf, tvae_clf)
)

# Per-model classification reports, as nested dicts
real_report, ctgan_report, gc_report, tvae_report = (
    classification_report(y_test, predicted, output_dict=True)
    for predicted in (real_predictions, ctgan_predictions, gc_predictions, tvae_predictions)
)

# Per-model accuracy
real_accuracy, ctgan_accuracy, gc_accuracy, tvae_accuracy = (
    accuracy_score(y_test, predicted)
    for predicted in (real_predictions, ctgan_predictions, gc_predictions, tvae_predictions)
)

# Summary table: accuracy plus the 'weighted avg' precision / recall / F1
# taken from each report
results = {
    'Model': ['Real Data Classifier', 'CTGAN Data Classifier', 'Gaussian Copula Data Classifier', 'TVAE Data Classifier'],
    'Accuracy': [real_accuracy, ctgan_accuracy, gc_accuracy, tvae_accuracy],
    **{
        heading: [
            report['weighted avg'][metric]
            for report in (real_report, ctgan_report, gc_report, tvae_report)
        ]
        for heading, metric in (
            ('Precision', 'precision'),
            ('Recall', 'recall'),
            ('F1-Score', 'f1-score'),
        )
    },
}

# Show the summary as a DataFrame
results_df = pd.DataFrame(results)
print("\nClassifier Performance Comparison:", results_df, sep="\n")

# Confusion matrix of every classifier on the test set
for heading, predicted in (
    ("\nConfusion Matrix - Real Data Classifier:", real_predictions),
    ("\nConfusion Matrix - CTGAN Data Classifier:", ctgan_predictions),
    ("\nConfusion Matrix - Gaussian Copula Data Classifier:", gc_predictions),
    ("\nConfusion Matrix - TVAE Data Classifier:", tvae_predictions),
):
    print(heading, confusion_matrix(y_test, predicted), sep="\n")


Classifier Performance Comparison:
                             Model  Accuracy  Precision  Recall  F1-Score
0             Real Data Classifier    0.9885   0.989341  0.9885  0.988812
1            CTGAN Data Classifier    0.9575   0.963649  0.9575  0.960197
2  Gaussian Copula Data Classifier    0.9630   0.964370  0.9630  0.945336
3             TVAE Data Classifier    0.9695   0.964269  0.9695  0.962725

Confusion Matrix - Real Data Classifier:
[[1909   16]
 [   7   68]]

Confusion Matrix - CTGAN Data Classifier:
[[1871   54]
 [  31   44]]

Confusion Matrix - Gaussian Copula Data Classifier:
[[1925    0]
 [  74    1]]

Confusion Matrix - TVAE Data Classifier:
[[1918    7]
 [  54   21]]
