In [18]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [10]:
#XGBoost:
# Baseline pipeline: one-hot encode the categorical features, rebalance the
# training split with SMOTE, fit a shallow XGBoost model, and report metrics
# on the held-out split.

# Load the data
df = pd.read_csv('C:/Users/kashi/OneDrive/Desktop/cybersecurity_attacks.csv')

# Separate features (X) and labels (y); 'Attack Type' is used as the label column
X = df.drop('Attack Type', axis=1)
y = df['Attack Type']

# Encode categorical labels to numeric format; the fitted encoder is reused
# below to turn predictions back into the original class names
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded. handle_unknown='ignore' lets the test split contain categories that
# were not present in the training split without raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Fit the preprocessor on the training data only and encode it
X_train = preprocessor.fit_transform(X_train)

# Instantiate SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data only to avoid data leakage
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Instantiate XGBoost Classifier
# Set the appropriate hyperparameters based on your dataset and preferences
model = XGBClassifier(n_estimators=100, max_depth=3, random_state=42, n_jobs=-1)

# Train the model on the resampled data
model.fit(X_resampled, y_resampled)

# Apply the already-fitted preprocessing to the test data
X_test = preprocessor.transform(X_test)

# Predict on the test data
y_pred = model.predict(X_test)

# Inverse transform the labels to get back the original categorical labels
y_test_original = label_encoder.inverse_transform(y_test)
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluate the model on the decoded labels, the same way the other XGBoost
# cells in this notebook do
print("Accuracy:", accuracy_score(y_test_original, y_pred_original))
print("\nClassification Report:\n", classification_report(y_test_original, y_pred_original, target_names=label_encoder.classes_))

Accuracy: 0.329

Classification Report:
               precision    recall  f1-score   support

        DDoS       0.33      0.68      0.44      2636
   Intrusion       0.31      0.14      0.20      2721
     Malware       0.35      0.17      0.23      2643

    accuracy                           0.33      8000
   macro avg       0.33      0.33      0.29      8000
weighted avg       0.33      0.33      0.29      8000



In [16]:
#XGBoost 2: 	BEST IN XG
# Pipeline: one-hot encoding -> TruncatedSVD (50 components) -> SMOTE -> XGBoost.

# --- Data -------------------------------------------------------------------
df = pd.read_csv('C:/Users/kashi/OneDrive/Desktop/cybersecurity_attacks.csv')

# Every column except 'Attack Type' is a feature; 'Attack Type' is the label.
X = df.drop(columns='Attack Type')

# Integer-encode the labels; the fitted encoder is kept to decode predictions.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Attack Type'])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Feature encoding -------------------------------------------------------
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Numeric columns are passed through; object columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Fitted on the training split, then reused as-is for the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# --- Dimensionality reduction -----------------------------------------------
# The SVD is likewise fitted on the training split and applied to both splits.
svd = TruncatedSVD(n_components=50, random_state=42)
X_train_svd = svd.fit_transform(X_train_transformed)
X_test_svd = svd.transform(X_test_transformed)

# --- Resampling -------------------------------------------------------------
# SMOTE only ever sees training data, so the test split stays untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_svd, y_train)

# --- Model ------------------------------------------------------------------
model = XGBClassifier(n_estimators=1000, max_depth=4, random_state=42, n_jobs=-1)
model.fit(X_resampled, y_resampled)

# --- Evaluation -------------------------------------------------------------
y_pred = model.predict(X_test_svd)

# Decode both the ground truth and the predictions back to class names.
y_test_original = label_encoder.inverse_transform(y_test)
y_pred_original = label_encoder.inverse_transform(y_pred)

print("Accuracy:", accuracy_score(y_test_original, y_pred_original))
print("\nClassification Report:\n", classification_report(y_test_original, y_pred_original, target_names=label_encoder.classes_))



Accuracy: 0.33575

Classification Report:
               precision    recall  f1-score   support

        DDoS       0.33      0.34      0.34      2636
   Intrusion       0.35      0.37      0.36      2721
     Malware       0.32      0.30      0.31      2643

    accuracy                           0.34      8000
   macro avg       0.34      0.34      0.34      8000
weighted avg       0.34      0.34      0.34      8000



In [15]:
#XGBoost 3:
# Same encoding -> SVD -> SMOTE pipeline as the previous XGBoost cell, but with
# an explicit set of XGBoost hyperparameters.

# Read the dataset
df = pd.read_csv('C:/Users/kashi/OneDrive/Desktop/cybersecurity_attacks.csv')

# Features are all columns other than 'Attack Type'; that column is the target
X = df.drop(columns=['Attack Type'])
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Attack Type'])  # class names -> integer ids

# Hold out 20% of the rows for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column groups by dtype
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Pass numeric columns through and one-hot encode the object columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
)

# Encode the training split, then reduce it to 50 SVD components
X_train_transformed = preprocessor.fit_transform(X_train)
svd = TruncatedSVD(n_components=50, random_state=42)
X_train_svd = svd.fit_transform(X_train_transformed)

# Oversample the training data only, so no synthetic rows reach the test split
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_svd, y_train)

# Hyperparameters under experiment.
# NOTE(review): learning_rate=0.00001 is very small for 1000 boosting rounds -
# confirm this value is intended.
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.00001,
    'max_depth': 4,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'reg_alpha': 0.1,
    'random_state': 42,
    'n_jobs': -1,
}
model = XGBClassifier(**xgb_params)

# Fit on the resampled training data
model.fit(X_resampled, y_resampled)

# Push the test split through the already-fitted encoder and SVD
X_test_transformed = preprocessor.transform(X_test)
X_test_svd = svd.transform(X_test_transformed)

# Predict, then map integer ids back to the original class names
y_pred = model.predict(X_test_svd)
y_test_original = label_encoder.inverse_transform(y_test)
y_pred_original = label_encoder.inverse_transform(y_pred)

# Report test-set metrics
print("Accuracy:", accuracy_score(y_test_original, y_pred_original))
print("\nClassification Report:\n", classification_report(y_test_original, y_pred_original, target_names=label_encoder.classes_))



Accuracy: 0.33375

Classification Report:
               precision    recall  f1-score   support

        DDoS       0.33      0.33      0.33      2636
   Intrusion       0.34      0.44      0.39      2721
     Malware       0.33      0.22      0.27      2643

    accuracy                           0.33      8000
   macro avg       0.33      0.33      0.33      8000
weighted avg       0.33      0.33      0.33      8000



In [None]:
#RandomForest:
# One-hot encode the categorical features, rebalance the training split with
# SMOTE, fit a random forest, and report metrics on the held-out split.

# Load the data
# NOTE(review): this path differs from the one used by the other cells in this
# notebook - confirm which location is correct.
df = pd.read_csv('C:/Users/Sara Murtaza/Desktop/cybersecurity_attacks.csv')

# Separate features (X) and labels (y); 'Attack Type' is used as the label column
X = df.drop('Attack Type', axis=1)
y = df['Attack Type']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Use OneHotEncoder to convert categorical columns to numerical.
# handle_unknown='ignore' is required here: with the default ('error') the
# later preprocessor.transform(X_test) raises as soon as the test split
# contains a category that was not present in the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Fit the preprocessor on the training data only and encode it
X_train = preprocessor.fit_transform(X_train)

# Instantiate SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data only to avoid data leakage
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Instantiate a Random Forest Classifier
# n_estimators is the number of trees in the forest
# max_depth=None puts no limit on the depth of each tree
# n_jobs=-1 uses all available cores
model = RandomForestClassifier(n_estimators=500, max_depth=None, random_state=42, n_jobs=-1)

# Train the model on the resampled data
model.fit(X_resampled, y_resampled)

# Apply the already-fitted preprocessing to the test data
X_test = preprocessor.transform(X_test)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
#RandomForest 2:	BEST OVERALL
# One-hot encode the categorical features, rebalance the training split with
# SMOTE, fit a random forest, and report metrics on the held-out split.
# NOTE(review): the "BEST OVERALL" label was assigned when the encoder was
# fitted on train+test together; re-run and confirm it still holds.

# Load the data
df = pd.read_csv('C:/Users/kashi/OneDrive/Desktop/cybersecurity_attacks.csv')

# Separate features (X) and labels (y); 'Attack Type' is used as the label column
X = df.drop('Attack Type', axis=1)
y = df['Attack Type']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Use OneHotEncoder to convert categorical columns to numerical.
# handle_unknown='ignore' lets the test split contain categories that were not
# present in the training split without raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Fit the preprocessor on the training split ONLY, then apply the fitted
# encoder to the test split. Fitting on the concatenated train+test data would
# let the encoder learn from rows the model is later evaluated on.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Instantiate SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data only to avoid data leakage
X_resampled, y_resampled = smote.fit_resample(X_train_transformed, y_train)

# Instantiate a Random Forest Classifier (500 trees, unlimited depth, all cores)
model = RandomForestClassifier(n_estimators=500, max_depth=None, random_state=42, n_jobs=-1)

# Train the model on the resampled data
model.fit(X_resampled, y_resampled)

# Predict on the (already encoded) test data
y_pred = model.predict(X_test_transformed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
#RandomForest 3:
# One-hot encode the categorical features, rebalance the training split with
# SMOTE, fit a random forest, and report metrics on the held-out split.

# Load the data
df = pd.read_csv('C:/Users/kashi/OneDrive/Desktop/cybersecurity_attacks.csv')

# Separate features (X) and labels (y); 'Attack Type' is used as the label column
X = df.drop('Attack Type', axis=1)
y = df['Attack Type']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Use OneHotEncoder to convert categorical columns to numerical.
# handle_unknown='ignore' lets the test split contain categories that were not
# present in the training split without raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Fit the preprocessor on the training split ONLY, then apply the fitted
# encoder to the test split. Fitting on the concatenated train+test data would
# let the encoder learn from rows the model is later evaluated on.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Instantiate SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data only to avoid data leakage
X_resampled, y_resampled = smote.fit_resample(X_train_transformed, y_train)

# Instantiate a Random Forest Classifier (500 trees, unlimited depth, all cores).
# This is the only model fitted in this cell and the one that is evaluated below.
model = RandomForestClassifier(n_estimators=500, max_depth=None, random_state=42, n_jobs=-1)

# Train the model on the resampled data
model.fit(X_resampled, y_resampled)

# Predict on the (already encoded) test data
y_pred = model.predict(X_test_transformed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))