In [None]:
!pip install pandas
!pip install imbalanced-learn
!pip install scikit-learn
!pip install gdown
import gdown

In [None]:
import os

import pandas as pd

# Folder that holds the eight CSV parts; edit this single value to relocate the data.
DATA_DIR = 'C:/Jupyter notebook'

# dataset_1.csv ... dataset_8.csv inside DATA_DIR.
dataset_paths = [f'{DATA_DIR}/dataset_{part}.csv' for part in range(1, 9)]

# Check each path once: report the missing ones and keep the ones that exist.
available_paths = []
for path in dataset_paths:
    if os.path.exists(path):
        available_paths.append(path)
    else:
        print(f"File not found: {path}")

# pd.concat() on an empty list fails with an unhelpful "No objects to
# concatenate"; stop here with a message that names the real problem.
if not available_paths:
    raise FileNotFoundError(
        f"None of the {len(dataset_paths)} expected dataset files exist in {DATA_DIR!r}"
    )

# Missing files are skipped (best effort); every file that exists is loaded.
dataframes = [pd.read_csv(path) for path in available_paths]

# Stack all parts into one frame with a fresh 0..n-1 index.
combined_dataset = pd.concat(dataframes, ignore_index=True)

In [None]:
# Display summary statistics of the combined frame as the cell's rich output.
combined_dataset.describe()

In [None]:
# Overview of the combined dataset: first rows, structure, summary statistics.
print(combined_dataset.head())

# DataFrame.info() writes its report (dtypes, non-null counts) to stdout and
# returns None, so it is called directly; wrapping it in print() would emit a
# stray "None" line after the report.
combined_dataset.info()

print(combined_dataset.describe())  # Summary statistics

In [None]:
# Count the missing entries in every column of the already-loaded frame.
missing_values = combined_dataset.isna().sum()
print("Missing values in each feature:\n", missing_values)

# Show the dtype pandas assigned to each column.
print(combined_dataset.dtypes)

# Descriptive statistics of the frame.
print(combined_dataset.describe())


In [None]:
# List the column labels exactly as they appear in the combined frame.
print(combined_dataset.columns)


In [None]:
# Classes with fewer samples than this are reported as minority classes.
MINORITY_THRESHOLD = 1000

# Distribution of the target class. The column is addressed as ' Label'
# (with a leading space). Count once and reuse the result instead of
# re-scanning the whole frame for every use.
label_counts = combined_dataset[' Label'].value_counts()
print(label_counts)

# Identify the minority classes.
minority_classes = label_counts[label_counts < MINORITY_THRESHOLD]
print("Minority attack classes:\n", minority_classes)


In [None]:
# Clean-up in one chained pass: discard every row that contains a missing
# value, then discard rows that are exact duplicates of an earlier row.
combined_dataset = combined_dataset.dropna().drop_duplicates()

# Report the size of what is left.
print("Data shape after removing duplicates:", combined_dataset.shape)


In [None]:
import numpy as np
import pandas as pd

# Automatically detect numerical columns
numerical_cols = combined_dataset.select_dtypes(include=['float64', 'int64']).columns

# Diagnostics: how many NaN and +/-inf entries each numerical column holds.
print("NaN values in dataset:\n", combined_dataset[numerical_cols].isna().sum())
print("Infinity values in dataset:\n", np.isinf(combined_dataset[numerical_cols]).sum())

# Treat +/-inf as missing FIRST, so the column means used for imputation are
# computed from finite values only (a mean taken while inf is still present is
# itself inf or NaN). Then fill every missing entry with its column mean.
numeric_finite = combined_dataset[numerical_cols].replace([np.inf, -np.inf], np.nan)
combined_dataset[numerical_cols] = numeric_finite.fillna(numeric_finite.mean())


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Resolve the target column by its stripped name. Earlier cells address it as
# ' Label' (leading space), so a plain 'Label' lookup raises KeyError; matching
# on the stripped name works for either spelling.
label_columns = [col for col in combined_dataset.columns if str(col).strip() == 'Label']
if len(label_columns) != 1:
    raise KeyError(f"Expected exactly one 'Label' column, found: {label_columns}")
target_col = label_columns[0]

# Feature matrix X (everything except the target) and target variable y.
X = combined_dataset.drop(columns=target_col)
y = combined_dataset[target_col]

# Encode the class labels as integers.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Standardize the features.
# NOTE(review): this assumes every remaining column is numeric - confirm.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the scaled data with SMOTE to balance the classes. The class is
# spelled `SMOTE`, matching the import above; `SMote` is an undefined name.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_encoded)

# Check the new class distribution after SMOTE
new_class_distribution = pd.Series(y_resampled).value_counts()
print("New Class Distribution after SMOTE:\n", new_class_distribution)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled data for testing; the fixed seed keeps the
# partition reproducible.
# NOTE(review): the split is taken after oversampling, so the held-out part is
# drawn from the resampled data - confirm this is the intended evaluation set-up.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

print("Training and Test sets created successfully.")


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed and fit it on the training
# split in one step; fit() returns the fitted estimator itself.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

print("Random Forest model trained on all features.")


In [None]:
# Take the feature names from X, the frame that was actually scaled and
# resampled, rather than assuming the label is the last column of
# combined_dataset. A length mismatch then fails loudly instead of silently
# attaching importances to the wrong names.
feature_names = X.columns

# Importance score of each feature according to the trained forest.
importances = rf_model.feature_importances_

# Pair every feature name with its importance.
feature_importances_df = pd.DataFrame({'Feature': feature_names,
                                       'Importance': importances})

# Sort features by importance in descending order and select the top 20 features
top_20_features = feature_importances_df.sort_values(by='Importance', ascending=False).head(20)['Feature'].values
print("Top 20 most important features:\n", top_20_features)

# Restrict the resampled matrix to those 20 columns and split it with the same
# proportions and seed as the full-feature split.
X_resampled_top_20 = pd.DataFrame(X_resampled, columns=feature_names)[top_20_features].values
X_train_top_20, X_test_top_20, y_train_top_20, y_test_top_20 = train_test_split(
    X_resampled_top_20, y_resampled, test_size=0.2, random_state=42
)


In [None]:
# Fit a second forest, configured like the first (100 trees, seed 42), on the
# training split that keeps only the 20 selected features.
rf_model_top_20 = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_top_20, y_train_top_20)

print("Random Forest model retrained using top 20 features.")


In [None]:
from sklearn.metrics import classification_report

# Class predictions of the top-20-feature forest on its held-out split.
y_pred_top_20 = rf_model_top_20.predict(X_test_top_20)

# Build the text report first, then print it under its heading.
report_top_20 = classification_report(y_test_top_20, y_pred_top_20)
print("Classification Report with Top 20 Features:\n", report_top_20)


In [None]:
# pyplot is imported here because no other cell brings `plt` into scope;
# matplotlib is already required by seaborn, so this adds no new package.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs predicted classes on the held-out split.
conf_matrix_top_20 = confusion_matrix(y_test_top_20, y_pred_top_20)
print("Confusion Matrix with Top 20 Features:\n", conf_matrix_top_20)

# Draw the matrix as an annotated heatmap on an explicit 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_top_20, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix with Top 20 Features')
ax.set_ylabel('Actual Class')
ax.set_xlabel('Predicted Class')
plt.show()


In [None]:
from sklearn.metrics import roc_auc_score

# Per-class probabilities predicted by the top-20-feature forest.
proba_top_20 = rf_model_top_20.predict_proba(X_test_top_20)

# Multi-class ROC-AUC using the one-vs-rest scheme.
roc_score_top_20 = roc_auc_score(y_test_top_20, proba_top_20, multi_class='ovr')
print(f"ROC-AUC Score with Top 20 Features: {roc_score_top_20:.4f}")
