<a href="https://colab.research.google.com/github/KshitijShinde/Skill/blob/main/Skill_LCA_FINAL_COMBINE_DATASET.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

📂 Load and Merge BoT-IoT Dataset CSV Files

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

# === Load and Merge CSVs ===
# Read the four "Full5pc" CSV parts, stack them into one frame with a fresh
# 0..n-1 index, then drop exact duplicate rows (which leaves gaps in the index).
df1 = pd.read_csv(r"E:\bot iot\All features\UNSW_2018_IoT_Botnet_Full5pc_1.csv")
df2 = pd.read_csv(r"E:\bot iot\All features\UNSW_2018_IoT_Botnet_Full5pc_2.csv")
df3 = pd.read_csv(r"E:\bot iot\All features\UNSW_2018_IoT_Botnet_Full5pc_3.csv")
df4 = pd.read_csv(r"E:\bot iot\All features\UNSW_2018_IoT_Botnet_Full5pc_4.csv")
df = pd.concat([df1, df2, df3, df4], ignore_index=True).drop_duplicates()

# === Dataset Information ===
print("Dataset Information:")
df.info()  # Shows the data types, non-null counts, and memory usage

# === Preview of Dataset ===
print("\nFirst 5 Rows of the Dataset:")
print(df.head())  # Shows the first 5 rows

# === Preprocessing ===
target_column = 'attack'
# Keep every numeric column PLUS the target, in the original column order.
# Filtering with select_dtypes alone would silently discard a string-valued
# target, making the drop() below raise KeyError and the LabelEncoder branch
# unreachable. For a numeric target this selects exactly the same columns.
numeric_columns = set(df.select_dtypes(include=[np.number]).columns)
kept_columns = [
    col for col in df.columns
    if col in numeric_columns or col == target_column
]
# Treat +/-inf as missing, then drop every row that has any missing value.
df = df[kept_columns].replace([np.inf, -np.inf], np.nan).dropna()
X = df.drop(target_column, axis=1)
y = df[target_column]
if not pd.api.types.is_numeric_dtype(y):
    # Non-numeric labels (object, string, category, ...) become integer codes.
    le = LabelEncoder()
    y = le.fit_transform(y)
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows contribute to the scaling statistics. Confirm whether that is
# acceptable for the models trained later in this notebook.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# === Balance Data ===
# Undersample first, then hold out 20% of the balanced data for testing.
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_scaled, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


📊 Class Distribution Before and After Undersampling


In [None]:
# === Visualization 1: Class Distribution ===
# Two side-by-side histograms of the target: the full data on the left and
# the undersampled data on the right.
fig = plt.figure(figsize=(12, 5))
panels = (
    (1, y, 'orange', 'Before Undersampling'),
    (2, y_resampled, 'green', 'After Undersampling'),
)
for position, labels, colour, heading in panels:
    ax = fig.add_subplot(1, 2, position)
    ax.hist(labels, bins=30, color=colour, alpha=0.7)
    ax.set_title(heading)
fig.tight_layout()
plt.show()


🔥 Correlation Heatmap of Features

In [None]:
# === Visualization 2: Correlation Heatmap ===
# Rebuild a labelled frame from the scaled feature array and append the target
# so that feature-vs-target correlations show up in the heatmap.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
# X_scaled_df gets a fresh 0..n-1 index, whereas y (when it is a Series) still
# carries the gapped index left behind by drop_duplicates()/dropna(). Assigning
# the Series directly would align on index labels, attaching labels to the
# wrong rows and introducing NaN. Assign by position instead; X_scaled and y
# have the same row order and length.
X_scaled_df[target_column] = np.asarray(y)
plt.figure(figsize=(15, 10))
sns.heatmap(X_scaled_df.corr(), cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()


📦 Boxplot for Distribution of Sample Features

In [None]:
# === Visualization 3: Boxplot for Feature Distributions (sample of 5 scaled features) ===
boxplot_columns = ['pkts', 'bytes', 'dur', 'mean', 'stddev']
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=X_scaled_df[boxplot_columns], ax=ax)
ax.set_title('Boxplot for Feature Distributions')
plt.show()

In [None]:
# === Visualization 4: Feature Distribution (Histograms for Selected Features) ===
# One histogram per listed feature, drawn from the unscaled feature frame X
# and laid out on a 2x3 grid (the sixth slot stays unused).
selected_features = ['pkts', 'bytes', 'seq', 'dur', 'spkts']  # Modify with relevant features
fig = plt.figure(figsize=(12, 8))
for slot, feature in enumerate(selected_features, start=1):
    ax = fig.add_subplot(2, 3, slot)
    ax.hist(X[feature], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'{feature} Distribution')
fig.tight_layout()
plt.show()

📊 Class Distribution After Undersampling (Count Plot)

In [None]:
# Bar chart of how many samples each class has in the undersampled target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x=y_resampled, ax=ax)  # For the resampled target variable
ax.set_title('Class Distribution (After Undersampling)')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

🌲 Embedded Feature Selection using Random Forest

In [None]:
# === Feature Importance (Embedded) ===
# Fit a seeded random forest on the training split and read the per-feature
# importances it exposes.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
feat_importance = rf_model.feature_importances_

# Chart 1: importance of every feature, labelled with the column names of X.
bar_positions = range(len(feat_importance))
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, feat_importance)
ax.set_yticks(bar_positions)
ax.set_yticklabels(X.columns)
ax.set_title('Random Forest Feature Importance')
plt.show()

# Chart 2: only the top_n most important features. argsort is ascending, so
# the last top_n positions hold the largest importances.
top_n = 10
top_indices = np.argsort(feat_importance)[-top_n:]
top_features_embedded = X.columns[top_indices]
top_importances = feat_importance[top_indices]
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features_embedded, top_importances, color='darkorange')
ax.set_title(f'Top {top_n} Embedded Features')
plt.show()
print(f"Embedded Feature Selection - Top {top_n} Features:\n{top_features_embedded}")


✅ Feature Selection using SelectKBest (Filter Method)

In [None]:
# === Feature Selection: SelectKBest ===
# Filter method: score every feature with f_classif against the training
# labels and keep the 10 highest-scoring columns.
selector = SelectKBest(score_func=f_classif, k=10)
X_kbest = selector.fit_transform(X_train, y_train)

# Boolean mask over the input columns marking which features were kept.
kbest_mask = selector.get_support()
kbest_features = X.columns[kbest_mask]
kbest_scores = selector.scores_[kbest_mask]

# Horizontal bar chart of the scores of the kept features.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(kbest_features, kbest_scores, color='purple')
ax.set_title("SelectKBest Feature Scores")
plt.show()

# Text summary of the kept features and their scores.
print(f"Selected Features using SelectKBest:\n{kbest_features}")
print(f"Scores of Selected Features:\n{kbest_scores}")


🔍 Feature Selection using RFE (Wrapper Method)

In [None]:
# === Feature Selection: RFE ===
# Wrapper method: recursively eliminate features using a random forest until
# 10 remain. The forest is seeded like every other model in this notebook so
# that the selected subset is reproducible from run to run.
rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=10)
X_rfe = rfe.fit_transform(X_train, y_train)
selected_features_rfe = X.columns[rfe.support_]
print(f"Selected Features using RFE: {selected_features_rfe}")

# ranking_ assigns 1 to every selected feature and larger values to features
# eliminated earlier.
ranking = rfe.ranking_

# Sort by rank. A stable sort keeps tied features (all selected ones share
# rank 1) in their original column order instead of an arbitrary one.
sorted_idx = np.argsort(ranking, kind='stable')
top_features_rfe = X.columns[sorted_idx][:top_n]
top_ranking = ranking[sorted_idx][:top_n]

# Plotting the rank of the selected features would draw identical bars of
# length 1. Show instead the importances of the forest that RFE refit on the
# selected features (rfe.estimator_), ordered from most to least important.
rfe_importances = rfe.estimator_.feature_importances_
importance_order = np.argsort(rfe_importances)[::-1]

plt.figure(figsize=(10, 6))
plt.barh(selected_features_rfe[importance_order], rfe_importances[importance_order], color='teal')
plt.title(f'Top {len(selected_features_rfe)} Features Selected by RFE')
plt.xlabel('Importance (forest refit on selected features)')
plt.ylabel('Feature')
plt.gca().invert_yaxis()  # Invert the y-axis so that the top feature is at the top
plt.show()


In [None]:
from sklearn.metrics import accuracy_score

# ========== Compare Feature Selection Methods ==========
# Training matrices reduced to the features chosen by each selection method.
models = {
    "Embedded": X_train[:, top_indices],  # top features from embedded method
    "SelectKBest": X_kbest,
    "RFE": X_rfe,
}
# Matching test matrices, reduced with the same column choice per method.
test_views = {
    "Embedded": X_test[:, top_indices],
    "SelectKBest": selector.transform(X_test),
    "RFE": rfe.transform(X_test),
}

# Train one seeded gradient-boosting model per method and record its
# accuracy on the held-out split.
accuracies = {}
for name, X_fs in models.items():
    clf = GradientBoostingClassifier(random_state=42)
    clf.fit(X_fs, y_train)
    accuracies[name] = accuracy_score(y_test, clf.predict(test_views[name]))

# Plotting comparison
method_names = list(accuracies)
method_scores = list(accuracies.values())
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(method_names, method_scores, color=['darkorange', 'purple', 'teal'])
ax.set_title("Model Accuracy Based on Feature Selection Methods")
ax.set_ylabel("Accuracy")
ax.set_ylim(0, 1)
# Print each accuracy just above its bar.
for bar_index, score in enumerate(method_scores):
    ax.text(bar_index, score + 0.01, f"{score:.2f}", ha='center', fontsize=12)
fig.tight_layout()
plt.show()


🧠 Ensemble Learning: Bagging Classifier with Decision Trees

In [None]:
# === Bagging Classifier ===
# Bag 50 decision trees, train on the training split, and report per-class
# metrics plus a confusion matrix on the held-out split.
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42,
)
bagging_model.fit(X_train, y_train)
y_pred_bagging = bagging_model.predict(X_test)

bagging_report = classification_report(y_test, y_pred_bagging)
print("\nBagging Model - Classification Report:")
print(bagging_report)

bagging_confusion = confusion_matrix(y_test, y_pred_bagging)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(bagging_confusion, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix - Bagging")
plt.show()


🚀 Ensemble Learning: Gradient Boosting Classifier Evaluation


In [None]:
# === Boosting Classifier ===
# Train a seeded gradient-boosting model on the training split, then report
# per-class metrics plus a confusion matrix on the held-out split.
boosting_model = GradientBoostingClassifier(random_state=42)
boosting_model.fit(X_train, y_train)
y_pred_boosting = boosting_model.predict(X_test)

boosting_report = classification_report(y_test, y_pred_boosting)
print("\nBoosting Model - Classification Report:")
print(boosting_report)

boosting_confusion = confusion_matrix(y_test, y_pred_boosting)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(boosting_confusion, annot=True, fmt='d', cmap='Greens', ax=ax)
ax.set_title("Confusion Matrix - Boosting")
plt.show()
