# In [None]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Step 2: Load and Inspect the Dataset
# Read the CSV into a DataFrame; every later step works from `data`.
data = pd.read_csv('your_dataset.csv')  # Replace with your actual file path

# Display the first few rows of the dataset
print(data.head())

# Display dataset information.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Check for missing values (number of nulls per column)
print(data.isnull().sum())

# Step 3: Data Preprocessing (Handling Missing Values & Selecting Numerical Features)
# Drop rows with missing values. The explicit copy makes `data_cleaned` an
# independent frame, so adding columns to it later cannot trigger pandas'
# SettingWithCopyWarning.
data_cleaned = data.dropna().copy()

# Select numerical features for anomaly detection.
# The ground-truth column read in Step 7 ('TrueLabel') is excluded: if it is
# numeric it would otherwise be fed to the model as a feature and leak the
# answer. errors='ignore' keeps this a no-op when the column is absent or
# non-numeric. Keep this name in sync with the label column used in Step 7.
numerical_features = data_cleaned.select_dtypes(include=[np.number]).drop(
    columns=['TrueLabel'], errors='ignore'
)

# Fail early with a clear message instead of an opaque error from describe()
# or the scaler when there is nothing left to model.
if numerical_features.empty:
    raise ValueError(
        'No numerical data left after dropping rows with missing values; '
        'cannot run anomaly detection.'
    )

# Display statistical summary of numerical features
print(numerical_features.describe())

# Step 4: Feature Scaling (Optional, if features vary in scale)
# StandardScaler standardizes each column to zero mean and unit variance.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numerical_features)

# Convert scaled data back into DataFrame. The original row index is kept so
# each scaled row stays label-aligned with its row in `data_cleaned`.
scaled_data = pd.DataFrame(
    scaled_features,
    columns=numerical_features.columns,
    index=numerical_features.index,
)

# Step 5: Apply Isolation Forest for Anomaly Detection
# Initialize the Isolation Forest model.
# contamination=0.05 is the expected share of anomalies in the data;
# random_state=42 fixes the model's randomness so runs are reproducible.
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fit the model and label the same rows in one call.
# The labels use -1 for an anomaly and 1 for a normal point (the values the
# visualization and evaluation steps below test for).
anomaly_labels = iso_forest.fit_predict(scaled_data)

# Add the anomaly labels to the cleaned data.
# `anomaly_labels` is a plain array, so it is attached by position; the rows of
# `scaled_data` are in the same order as the rows of `data_cleaned`.
# assign() builds a new DataFrame instead of writing into the dropna() result
# in place, which avoids pandas' SettingWithCopyWarning.
data_cleaned = data_cleaned.assign(Anomaly=anomaly_labels)

# Step 6: Visualize Anomalies
# Columns plotted on each axis (adjust 'Feature1' and 'Feature2' to actual feature names)
x_col = 'Feature1'
y_col = 'Feature2'

# Split the labelled rows: -1 marks an anomaly, 1 marks a normal point
anomaly_column = data_cleaned['Anomaly']
anomalies = data_cleaned[anomaly_column == -1]
normal_data = data_cleaned[anomaly_column == 1]

# Scatter both groups on one figure; normal points are drawn first so the
# anomalies are rendered on top of them.
plt.figure(figsize=(10, 6))
for subset, colour, legend_name in (
    (normal_data, 'blue', 'Normal'),
    (anomalies, 'red', 'Anomaly'),
):
    plt.scatter(subset[x_col], subset[y_col], c=colour, label=legend_name)

plt.xlabel(x_col)
plt.ylabel(y_col)
plt.title('Anomaly Detection using Isolation Forest')
plt.legend()
plt.show()

# Step 7: Evaluate the Results (if ground truth labels are available)
# Ground truth is read from the 'TrueLabel' column.
# NOTE(review): assumes that column encodes 1 = anomaly, 0 = normal, matching
# the recoding below -- confirm against the dataset.
true_labels = data_cleaned['TrueLabel']  # Replace with your actual true label column

# Recode the Isolation Forest output (-1 = anomaly, 1 = normal) into the
# evaluation convention (1: Anomaly, 0: Normal)
flagged_as_anomaly = data_cleaned['Anomaly'].eq(-1)
predicted_labels = np.where(flagged_as_anomaly, 1, 0)

# Build the per-class precision/recall/F1 summary and print it
report = classification_report(true_labels, predicted_labels)
print(report)