## Isolation Forest for Anomaly Detection
**Objective**: Understand and apply the Isolation Forest algorithm to identify anomalies in datasets.

### Task: Anomaly Detection in Network Traffic
**Steps**:
1. Extract Features from Dataset:
    - Load `network_traffic.csv`.
2. Isolation Forest Model
3. Display Anomalies

In [None]:
# write your code from here
import pandas as pd
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns

def detect_anomalies_isolation_forest(df, contamination=0.05):
    """
    Detect anomalies in a network traffic dataset using the Isolation Forest algorithm.

    Args:
        df (pd.DataFrame): The input DataFrame containing network traffic data.
            Every column must have a numeric dtype. The caller's DataFrame is
            left unmodified.
        contamination (float, optional): The proportion of outliers in the data set,
            passed through to ``IsolationForest``. Defaults to 0.05.

    Returns:
        pd.DataFrame: A copy of the input DataFrame with an added 'anomaly' column,
            where 1 indicates an anomaly and 0 indicates a normal data point.
            Returns an empty DataFrame if fitting or prediction fails.

    Raises:
        ValueError: If ``df`` is empty or contains a non-numeric column.
    """
    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    # Validate the column dtypes, not the column labels: labels are usually
    # strings, so testing the labels themselves would reject valid numeric data.
    if not all(pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes):
        raise ValueError("All columns must be numeric.")

    try:
        # Fixed random_state keeps the labelling reproducible between runs.
        model = IsolationForest(contamination=contamination, random_state=42)
        model.fit(df)
        raw_labels = model.predict(df)

        # Isolation Forest reports 1 for inliers; map that to 0 and everything
        # else (the outlier label) to 1.
        anomaly_flags = (raw_labels != 1).astype(int)
    except Exception as e:
        # Documented best-effort contract: report the failure and hand back an
        # empty DataFrame so callers can test `.empty`.
        print(f"Error during anomaly detection: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

    # Work on a copy so the caller's DataFrame does not gain a column as a
    # side effect (and slices of other frames do not trigger pandas warnings).
    result = df.copy()
    result['anomaly'] = anomaly_flags
    return result


def visualize_anomalies(df, x_col, y_col, title="Anomaly Detection using Isolation Forest"):
    """
    Draw a scatter plot of two columns, coloured by the 'anomaly' flag.

    Args:
        df (pd.DataFrame): The DataFrame containing the data and the 'anomaly' column
            (0 is drawn in blue, 1 in red).
        x_col (str): The name of the column to be used for the x-axis.
        y_col (str): The name of the column to be used for the y-axis.
        title (str, optional): Title of the plot. Defaults to
            "Anomaly Detection using Isolation Forest".

    Returns:
        None. An empty DataFrame only prints a warning; otherwise the figure
        is shown with ``plt.show()``.

    Raises:
        KeyError: If the 'anomaly' column, ``x_col`` or ``y_col`` is missing.
    """
    if df.empty:
        print("Warning: Empty DataFrame. No anomalies to visualize.")
        return

    if 'anomaly' not in df.columns:
        raise KeyError("The DataFrame must contain an 'anomaly' column.")

    missing = [col for col in (x_col, y_col) if col not in df.columns]
    if missing:
        # Name only the columns that are actually absent, so the message does
        # not implicate a column that is present.
        missing_names = ", ".join(f"'{col}'" for col in missing)
        raise KeyError(
            f"Missing column(s) {missing_names}: "
            f"'{x_col}' and '{y_col}' must both be present in the DataFrame."
        )

    # Create the scatter plot
    plt.figure(figsize=(10, 6))
    ax = sns.scatterplot(
        x=df[x_col], y=df[y_col], hue=df['anomaly'], palette={0: 'blue', 1: 'red'}
    )
    plt.title(title)
    plt.xlabel(x_col)
    plt.ylabel(y_col)

    # Rebuild the legend from the handles seaborn registered for each hue
    # level. Passing only `labels=` to legend() pairs the names with the axes'
    # artists by position, which can attach the wrong name or colour.
    handles, raw_labels = ax.get_legend_handles_labels()
    display_names = {'0': 'Normal', '1': 'Anomaly'}
    ax.legend(
        handles=handles,
        labels=[display_names.get(label, label) for label in raw_labels],
        title='Anomaly',
    )
    plt.show()



def main(csv_path="network_traffic.csv"):
    """
    Run the end-to-end demo: load the traffic CSV, flag anomalies, plot them.

    Every failure is reported with a printed message followed by an early
    return. Calling ``exit()`` is avoided because it is not a reliable way to
    stop a notebook cell: execution can carry on into the later steps with an
    empty DataFrame.

    Args:
        csv_path (str, optional): Path of the CSV file to analyse.
            Defaults to "network_traffic.csv".
    """
    # 1. Load Dataset
    try:
        df = pd.read_csv(csv_path)  # Load the dataset
    except FileNotFoundError:
        print(f"Error: '{csv_path}' not found. Please ensure the file is in the correct directory.")
        return
    except Exception as e:
        # Script boundary: any other load failure (parse error, empty file,
        # permissions, ...) is reported rather than shown as a traceback.
        print(f"Error loading the dataset: {e}")
        return

    # Check if the dataframe is empty
    if df.empty:
        print("Error: The dataframe is empty after loading the dataset.  Check the file path and content.")
        return

    # Display the first few rows of the dataframe
    print("First 5 rows of the dataframe:")
    print(df.head())

    # Get information about the columns in the dataframe.
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of being wrapped in print().
    print("\nColumn information:")
    df.info()

    # 2. Feature Selection
    # Select numerical features for anomaly detection.  You might need to adjust this
    # based on your dataset.  This example assumes these columns are relevant.
    numerical_features = ['Total Packets', 'Total Bytes', 'Avg Packet Size', 'Packet Rate', 'Byte Rate']
    try:
        df_selected = df[numerical_features]
    except KeyError as e:
        print(f"Error:  {e}.  Please ensure the columns are present in the DataFrame.")
        return

    # 3. Apply Isolation Forest
    # A copy is passed so the detector never writes into a slice of `df`.
    try:
        df_with_anomalies = detect_anomalies_isolation_forest(df_selected.copy(), contamination=0.1)
    except ValueError as e:
        # Raised by the detector's input validation (e.g. non-numeric features).
        print(f"Error during anomaly detection: {e}")
        return

    if df_with_anomalies.empty:
        print("No anomalies detected or error occurred during processing.")
        return

    # Attach the labels to the full original table so the non-feature columns
    # are kept alongside them; assignment aligns on the shared row index.
    df_final = df.assign(anomaly=df_with_anomalies['anomaly'])

    # 4. Display Anomalies
    print("\nDataFrame with Anomaly Labels:")
    print(df_final.head())

    # Visualize the anomalies (choose appropriate columns for visualization)
    visualize_anomalies(df_final, 'Total Packets', 'Total Bytes', title='Anomalies in Network Traffic Data')
    visualize_anomalies(df_final, 'Packet Rate', 'Byte Rate', title='Anomalies in Network Traffic Data')


if __name__ == "__main__":
    main()


Error: 'network_traffic.csv' not found. Please ensure the file is in the correct directory.
Error: The dataframe is empty after loading the dataset.  Check the file path and content.
First 5 rows of the dataframe:
Empty DataFrame
Columns: []
Index: []

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
None
Error:  "None of [Index(['Total Packets', 'Total Bytes', 'Avg Packet Size', 'Packet Rate',\n       'Byte Rate'],\n      dtype='object')] are in the [columns]".  Please ensure the columns are present in the DataFrame.
Error: No numerical features selected.  Anomaly detection cannot be performed.


ValueError: Input DataFrame is empty.

: 