# Exploratory Data Analysis (EDA) for AIOps-Challenge-2020-Data-main Dataset

## Overview

This notebook explains the Exploratory Data Analysis (EDA) performed on the AIOps-Challenge-2020-Data-main dataset. The EDA script automatically analyzes every CSV file found directly in the dataset directory, as well as the CSV files packed inside ZIP archives located there, and for each file it prints a summary report and saves visualizations.

## Why EDA is Important

Exploratory Data Analysis is a critical step in any data science project. It helps us understand the data, identify patterns, detect anomalies, and make informed decisions about further analysis. In particular, EDA lets us:

- Understand the structure and characteristics of the data
- Identify missing values and potential data quality issues
- Discover relationships between variables
- Prepare the data for machine learning models

## How the EDA Script Works

For each dataset, the script performs:
1. Dataset information and schema inspection
2. Data quality checks (missing values, duplicates)
3. Descriptive statistics for numeric features
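4. Automated visualization generation (histograms for numeric columns, bar charts for categorical columns, and a correlation heatmap when there are at least two numeric columns), saved as PNG files in the `plots/` directory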

## Conclusion

This notebook provides a reproducible and automated framework for exploratory data analysis of the AIOps-Challenge-2020 dataset. The generated insights and plots serve as a foundation for subsequent modeling and experimentation.


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import zipfile

# Apply the 'seaborn-v0_8' matplotlib style sheet to every figure created below.
plt.style.use('seaborn-v0_8')

# Output directory for all generated PNG plots; created up front so that
# perform_eda_on_dataset can save into it without checking.
plots_dir = "plots"
os.makedirs(plots_dir, exist_ok=True)

def extract_csv_files_from_zip(zip_path):
    """Extract every CSV member of a ZIP archive and return the extracted paths.

    Members are written below ``temp_extracted/<archive stem>/`` so that
    equally named files coming from different archives do not overwrite
    each other. The ``.csv`` extension is matched case-insensitively and
    directory entries are ignored.

    Args:
        zip_path: Path of the ZIP archive to read.

    Returns:
        A list with the on-disk path of each extracted CSV file, in archive
        order. If the archive cannot be opened or extraction fails part-way,
        the error is printed and the files extracted so far are returned
        (possibly an empty list).
    """
    csv_files = []
    # One sub-directory per archive keeps members with identical names apart.
    target_dir = os.path.join("temp_extracted", Path(zip_path).stem)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for member in zip_ref.infolist():
                if member.is_dir():
                    continue
                if not member.filename.lower().endswith('.csv'):
                    continue
                os.makedirs(target_dir, exist_ok=True)
                # Use the path reported by extract() rather than rebuilding it
                # from the member name: extract() may normalise the name, so
                # only its return value is guaranteed to point at the file.
                extracted_path = zip_ref.extract(member, target_dir)
                csv_files.append(extracted_path)
                print(f"Extracted: {member.filename}")
    except Exception as e:
        # Deliberately best-effort: a broken archive must not stop the
        # analysis of the remaining files, so report it and carry on.
        print(f"Error extracting files from {zip_path}: {e}")
    return csv_files

def perform_eda_on_dataset(df, dataset_name, *, max_categories=20):
    """Print a textual summary of *df* and save diagnostic plots as PNG files.

    The summary covers shape, column names, dtypes, missing values per column
    and the number of duplicated rows. Up to three figures are written into
    the module-level ``plots_dir``:

    * ``<dataset_name>_distributions.png`` - a histogram with KDE for every
      numeric column (only if there is at least one numeric column);
    * ``<dataset_name>_categorical_counts.png`` - value-count bar charts for
      every object/category column (only if there is at least one);
    * ``<dataset_name>_correlation_heatmap.png`` - correlation matrix of the
      numeric columns (only if there are at least two).

    Args:
        df: The pandas DataFrame to analyse.
        dataset_name: Label used in the printed headers and as the prefix of
            the plot file names.
        max_categories: Upper bound on the number of bars drawn per
            categorical column; only the most frequent values are kept.
            Pass ``None`` to plot every distinct value.

    Every figure is closed in a ``finally`` clause, so a failure while
    plotting or saving does not leave open figures behind when the caller
    catches the exception and moves on to the next dataset.
    """
    print(f"\n=== EDA for {dataset_name} ===")
    print(f"Dataset shape: {df.shape}")
    print(f"Column names: {list(df.columns)}")
    print(f"Data types:\n{df.dtypes}")

    print(f"\nMissing values:\n{df.isnull().sum()}")
    print(f"Duplicate rows: {df.duplicated().sum()}")

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) > 0:
        print(df[numeric_columns].describe().round(2))

        fig = plt.figure(figsize=(12, 4 * len(numeric_columns)))
        try:
            for position, col in enumerate(numeric_columns, start=1):
                plt.subplot(len(numeric_columns), 1, position)
                sns.histplot(df[col], kde=True, bins=30)
                plt.title(col)
            plt.tight_layout()
            plt.savefig(os.path.join(plots_dir, f"{dataset_name}_distributions.png"))
        finally:
            plt.close(fig)

    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        fig = plt.figure(figsize=(12, 3 * len(categorical_columns)))
        try:
            for position, col in enumerate(categorical_columns, start=1):
                plt.subplot(len(categorical_columns), 1, position)
                # value_counts() sorts by frequency, so head() keeps the most
                # common values and keeps ID-like columns from producing
                # thousands of bars.
                counts = df[col].value_counts()
                if max_categories is not None:
                    counts = counts.head(max_categories)
                # A column without any non-null value has nothing to draw;
                # leave its subplot empty instead of handing an empty Series
                # to the bar plot.
                if not counts.empty:
                    counts.plot(kind='bar')
                plt.title(col)
            plt.tight_layout()
            plt.savefig(os.path.join(plots_dir, f"{dataset_name}_categorical_counts.png"))
        finally:
            plt.close(fig)

    if len(numeric_columns) > 1:
        fig = plt.figure(figsize=(10, 8))
        try:
            sns.heatmap(df[numeric_columns].corr(), annot=True, cmap='coolwarm', center=0)
            plt.title(dataset_name)
            plt.savefig(os.path.join(plots_dir, f"{dataset_name}_correlation_heatmap.png"))
        finally:
            plt.close(fig)

def main(data_dir="disertation_2026/datasets/AIOps-Challenge-2020-Data-main"):
    """Run the EDA over every CSV file found in *data_dir*.

    Only the top level of *data_dir* is scanned. Plain ``.csv`` files are
    analysed directly; ``.zip`` archives are first unpacked with
    ``extract_csv_files_from_zip`` and the CSV files they contain are
    analysed as well. Extensions are matched case-insensitively and entries
    are processed in sorted order so that repeated runs behave the same.

    Args:
        data_dir: Directory that holds the dataset. Defaults to the
            AIOps-Challenge-2020 location used by this notebook.

    A file that cannot be read or analysed is reported and skipped; the
    remaining files are still processed. If *data_dir* itself cannot be
    listed, a message is printed and the function returns.
    """
    try:
        # os.listdir() order is arbitrary; sort it for reproducible output.
        entries = sorted(os.listdir(data_dir))
    except OSError as e:
        print(f"Cannot read data directory {data_dir}: {e}")
        return

    csv_files = []
    zip_files = []
    for entry in entries:
        path = os.path.join(data_dir, entry)
        lowered = entry.lower()
        if lowered.endswith('.csv'):
            csv_files.append(path)
        elif lowered.endswith('.zip'):
            zip_files.append(path)

    for zip_file in zip_files:
        csv_files.extend(extract_csv_files_from_zip(zip_file))

    if not csv_files:
        print("No CSV files found.")
        return

    for csv_file in csv_files:
        dataset_name = Path(csv_file).stem
        try:
            df = pd.read_csv(csv_file)
            perform_eda_on_dataset(df, dataset_name)
            print(f"\n=== Completed EDA for {dataset_name} ===\n")
        except Exception as e:
            # Top-level boundary of the batch run: report the failure and
            # continue with the next file rather than aborting everything.
            print(f"Error processing {csv_file}: {e}")

# Run the full analysis with the default dataset directory when this cell executes.
main()
