# In [None]:
# =========================
# Exploratory Data Analysis Notebook (single-cell version)
# =========================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import zipfile

# Apply the seaborn-derived matplotlib theme for a cleaner, paper-ready look
plt.style.use("seaborn-v0_8")

# Folder that receives every saved figure; created up front if it is missing
plots_dir = "plots"
Path(plots_dir).mkdir(parents=True, exist_ok=True)

def extract_csv_files_from_zip(zip_path):
    """
    Extract every CSV member of a ZIP archive and return the extracted paths.

    Members are written below ``temp_extracted/<archive stem>/`` (relative to
    the current working directory) so that archives containing identically
    named files do not overwrite each other.

    Args:
        zip_path: Path to the ZIP archive to read.

    Returns:
        list[str]: Paths of the files actually written to disk. If the archive
        cannot be opened, or extraction fails part-way, the error is printed
        and the paths extracted so far are returned (possibly an empty list).
    """
    csv_files = []
    # One sub-directory per archive keeps same-named members apart.
    target_dir = os.path.join("temp_extracted", Path(zip_path).stem)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Ignore directory entries; match the extension case-insensitively.
            csv_members = [
                member for member in zip_ref.infolist()
                if not member.is_dir() and member.filename.lower().endswith('.csv')
            ]
            if csv_members:
                os.makedirs(target_dir, exist_ok=True)
            for member in csv_members:
                # extract() sanitises the member name (absolute paths, "..")
                # and returns the real on-disk location, so use its result
                # instead of re-joining the raw archive name.
                extracted_path = zip_ref.extract(member, target_dir)
                csv_files.append(extracted_path)
                print(f"Extracted: {member.filename}")
    except Exception as e:
        # Best-effort: report the problem and keep whatever was extracted.
        print(f"Error extracting files from {zip_path}: {str(e)}")
    return csv_files

def _plot_numeric_distributions(df, numeric_columns, dataset_name):
    """Save one stacked histogram (with KDE overlay) per numeric column."""
    fig = plt.figure(figsize=(12, 4 * len(numeric_columns)))
    try:
        for i, col in enumerate(numeric_columns):
            plt.subplot(len(numeric_columns), 1, i + 1)
            sns.histplot(df[col], kde=True, bins=30)
            plt.title(f"Distribution of {col}")
        fig.tight_layout()
        fig.savefig(os.path.join(plots_dir, f"{dataset_name}_distributions.png"))
    finally:
        # Close even if plotting/saving raised, so the figure is not leaked.
        plt.close(fig)


def _plot_categorical_counts(df, categorical_columns, dataset_name):
    """Save one stacked bar chart of value counts per categorical column."""
    fig = plt.figure(figsize=(12, 3 * len(categorical_columns)))
    try:
        for i, col in enumerate(categorical_columns):
            plt.subplot(len(categorical_columns), 1, i + 1)
            df[col].value_counts().plot(kind="bar")
            plt.title(f"Count of {col}")
            plt.xticks(rotation=45)
        fig.tight_layout()
        fig.savefig(os.path.join(plots_dir, f"{dataset_name}_categorical_counts.png"))
    finally:
        # Close even if plotting/saving raised, so the figure is not leaked.
        plt.close(fig)


def _plot_correlation_heatmap(df, numeric_columns, dataset_name):
    """Save an annotated correlation heatmap of the numeric columns."""
    fig = plt.figure(figsize=(10, 8))
    try:
        corr = df[numeric_columns].corr()
        sns.heatmap(corr, annot=True, cmap="coolwarm", center=0)
        plt.title(f"Correlation Heatmap for {dataset_name}")
        fig.savefig(os.path.join(plots_dir, f"{dataset_name}_correlation_heatmap.png"))
    finally:
        # Close even if plotting/saving raised, so the figure is not leaked.
        plt.close(fig)


def perform_eda_on_dataset(df, dataset_name):
    """
    Perform comprehensive Exploratory Data Analysis (EDA) on a single dataset.

    Prints the shape, column names, dtypes, per-column missing-value counts,
    duplicate-row count and (for numeric columns) descriptive statistics, then
    writes up to three PNG files into ``plots_dir``:

    * ``<dataset_name>_distributions.png`` - when there is at least one
      numeric column;
    * ``<dataset_name>_categorical_counts.png`` - when there is at least one
      object/category column;
    * ``<dataset_name>_correlation_heatmap.png`` - when there are at least two
      numeric columns.

    Args:
        df: The pandas DataFrame to analyse.
        dataset_name: Label used in the printed output and as the prefix of
            the saved plot file names.

    Returns:
        None.

    Raises:
        Any exception raised while plotting or saving propagates to the
        caller; the figure being drawn is closed first.
    """
    print(f"\n=== EDA for {dataset_name} ===")

    print(f"Dataset shape: {df.shape}")
    print(f"Column names: {list(df.columns)}")
    print(f"Data types:\n{df.dtypes}")

    # Computed once and reused in the final summary; both scans touch every
    # cell, which is costly on large frames.
    missing_per_column = df.isnull().sum()
    duplicate_count = df.duplicated().sum()

    print(f"\nMissing values:\n{missing_per_column}")
    print(f"\nDuplicate rows: {duplicate_count}")

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) > 0:
        print(f"\nDescriptive statistics:\n{df[numeric_columns].describe().round(2)}")
        _plot_numeric_distributions(df, numeric_columns, dataset_name)

    categorical_columns = df.select_dtypes(include=["object", "category"]).columns
    if len(categorical_columns) > 0:
        _plot_categorical_counts(df, categorical_columns, dataset_name)

    # A correlation matrix needs at least two numeric columns to be meaningful.
    if len(numeric_columns) > 1:
        _plot_correlation_heatmap(df, numeric_columns, dataset_name)

    print(f"\nSummary for {dataset_name}")
    print(f"Total missing values: {missing_per_column.sum()}")
    print(f"Total duplicates: {duplicate_count}")

def main(data_dir="disertation_2026/datasets/AIOps-Challenge-2020-Data-main"):
    """
    Run the EDA on every CSV found in ``data_dir`` or inside its ZIP archives.

    The directory is scanned non-recursively. ``.csv`` files are analysed
    directly; ``.zip`` files are first unpacked with
    ``extract_csv_files_from_zip`` and the CSVs they contain are analysed too.
    A failure while reading or analysing one file is printed and does not stop
    the remaining files from being processed.

    Args:
        data_dir: Directory to scan. Defaults to the dataset folder this
            notebook was written for.

    Returns:
        None. A message is printed and the function returns early when the
        directory does not exist or no CSV files are found.
    """
    if not os.path.isdir(data_dir):
        # Report the problem instead of failing with a raw FileNotFoundError.
        print(f"Data directory not found: {data_dir}")
        return

    csv_files = []
    zip_files = []

    # Sorted so the processing order is reproducible; os.listdir order is
    # arbitrary.
    for file in sorted(os.listdir(data_dir)):
        full_path = os.path.join(data_dir, file)
        lowered = file.lower()  # accept ".CSV" / ".ZIP" as well
        if lowered.endswith(".csv"):
            csv_files.append(full_path)
        elif lowered.endswith(".zip"):
            zip_files.append(full_path)

    for zip_file in zip_files:
        csv_files.extend(extract_csv_files_from_zip(zip_file))

    if not csv_files:
        print("No CSV files found.")
        return

    for csv_file in csv_files:
        # Per-file boundary: one unreadable or malformed CSV must not abort
        # the whole run.
        try:
            df = pd.read_csv(csv_file)
            dataset_name = Path(csv_file).stem
            perform_eda_on_dataset(df, dataset_name)
            print(f"\n=== Completed EDA for {dataset_name} ===\n")
        except Exception as e:
            print(f"Error processing {csv_file}: {e}")

# Entry point: run the whole EDA pipeline when this cell is executed.
main()
