In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ----------------------------
# 1. Load dataset
# ----------------------------
def load_dataset(csv_path):
    """Read a CSV file into a DataFrame, reporting the outcome on stdout.

    Args:
        csv_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` if the file is missing or reading
        fails for any other reason (the error is printed, not raised).
    """
    try:
        frame = pd.read_csv(csv_path)
        n_rows, n_cols = frame.shape
        print(f"Dataset loaded successfully with {n_rows} rows and {n_cols} columns.")
    except FileNotFoundError:
        print("❌ Error: File not found. Check your path.")
        return None
    except Exception as err:
        # Best-effort loader: any other failure is reported rather than propagated.
        print(f"❌ Error while loading dataset: {err}")
        return None
    return frame


# ----------------------------
# 2. Inspect dataset
# ----------------------------
def inspect_dataset(df):
    """Print a quick overview of ``df``: head, column dtypes, and null counts.

    Args:
        df: DataFrame to summarise. It is only read, never modified.
    """
    # Each section is computed lazily so its heading is printed before the
    # corresponding value is evaluated.
    sections = (
        ("\nFirst 5 rows:", lambda: df.head()),
        ("\nData types:", lambda: df.dtypes),
        ("\nMissing values:", lambda: df.isnull().sum()),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())


# ----------------------------
# 3. Clean dataset
# ----------------------------
def clean_missing(df, strategy="drop"):
    """Return a new DataFrame with missing values handled per ``strategy``.

    Args:
        df: Input DataFrame. It is not modified.
        strategy: One of
            ``"drop"`` - remove every row that contains a missing value;
            ``"fill_mean"`` - fill gaps in numeric columns with the column mean;
            ``"fill_median"`` - fill gaps in numeric columns with the column median.
            Any other value leaves the data unchanged and prints a warning.

    Returns:
        A new DataFrame. For the fill strategies, missing values in
        non-numeric columns are left as they are, because the fill values are
        computed with ``numeric_only=True``.
    """
    if strategy == "drop":
        df_clean = df.dropna()
    elif strategy in ("fill_mean", "fill_median"):
        statistic = df.mean if strategy == "fill_mean" else df.median
        # The per-column statistics only cover numeric columns, so fillna
        # touches those columns and nothing else.
        df_clean = df.fillna(statistic(numeric_only=True))
    else:
        # Previously an unknown strategy (e.g. a typo) was silently treated as
        # a no-op while still reporting success; say so explicitly instead.
        print(
            f"\n⚠️ Unknown strategy '{strategy}': missing values left unchanged "
            "(expected 'drop', 'fill_mean' or 'fill_median')."
        )
        return df.copy()

    print("\nMissing values handled.")
    return df_clean


# ----------------------------
# 4. Basic analysis
# ----------------------------
def analyze_data(df):
    """Print the summary statistics produced by ``df.describe()``.

    Args:
        df: DataFrame to summarise. It is only read, never modified.
    """
    print("\nDescriptive statistics:")
    summary = df.describe()
    print(summary)


def group_analysis(df, cat_col, num_col):
    """Print and return the mean of ``num_col`` for each value of ``cat_col``.

    Args:
        df: DataFrame containing both columns.
        cat_col: Name of the column to group by.
        num_col: Name of the column to average within each group.

    Returns:
        A Series of group means indexed by the ``cat_col`` values, or ``None``
        (after printing a notice) when either column is absent from ``df``.
    """
    # Guard: bail out early if either column name is not present.
    if cat_col not in df.columns or num_col not in df.columns:
        print("\n⚠️ Group analysis skipped (check column names).")
        return None

    means = df.groupby(cat_col)[num_col].mean()
    print(f"\nAverage {num_col} per {cat_col}:")
    print(means)
    return means


# ----------------------------
# 5. Visualizations
# ----------------------------
def make_plots(df, num_cols, cat_col=None):
    """Show up to four exploratory charts, each in its own 8x5 figure.

    Charts are displayed in this order, each only when its inputs exist:
    a line chart of the first numeric column, a bar chart of that column's
    mean per ``cat_col``, a histogram of the second numeric column, and a
    scatter plot of the first numeric column against the second.

    Args:
        df: DataFrame holding the columns to plot.
        num_cols: Sequence of numeric column names; only the first two are used.
        cat_col: Optional column name for the bar chart's x axis. The bar
            chart is skipped when this is falsy.
    """
    size = (8, 5)

    if num_cols:
        primary = num_cols[0]

        # Line chart: first numeric column plotted in row order.
        plt.figure(figsize=size)
        df[primary].plot(kind="line", title=f"Line Chart of {primary}")
        plt.xlabel("Index")
        plt.ylabel(primary)
        plt.show()

        # Bar chart: mean of the first numeric column per category.
        if cat_col:
            plt.figure(figsize=size)
            sns.barplot(data=df, x=cat_col, y=primary, estimator="mean", errorbar=None)
            plt.title(f"Bar Chart: Average {primary} per {cat_col}")
            plt.xticks(rotation=45)
            plt.show()

    if len(num_cols) > 1:
        secondary = num_cols[1]

        # Histogram of the second numeric column, with a KDE curve.
        plt.figure(figsize=size)
        sns.histplot(df[secondary], kde=True)
        plt.title(f"Histogram of {secondary}")
        plt.show()

        # Scatter plot of the first two numeric columns.
        plt.figure(figsize=size)
        sns.scatterplot(data=df, x=num_cols[0], y=secondary)
        plt.title(f"Scatter Plot: {num_cols[0]} vs {secondary}")
        plt.show()


# ----------------------------
# 6. Main script
# ----------------------------
if __name__ == "__main__":
    # Script entry point: load the CSV, print an overview, fill missing
    # values, print statistics, show the plots, then write two CSV outputs.

    # ✅ Change this to your dataset CSV (not .zip)
    # NOTE(review): this is a machine-specific absolute Windows path; it must
    # be edited before the script can run anywhere else.
    csv_path = r"C:\Users\Admin\Downloads\archive\diabetes_binary_health_indicators_BRFSS2015.csv"

    # Load + inspect
    df = load_dataset(csv_path)
    # load_dataset returns None when the file cannot be read; in that case the
    # rest of the pipeline is skipped entirely.
    if df is not None:
        inspect_dataset(df)

        # Clean: fill gaps in numeric columns with each column's median.
        df_clean = clean_missing(df, strategy="fill_median")

        # Analysis: print descriptive statistics of the cleaned data.
        analyze_data(df_clean)

        # Auto-detect numeric + categorical columns.
        # "Categorical" here means every column whose dtype is not numeric.
        num_cols = df_clean.select_dtypes(include="number").columns.tolist()
        cat_cols = df_clean.select_dtypes(exclude="number").columns.tolist()
        print("\nDetected numeric columns:", num_cols)
        print("Detected categorical columns:", cat_cols)

        # Grouping (if at least 1 categorical + numeric column exist):
        # average the first numeric column per value of the first categorical one.
        if num_cols and cat_cols:
            group_analysis(df_clean, cat_cols[0], num_cols[0])

        # Visualizations: pass None as the category when no non-numeric column
        # exists, which makes make_plots skip the bar chart.
        make_plots(df_clean, num_cols, cat_cols[0] if cat_cols else None)

        # Save cleaned data + summary.
        # Both paths are relative, so the files land in the current working
        # directory; the cleaned data is written without the index column.
        df_clean.to_csv("cleaned_dataset.csv", index=False)
        df_clean.describe().to_csv("dataset_summary.csv")
        print("\n✅ Cleaned data and summary saved to current folder.")
