# DataBro Multi-Dataset Analysis Demo
This notebook demonstrates DataBro's preprocessing, visualization, and summarization capabilities on four popular scikit-learn datasets:
1. Iris Dataset (Classification)
2. Wine Dataset (Classification)
3. Breast Cancer Dataset (Binary Classification)
4. Diabetes Dataset (Regression)

In [None]:
# Import required libraries
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_diabetes
import pandas as pd
import numpy as np

# Import DataBro components
from databro import DataLoader, DataPreprocessor, DataVisualizer, DataSummarizer

# Instantiate each DataBro component once, in a single assignment
loader, preprocessor, visualizer, summarizer = (
    DataLoader(),
    DataPreprocessor(),
    DataVisualizer(),
    DataSummarizer(),
)

## 1. Load All Datasets

In [None]:
def create_dataset(loader_func, name):
    """Turn an sklearn-style dataset loader into a DataFrame and save it as CSV.

    Args:
        loader_func: Zero-argument callable returning an object that exposes
            ``data``, ``feature_names`` and ``target`` attributes.
        name: Prefix of the CSV file that is written.

    Returns:
        A DataFrame with one column per entry in ``feature_names`` plus a
        trailing ``'target'`` column.

    Side effects:
        Writes the frame, without its index, to ``'<name>_data.csv'`` (a
        relative path).
    """
    bunch = loader_func()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names).assign(
        target=bunch.target
    )
    frame.to_csv(f'{name}_data.csv', index=False)
    return frame

# Load every dataset straight into the lookup table used by the later cells.
# Each create_dataset call also writes '<name>_data.csv' as a side effect.
datasets = {
    'Iris': create_dataset(load_iris, 'iris'),
    'Wine': create_dataset(load_wine, 'wine'),
    'Breast Cancer': create_dataset(load_breast_cancer, 'cancer'),
    'Diabetes': create_dataset(load_diabetes, 'diabetes'),
}

# Keep the individual frames reachable under their own names as well
iris_df = datasets['Iris']
wine_df = datasets['Wine']
cancer_df = datasets['Breast Cancer']
diabetes_df = datasets['Diabetes']

## 2. Dataset Overview

In [None]:
# Print shape, the first two rows and the column names of every raw dataset
for name, df in datasets.items():
    # One print call; sep="\n" yields the same line-by-line output
    print(
        f"\n=== {name} Dataset ===",
        f"Shape: {df.shape}",
        "\nFirst few rows:",
        df.head(2),
        "\nFeature names:",
        df.columns.tolist(),
        sep="\n",
    )

## 3. Data Preprocessing Examples

In [None]:
# Scale the feature columns of every dataset and collect the results by name
processed_datasets = {}

for name, df in datasets.items():
    print(f"\nProcessing {name} dataset...")

    # Every column except the label is handed to the scaler
    features = df.columns[df.columns != 'target'].tolist()
    scaled_df = preprocessor.scale_data(df, method='standard', columns=features)
    processed_datasets[name] = scaled_df

    # Report the resulting shape and a two-row preview
    print(
        f"Shape after processing: {scaled_df.shape}",
        "\nSample of processed data:",
        scaled_df.head(2),
        sep="\n",
    )

## 4. Visualization Examples

In [None]:
# Render the same set of plots for every processed dataset
for name, df in processed_datasets.items():
    print(f"\n=== Visualizations for {name} Dataset ===\n")

    # 1. Heatmap of the whole frame (announced as a correlation heatmap)
    print("Correlation Heatmap:")
    visualizer.plot_heatmap(df)

    # 2. Histogram of the column at position 0
    first_feature = df.columns[0]
    print(f"\nDistribution of {first_feature}:")
    visualizer.plot_histogram(df, first_feature, interactive=True)

    # 3. Scatter of the columns at positions 0 and 1, with 'target' passed as
    #    hue; only drawn when the frame has more than two columns
    if df.shape[1] > 2:
        print("\nScatter plot of first two features:")
        visualizer.plot_scatter(
            df,
            x=first_feature,
            y=df.columns[1],
            hue='target',
            interactive=True,
        )

    # 4. Box plot of the first column, grouped by 'target'
    print(f"\nBox plot of {first_feature} by target:")
    visualizer.plot_box(df, column=first_feature, by='target', interactive=True)

## 5. Data Summarization Examples

In [None]:
# Generate summaries for each dataset.
# The correlation threshold is named once so the printed header and the value
# passed to the summarizer cannot drift apart.
CORR_THRESHOLD = 0.5

for name, df in processed_datasets.items():
    print(f"\n=== Summary for {name} Dataset ===\n")

    # Basic statistics (header is printed before the call so anything the
    # summarizer prints itself appears underneath it)
    print("Basic Statistics:")
    stats = summarizer.get_basic_stats(df)
    print(stats)

    # Correlation summary
    print(f"\nStrong Correlations (threshold={CORR_THRESHOLD}):")
    corr_summary = summarizer.get_correlation_summary(df, threshold=CORR_THRESHOLD)
    print(corr_summary)

    # Generate and save the full report. Spaces in the dataset name are
    # replaced with underscores so that e.g. 'Breast Cancer' does not produce
    # a file name with an embedded space.
    file_stem = name.lower().replace(' ', '_')
    report_path = f'{file_stem}_report.txt'
    summarizer.generate_report(df, report_path)
    print(f"\nFull report saved to: {report_path}")

## 6. Comparative Analysis

In [None]:
# Compare dataset characteristics.
#
# 'Target Classes' is only meaningful for classification targets: counting the
# distinct values of a continuous regression target (the Diabetes dataset)
# would report a large, meaningless number of "classes". Float-typed targets
# are therefore treated as regression targets and get a missing value.
# Heuristic: relies on the loaders used here returning integer labels for
# classification and float values for regression.
comparison_data = {
    'Dataset': [],
    'Samples': [],
    'Features': [],
    'Target Classes': [],
    'Missing Values': []
}

for name, df in datasets.items():
    target = df['target']
    is_continuous = pd.api.types.is_float_dtype(target)

    comparison_data['Dataset'].append(name)
    comparison_data['Samples'].append(len(df))
    comparison_data['Features'].append(len(df.columns) - 1)  # Excluding target
    comparison_data['Target Classes'].append(
        # dropna=False matches len(unique()): a NaN label counts as one value
        None if is_continuous else target.nunique(dropna=False)
    )
    # Total number of missing cells across all columns, as a plain int
    comparison_data['Missing Values'].append(int(df.isnull().sum().sum()))

comparison_df = pd.DataFrame(comparison_data)
# The nullable integer dtype keeps class counts as integers and shows <NA>
# for regression targets, instead of degrading the column to floats with NaN.
comparison_df['Target Classes'] = comparison_df['Target Classes'].astype('Int64')
print("Dataset Comparison:")
print(comparison_df)

## 7. Save Processed Datasets

In [None]:
# Save all processed datasets as '<dataset name>_processed.csv' (relative path).
for name, df in processed_datasets.items():
    # Lower-case the name and replace spaces with underscores so that e.g.
    # 'Breast Cancer' does not produce a file name with an embedded space.
    file_stem = name.lower().replace(' ', '_')
    output_path = f'{file_stem}_processed.csv'
    df.to_csv(output_path, index=False)
    print(f"Saved processed {name} dataset to: {output_path}")