# Data Exploration Notebook

This notebook provides a comprehensive framework for exploring your dataset. Follow the sections below to get a thorough understanding of your data.

## Table of Contents
1. [Data Loading](#data-loading)
2. [Basic Information](#basic-information)
3. [Missing Values Analysis](#missing-values-analysis)
4. [Data Types & Distributions](#data-types--distributions)
5. [Statistical Summary](#statistical-summary)
6. [Initial Visualizations](#initial-visualizations)
7. [Summary and Next Steps](#summary-and-next-steps)

## Setup and Imports

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# EDA toolkit imports
import sys
sys.path.append('../src')
sys.path.append('../plots')

from data_loader import DataLoader
from statistical_analysis import StatisticalAnalyzer
from histogram import create_histogram, create_multiple_histograms
from box_plot import create_box_plot
from correlation_matrix import create_correlation_heatmap
from utils import PlotConfig

# Apply the toolkit's shared plotting style
plot_config = PlotConfig()
plot_config.set_style()

# Pandas display options, applied as (option, value) pairs
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(option_name, option_value)

print("Setup complete!")

## Data Loading

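Load your dataset using the EDA toolkit's DataLoader class. The cell below builds a synthetic sample dataset for demonstration; to use your own data, uncomment the matching `loader.load_csv(...)` or `loader.load_excel(...)` line and remove the sample-data block.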

In [None]:
# Initialize data loader
loader = DataLoader()

# To work with a real dataset, uncomment ONE of the lines below and point it
# at your file, then drop the synthetic block that follows.
# data = loader.load_csv('../data/raw/your_data.csv')      # CSV files
# data = loader.load_excel('../data/raw/your_data.xlsx')   # Excel files

# --- Synthetic demonstration data -------------------------------------------
# The global NumPy seed is fixed, and the draws below happen in a fixed order,
# so the generated frame is the same on every run.
np.random.seed(42)
n_rows = 1000
sample_data = {
    'age': np.random.randint(18, 80, size=n_rows),
    'income': np.random.normal(50000, 15000, size=n_rows),
    'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=n_rows),
    'experience': np.random.randint(0, 40, size=n_rows),
    'salary': np.random.normal(60000, 20000, size=n_rows),
    'category': np.random.choice(['A', 'B', 'C'], size=n_rows)
}

# Knock out a random subset of rows in the two float columns (income first,
# then salary) so the missing-value analysis has something to show.
for column_name, n_missing in (('income', 50), ('salary', 30)):
    missing_rows = np.random.choice(n_rows, size=n_missing, replace=False)
    sample_data[column_name][missing_rows] = np.nan

data = pd.DataFrame(sample_data)

print("Data loaded successfully!")
print(f"Shape: {data.shape}")

## Basic Information

Get an overview of the dataset structure and basic properties.

In [None]:
# Overview of the frame: shape, column info, a preview, dtypes and memory use.

# Dataset shape
print(f"Dataset Shape: {data.shape}")
print(f"Number of rows: {data.shape[0]}")
print(f"Number of columns: {data.shape[1]}")
print("\n" + "="*50 + "\n")

# Column information
print("Column Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print("\n" + "="*50 + "\n")

# First few rows
print("First 5 rows:")
display(data.head())

print("\n" + "="*50 + "\n")

# Data types
print("Data Types:")
print(data.dtypes)

print("\n" + "="*50 + "\n")

# Memory usage (deep=True also measures the contents of object columns)
print("Memory Usage:")
print(f"Total memory usage: {data.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

## Missing Values Analysis

Analyze missing values in the dataset.

In [None]:
# Per-column null counts; reused for both the count and percentage columns
null_counts = data.isnull().sum()
missing_summary = pd.DataFrame({
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(data) * 100).round(2)
}).sort_values('Missing_Count', ascending=False)

print("Missing Values Summary:")
display(missing_summary[missing_summary['Missing_Count'] > 0])

# Only draw the figure when at least one value is missing
if missing_summary['Missing_Count'].sum() == 0:
    print("No missing values found in the dataset!")
else:
    fig, (heatmap_ax, bar_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: row-by-column map of where the nulls sit
    sns.heatmap(data.isnull(), ax=heatmap_ax, cbar=True, cmap='viridis')
    heatmap_ax.set_title('Missing Values Heatmap')

    # Right panel: percentage missing, restricted to affected columns
    affected = missing_summary[missing_summary['Missing_Count'] > 0]
    if not affected.empty:
        affected['Missing_Percentage'].plot.bar(ax=bar_ax)
        bar_ax.set_title('Missing Values Percentage by Column')
        bar_ax.set_ylabel('Percentage (%)')
        plt.setp(bar_ax.get_xticklabels(), rotation=45)

    plt.tight_layout()
    plt.show()

## Data Types & Distributions

Analyze data types and explore distributions of different variable types.

In [None]:
# Split the columns into numeric / categorical / datetime groups, list the
# values of low-cardinality categoricals, and tabulate per-column cardinality.

# Separate columns by type
numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
# 'datetime64' only matches timezone-naive columns; 'datetimetz' adds the
# timezone-aware ones so they are not silently left out of every group.
datetime_columns = data.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()

print(f"Numeric columns ({len(numeric_columns)}): {numeric_columns}")
print(f"Categorical columns ({len(categorical_columns)}): {categorical_columns}")
print(f"Datetime columns ({len(datetime_columns)}): {datetime_columns}")

print("\n" + "="*50 + "\n")

# Unique values in categorical columns
if categorical_columns:
    print("Unique values in categorical columns:")
    for col in categorical_columns:
        unique_count = data[col].nunique()
        print(f"{col}: {unique_count} unique values")
        # Only spell out the values when the list is short enough to read
        if unique_count <= 10:
            print(f"  Values: {data[col].unique()}")
        print()

print("\n" + "="*50 + "\n")

# Cardinality analysis
# nunique() is computed once for the whole frame. The percentage is a pandas
# division, so an empty frame yields NaN instead of raising ZeroDivisionError.
unique_counts = data.nunique()
cardinality_df = pd.DataFrame({
    'Column': data.columns,
    'Unique_Count': unique_counts.to_numpy(),
    'Unique_Percentage': (unique_counts / len(data) * 100).to_numpy()
}).round(2)

print("Cardinality Analysis:")
display(cardinality_df.sort_values('Unique_Percentage', ascending=False))

## Statistical Summary

Generate comprehensive statistical summaries using the EDA toolkit.

In [None]:
# Summarise the dataset with the toolkit's StatisticalAnalyzer: numeric and
# categorical summaries first, then a per-column distribution table.

# Initialize statistical analyzer
analyzer = StatisticalAnalyzer(data)

# Generate comprehensive statistics
stats_summary = analyzer.describe_all()

# Display numeric summary
if 'numeric_summary' in stats_summary:
    print("Numeric Variables Summary:")
    display(stats_summary['numeric_summary'])

    # Guard the extended table separately: the presence of 'numeric_summary'
    # does not by itself guarantee that 'numeric_extended' exists.
    if 'numeric_extended' in stats_summary:
        print("\nExtended Numeric Statistics:")
        display(stats_summary['numeric_extended'])

print("\n" + "="*50 + "\n")

# Display categorical summary
if 'categorical_summary' in stats_summary:
    print("Categorical Variables Summary:")
    display(stats_summary['categorical_summary'])

print("\n" + "="*50 + "\n")

# Distribution analysis
if numeric_columns:
    print("Distribution Analysis:")
    dist_analysis = analyzer.distribution_analysis(numeric_columns)

    # NOTE(review): presumably dist_analysis maps column -> dict of stats that
    # mixes floats with a string 'shape' label - confirm against the analyzer.
    # Transposing such a frame leaves every column as object dtype, which
    # DataFrame.round() skips; infer_objects() restores numeric dtypes so the
    # rounding below actually takes effect (no-op if already numeric).
    dist_df = pd.DataFrame(dist_analysis).T.infer_objects()
    display(dist_df[['mean', 'median', 'std', 'skewness', 'kurtosis', 'shape']].round(3))

## Initial Visualizations

Create initial visualizations to understand data distributions and relationships.

In [None]:
# Histograms for numeric variables: one figure per column when there are at
# most four numeric columns, otherwise a single grid of histograms.
# NOTE(review): the save paths assume '../figures/exploratory/' exists or is
# created by the plotting helpers - confirm.
if numeric_columns:
    print("Distribution of Numeric Variables:")

    if len(numeric_columns) > 4:
        # Many columns: draw them together in one grid
        create_multiple_histograms(
            data,
            numeric_columns,
            save_path='../figures/exploratory/numeric_distributions.png',
        )
        plt.show()
    else:
        # Few columns: a dedicated histogram for each
        for col in numeric_columns:
            create_histogram(
                data,
                col,
                save_path=f'../figures/exploratory/{col}_histogram.png',
            )
            plt.show()

In [None]:
# Box plots for numeric variables (only drawn when there are at least two
# numeric columns)
if len(numeric_columns) >= 2:
    print("Box Plots of Numeric Variables:")
    create_box_plot(
        data,
        numeric_columns,
        save_path='../figures/exploratory/numeric_boxplots.png',
    )
    plt.show()

In [None]:
# Correlation analysis: heatmap of the numeric columns, followed by the
# analyzer's list of highly correlated pairs (needs at least two columns).
if len(numeric_columns) >= 2:
    print("Correlation Analysis:")

    # Correlation matrix
    create_correlation_heatmap(
        data,
        numeric_columns,
        save_path='../figures/exploratory/correlation_matrix.png',
    )
    plt.show()

    # Statistical correlation analysis
    corr_analysis = analyzer.correlation_analysis()

    # Report "none found" when the key is absent or its table is empty
    if 'high_correlations' not in corr_analysis or corr_analysis['high_correlations'].empty:
        print("\nNo high correlations found (|r| > 0.7)")
    else:
        print("\nHigh Correlations (|r| > 0.7):")
        display(corr_analysis['high_correlations'].round(3))

In [None]:
# Categorical variable analysis: frequency table plus a count plot per column
if categorical_columns:
    print("Categorical Variables Analysis:")

    for col in categorical_columns:
        print(f"\n{col} - Value Counts:")
        value_counts = data[col].value_counts()
        print(value_counts)

        # Count plot drawn on an explicitly created axes
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.countplot(data=data, x=col, ax=ax)
        ax.set_title(f'Count Plot: {col}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

## Summary and Next Steps

Based on the initial exploration, document key findings and plan next steps.

In [None]:
# Print a plain-text recap of the exploration: dataset overview, per-column
# numeric and categorical highlights, and suggested next steps.
banner = "=" * 60
print(banner)
print("KEY FINDINGS SUMMARY")
print(banner)

total_missing = data.isnull().sum().sum()
print("\n1. Dataset Overview:")
print(f"   - Shape: {data.shape}")
print(f"   - Numeric columns: {len(numeric_columns)}")
print(f"   - Categorical columns: {len(categorical_columns)}")
print(f"   - Missing values: {total_missing} total")

if numeric_columns:
    print("\n2. Numeric Variables:")
    for col in numeric_columns:
        series = data[col]
        mean_value = series.mean()
        skewness = series.skew()
        missing_pct = series.isnull().sum() / len(data) * 100
        print(f"   - {col}: Mean={mean_value:.2f}, Skew={skewness:.2f}, Missing={missing_pct:.1f}%")

if categorical_columns:
    print("\n3. Categorical Variables:")
    for col in categorical_columns:
        unique_count = data[col].nunique()
        modes = data[col].mode()
        # mode() is empty when the column has no non-null values
        most_common = 'N/A' if modes.empty else modes[0]
        print(f"   - {col}: {unique_count} unique values, Most common: {most_common}")

print("\n4. Next Steps:")
print("   - Proceed to data cleaning notebook if issues found")
print("   - Continue with feature analysis notebook")
print("   - Explore relationships between variables")
print("   - Consider domain-specific analysis")

print("\n" + banner)