# 🔮 Intelligent Predictor PRO++++ — EDA Development Notebook

**Purpose:** Development and testing environment for Exploratory Data Analysis features

**Author:** Data Science Team  
**Date:** 2025-10-09  
**Version:** 2.0.1

---

## 📋 Table of Contents

1. [Setup & Imports](#setup)
2. [Data Loading](#data-loading)
3. [Data Quality Assessment](#data-quality)
4. [Statistical Analysis](#statistical-analysis)
5. [Univariate Analysis](#univariate)
6. [Bivariate Analysis](#bivariate)
7. [Multivariate Analysis](#multivariate)
8. [Advanced Visualizations](#advanced-viz)
9. [Automated Profiling](#profiling)
10. [Dashboard Creation](#dashboards)
11. [Export & Reports](#export)

---

## 1. Setup & Imports

Defensive imports: try to use project modules if available, otherwise fall back to inline helpers.

In [None]:
# === STANDARD LIB ===
import os, sys, json
from pathlib import Path
from typing import List, Dict, Any, Optional
import warnings
warnings.filterwarnings('ignore')

# === DATA ===
import pandas as pd
import numpy as np
from scipy import stats

# === VIZ ===
import matplotlib.pyplot as plt
try:
    import seaborn as sns
    _HAS_SNS = True
except Exception:
    _HAS_SNS = False
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# === PROJECT MODULES (optional) ===
sys.path.insert(0, str(Path.cwd()))
sys.path.insert(0, str(Path.cwd().parent))
try:
    from src.data_processing.file_parser import parse_csv_smart, parse_any
except Exception:
    # Project parser is unavailable: expose a minimal pandas-based replacement.
    parse_csv_smart = None

    def parse_any(path: 'str | os.PathLike') -> pd.DataFrame:
        """Load a tabular file into a DataFrame based on its extension.

        Args:
            path: Location of the file, given as a string or any path-like
                object (for example ``pathlib.Path``).

        Returns:
            The parsed DataFrame (``pd.read_csv`` for ``.csv``,
            ``pd.read_excel`` for ``.xlsx`` / ``.xls``).

        Raises:
            ValueError: If the extension is not one of the supported ones.
        """
        # Normalise path-like objects to text so the extension check below
        # works for pathlib.Path as well as plain strings.
        path_text = os.fspath(path)
        lowered = path_text.lower()
        if lowered.endswith('.csv'):
            return pd.read_csv(path_text)
        if lowered.endswith(('.xlsx', '.xls')):
            return pd.read_excel(path_text)
        raise ValueError(f"Unsupported format: {path}")
try:
    from src.data_processing.data_cleaner import clean_data
except Exception:
    def clean_data(df: pd.DataFrame, remove_duplicates=True, handle_missing='auto', detect_outliers=True) -> pd.DataFrame:
        """Return a cleaned copy of ``df`` (fallback used when the project cleaner is missing).

        Args:
            df: Input frame; it is never modified.
            remove_duplicates: Drop fully duplicated rows when true.
            handle_missing: Only ``'auto'`` triggers imputation: numeric
                columns get their median, object/category columns their
                first mode. Any other value leaves missing data untouched.
            detect_outliers: Accepted for signature compatibility; this
                fallback does not read it.

        Returns:
            A new DataFrame with the requested cleaning applied.
        """
        cleaned = df.copy()
        if remove_duplicates:
            cleaned = cleaned.drop_duplicates()
        if handle_missing != 'auto':
            return cleaned

        for name in cleaned.select_dtypes(include=[np.number]).columns:
            column = cleaned[name]
            cleaned[name] = column.fillna(column.median())

        for name in cleaned.select_dtypes(include=['object', 'category']).columns:
            column = cleaned[name]
            # With no mode available (all values missing) the column is
            # filled from itself, which leaves it unchanged.
            filler = column.mode().iloc[0] if column.mode().size else column
            cleaned[name] = column.fillna(filler)
        return cleaned
try:
    from src.data_processing.data_validator import validate_dataframe_for_ml
except Exception:
    def validate_dataframe_for_ml(df: pd.DataFrame, target: Optional[str]=None) -> Dict[str, Any]:
        """Run basic sanity checks on ``df`` (fallback for the project validator).

        Args:
            df: Object expected to be a pandas DataFrame.
            target: Optional name of the target column that must exist.

        Returns:
            A dict with ``is_valid`` (True when no errors were recorded),
            ``errors`` (list of messages) and ``warnings`` (list of messages).
        """
        errors, warnings_ = [], []
        if not isinstance(df, pd.DataFrame):
            # The remaining checks read DataFrame attributes (.columns, .empty),
            # so report the type problem and stop instead of raising AttributeError.
            errors.append('Input is not a DataFrame')
            return {'is_valid': False, 'errors': errors, 'warnings': warnings_}
        if target and target not in df.columns:
            errors.append(f"Target '{target}' not found")
        if df.empty:
            # An empty frame is reported as a warning only; it does not fail validation.
            warnings_.append('DataFrame is empty')
        return {'is_valid': len(errors)==0, 'errors': errors, 'warnings': warnings_}
try:
    from src.data_processing.data_profiler import generate_profile
except Exception:
    def generate_profile(df: pd.DataFrame, title: str='Profile', dark_mode: bool=True, minimal: bool=False) -> str:
        """Build a tiny HTML summary of ``df`` (fallback for the project profiler).

        The output has three newline-separated parts: an ``<h1>`` with
        ``title``, a paragraph with the row and column counts, and an HTML
        table of the first five rows. ``dark_mode`` and ``minimal`` are
        accepted for signature compatibility and are not read here.
        """
        heading = f"<h1>{title}</h1>"
        shape_line = f"<p>Rows: {len(df):,}, Cols: {df.shape[1]}</p>"
        preview_table = df.head().to_html(index=False)
        return "\n".join((heading, shape_line, preview_table))

# === SIMPLE VIZ HELPERS (fallbacks) ===
def histogram(df, col, binning_method='fd', title='Histogram', **kwargs):
    """Return a Plotly Express histogram of ``df[col]``.

    Only the ``marginal`` keyword is read from ``kwargs``; ``binning_method``
    and every other extra keyword are accepted but not used, so Plotly picks
    the bins itself (``nbins=None``).
    """
    marginal_kind = kwargs.get('marginal')
    return px.histogram(df, x=col, nbins=None, title=title, marginal=marginal_kind)
def scatter(df, x, y, title='Scatter', **kwargs):
    """Return a Plotly Express scatter plot of ``y`` against ``x``.

    Args:
        df: Source DataFrame.
        x: Column for the horizontal axis.
        y: Column for the vertical axis.
        title: Figure title.
        **kwargs: Only ``trendline`` is read (passed straight to
            ``px.scatter``); other keywords are ignored.

    Returns:
        The Plotly figure.
    """
    # The title is forwarded so callers' per-plot titles are actually shown,
    # consistent with the other helpers in this cell.
    return px.scatter(df, x=x, y=y, title=title, trendline=kwargs.get('trendline'))
def line(df, x, y, title='Line', **kwargs):
    """Return a Plotly Express line chart of ``y`` over ``x``.

    When ``show_smoothing`` is truthy in ``kwargs``, a rolling-mean trace
    named ``MA_<window>`` is added, using ``smoothing_window`` (default 7)
    as the window size.
    """
    chart = px.line(df, x=x, y=y, title=title)
    if not kwargs.get('show_smoothing'):
        return chart

    window = int(kwargs.get('smoothing_window', 7))
    smoothed = df[y].rolling(window).mean()
    chart.add_trace(go.Scatter(x=df[x], y=smoothed, name=f"MA_{window}"))
    return chart
def box(df, y=None, x=None, title='Box', **kwargs):
    """Return a Plotly Express box plot; ``points`` defaults to ``'outliers'``."""
    point_mode = kwargs.get('points', 'outliers')
    return px.box(df, x=x, y=y, points=point_mode, title=title)
def violin(df, x=None, y=None, title='Violin', **kwargs):
    """Return a Plotly Express violin plot.

    ``box`` (default ``True``) and ``points`` (default ``'outliers'``) are
    read from ``kwargs``; other extra keywords are ignored.
    """
    show_box = kwargs.get('box', True)
    point_mode = kwargs.get('points', 'outliers')
    return px.violin(df, x=x, y=y, box=show_box, points=point_mode, title=title)
def correlation_heatmap(df, method='pearson', annotate=False, cluster=False, cmap='RdBu_r', title='Correlation'):
    """Return a heatmap of the correlations between the numeric columns of ``df``.

    Args:
        df: Source DataFrame; non-numeric columns are dropped first.
        method: Correlation method forwarded to ``DataFrame.corr``.
        annotate: When true, print each coefficient (two decimals) in its cell.
        cluster: Accepted for signature compatibility; this fallback does not
            reorder rows or columns.
        cmap: Plotly colour scale name used for the cells (a ``_r`` suffix
            reverses a named scale).
        title: Figure title.

    Returns:
        The Plotly figure.
    """
    corr = df.select_dtypes(include=[np.number]).corr(method=method)
    # Honour the caller's colour scale instead of a hard-coded one.
    imshow_kwargs = dict(title=title, color_continuous_scale=cmap, aspect='auto')
    if annotate:
        # Only passed when requested, so the default call stays minimal.
        imshow_kwargs['text_auto'] = '.2f'
    return px.imshow(corr, **imshow_kwargs)
def scatter_3d(df, x, y, z, color=None, title='3D Scatter'):
    """Return a Plotly Express 3D scatter of the ``x``, ``y`` and ``z`` columns."""
    axes = dict(x=x, y=y, z=z)
    return px.scatter_3d(df, color=color, title=title, **axes)

# Apply a dark-grid Matplotlib style when one is installed. The seaborn style
# sheets carry a "v0_8" prefix in newer Matplotlib releases and the bare name in
# older ones, so try both and silently keep the default style if neither exists.
if hasattr(plt, 'style'):
    for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
        if _style_name in plt.style.available:
            plt.style.use(_style_name)
            break

# Read the Plotly version from the top-level package: `plotly.express` is a
# submodule and is not guaranteed to expose `__version__`.
import plotly
print("✅ Imports OK | Pandas:", pd.__version__, "| NumPy:", np.__version__, "| Plotly:", plotly.__version__)

### Helper Functions

In [None]:
def display_info(df: pd.DataFrame, title: str = "Dataset Info"):
    """Print a text overview of ``df``.

    The report covers shape, deep memory usage, duplicated-row count,
    total missing values, a dtype tally and a per-column list of missing
    values (largest first). Nothing is returned.
    """
    n_rows, n_cols = df.shape
    rule = '=' * 60

    print(f"\n{rule}\n📊 {title}\n{rule}")
    print(f"Shape: {n_rows:,} rows × {n_cols} columns")
    print(f"Memory: {df.memory_usage(deep=True).sum() / 1e6:.2f} MB")

    n_dupes = int(df.duplicated().sum())
    n_missing = int(df.isna().sum().sum())
    # Percentages fall back to 0 for an empty frame to avoid dividing by zero.
    dupe_pct = n_dupes / n_rows * 100 if n_rows > 0 else 0
    missing_pct = n_missing / df.size * 100 if df.size > 0 else 0
    print(f"Duplicates: {n_dupes:,} ({dupe_pct:.2f}%)")
    print(f"Missing: {n_missing:,} values ({missing_pct:.2f}%)")

    print("\n📈 Data Types:")
    print(df.dtypes.value_counts())

    print("\n🔍 Missing Values by Column:")
    per_column = df.isna().sum()
    with_gaps = per_column[per_column > 0].sort_values(ascending=False)
    if len(with_gaps) == 0:
        print("  ✅ No missing values!")
        return
    for name, n_na in with_gaps.items():
        share = n_na / n_rows * 100 if n_rows > 0 else 0
        print(f"  {name}: {n_na:,} ({share:.2f}%)")

def describe_numeric(df: pd.DataFrame, cols: Optional[List[str]] = None):
    """Return ``describe()`` statistics per numeric column plus extra measures.

    Args:
        df: Source DataFrame.
        cols: Columns to summarise; defaults to every numeric column.

    Returns:
        One row per column with the ``describe()`` fields followed by
        ``skew``, ``kurtosis``, ``missing`` and ``missing_pct`` (percentage
        of ``len(df)``, rounded to two decimals). An empty DataFrame is
        returned when there is nothing to summarise.
    """
    if cols is None:
        cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not cols:
        return pd.DataFrame()

    subset = df[cols]
    summary = subset.describe().T.assign(
        skew=subset.skew(numeric_only=True),
        kurtosis=subset.kurtosis(numeric_only=True),
        missing=subset.isna().sum(),
    )
    # max(..., 1) keeps the percentage defined for a zero-row frame.
    summary['missing_pct'] = (summary['missing'] / max(len(df), 1) * 100).round(2)
    return summary

def describe_categorical(df: pd.DataFrame, cols: Optional[List[str]] = None):
    """Summarise object/category columns, one output row per column.

    Args:
        df: Source DataFrame.
        cols: Columns to summarise; defaults to every object or category column.

    Returns:
        A DataFrame with the fields ``column``, ``unique``, ``top`` (most
        frequent value, ``None`` if there is none), ``top_freq``,
        ``missing`` and ``missing_pct`` (percentage of ``len(df)``).
    """
    if cols is None:
        cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    records = []
    for name in cols:
        series = df[name]
        counts = series.value_counts(dropna=True)
        if counts.empty:
            top_value, top_count = None, 0
        else:
            top_value, top_count = counts.index[0], int(counts.iloc[0])
        records.append(dict(
            column=name,
            unique=int(series.nunique(dropna=True)),
            top=top_value,
            top_freq=top_count,
            missing=int(series.isna().sum()),
            # max(..., 1) keeps the percentage defined for a zero-row frame.
            missing_pct=float(series.isna().sum() / max(len(df), 1) * 100),
        ))
    return pd.DataFrame(records)

# Confirmation message shown once the helper definitions in this cell have run.
print("✅ Helper functions loaded!")

## 2. Data Loading

Load synthetic or real data for EDA development and testing.

In [None]:
def generate_sample_data(n_rows: int = 1000, seed: int = 42) -> pd.DataFrame:
    """Create a reproducible synthetic daily dataset for EDA experiments.

    Args:
        n_rows: Number of consecutive daily rows, starting 2023-01-01.
        seed: Seed applied to NumPy's global random generator.

    Returns:
        A DataFrame with the columns ``date``, ``sales`` (linear trend plus
        a 7-day cycle plus noise), ``customers``, ``revenue``, ``region``,
        ``product_category``, ``satisfaction``, ``discount``, ``is_weekend``
        and ``temperature`` (365-day cycle plus noise). 5% of
        ``satisfaction`` and 2% of ``temperature`` values are set to NaN.
    """
    # The random draws below must stay in this order: they all consume the
    # same seeded global stream, so reordering would change the data.
    np.random.seed(seed)
    dates = pd.date_range('2023-01-01', periods=n_rows, freq='D')
    day_index = np.arange(n_rows)
    weekly_cycle = 15 * np.sin(2 * np.pi * day_index / 7)
    yearly_cycle = 10 * np.sin(2 * np.pi * day_index / 365)

    columns = {'date': dates}
    columns['sales'] = 100 + 0.3 * day_index + weekly_cycle + np.random.normal(0, 5, n_rows)
    columns['customers'] = np.random.poisson(50, n_rows)
    columns['revenue'] = np.random.uniform(1000, 10000, n_rows)
    columns['region'] = np.random.choice(['North', 'South', 'East', 'West'], n_rows)
    columns['product_category'] = np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], n_rows)
    columns['satisfaction'] = np.random.uniform(1, 5, n_rows)
    columns['discount'] = np.random.choice([0, 5, 10, 15, 20], n_rows, p=[0.4, 0.2, 0.2, 0.15, 0.05])
    columns['is_weekend'] = (dates.dayofweek >= 5).astype(int)
    columns['temperature'] = 15 + yearly_cycle + np.random.normal(0, 3, n_rows)
    df = pd.DataFrame(columns)

    # Blank out a random sample of rows in two columns to simulate missing data.
    for column, fraction in (('satisfaction', 0.05), ('temperature', 0.02)):
        picked = np.random.choice(df.index, size=int(fraction * n_rows), replace=False)
        df.loc[picked, column] = np.nan
    return df

# Build the 1,000-row synthetic dataset that the following cells analyse.
df = generate_sample_data(n_rows=1000)
display_info(df, "Generated Sample Dataset")
# Last expression of the cell: shown as the cell output (first five rows).
df.head()

## 3. Data Quality Assessment

In [None]:
# Validate the raw frame with 'sales' as the intended target column and
# print the verdict followed by any recorded errors and warnings.
validation = validate_dataframe_for_ml(df, target='sales')
print("🔍 Validation Results:")
print(f"\nValid for ML: {'✅ Yes' if validation['is_valid'] else '❌ No'}")
if validation['errors']:
    print("\n⚠️ Errors:")
    for e in validation['errors']:
        print(' -', e)
if validation['warnings']:
    print("\n⚠️ Warnings:")
    for w in validation['warnings']:
        print(' -', w)

# Clean a copy so the raw `df` stays available for comparison.
# NOTE: the in-notebook fallback cleaner never reads `detect_outliers`.
df_clean = clean_data(df.copy(), remove_duplicates=True, handle_missing='auto', detect_outliers=True)
print("\n🧹 Cleaning done. Rows:", len(df), '→', len(df_clean))

## 4. Statistical Analysis

In [None]:
# Extended describe() table for every numeric column of the cleaned frame.
print("📊 Numeric Statistics (Enhanced):\n")
numeric_stats = describe_numeric(df_clean)
# Last expression of the cell: the table rounded to three decimals for display.
numeric_stats.round(3)

In [None]:
# Per-column summary (unique count, most frequent value, missing share) of
# the object/category columns of the cleaned frame.
print("📊 Categorical Statistics:\n")
cat_stats = describe_categorical(df_clean)
# Last expression of the cell: rendered as the cell output.
cat_stats

In [None]:
# Pairwise correlations between all numeric columns of the cleaned frame.
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
correlation_matrix = df_clean[numeric_cols].corr()
# Hide the diagonal and the upper triangle so each unordered pair of
# features appears exactly once in the stacked table.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
corr_pairs = correlation_matrix.mask(mask).stack().reset_index()
corr_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']
# Keep pairs with |r| > 0.3 and rank them by strength. Sorting on the
# absolute value keeps strong negative relationships at the top instead of
# pushing them below weak positive ones.
corr_strength = corr_pairs['Correlation'].abs()
strongest_first = corr_strength[corr_strength > 0.3].sort_values(ascending=False)
corr_pairs = corr_pairs.loc[strongest_first.index]
# Last expression of the cell: the ten strongest pairs.
corr_pairs.head(10)

## 5. Univariate Analysis

In [None]:
# Histogram plus a normality test for the first four numeric columns.
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols[:4]:
    # NOTE: the in-notebook `histogram` helper only reads `marginal` from its
    # extra keywords; show_mean / show_median / show_std are accepted but unused.
    fig = histogram(df_clean, col=col, binning_method='freedman_diaconis',
                    show_mean=True, show_median=True, show_std=True, marginal='box',
                    title=f"Distribution of {col}")
    fig.show()
    try:
        # scipy.stats.normaltest on the non-missing values; p > 0.05 is reported as "Normal".
        _, p_value = stats.normaltest(df_clean[col].dropna())
        print(f"\n{col}: Normality test p-value: {p_value:.4f} | ", '✅ Normal' if p_value>0.05 else '❌ Not normal')
    except Exception as e:
        # Best effort: a failing test is reported and the loop moves on.
        print(f"\n{col}: normality test skipped ({e})")
    print('-'*50)

In [None]:
# Bar chart of value frequencies for every object/category column.
# `categorical_cols` is reused by later cells.
categorical_cols = df_clean.select_dtypes(include=['object', 'category']).columns.tolist()
for col in categorical_cols:
    value_counts = df_clean[col].value_counts()
    fig = go.Figure()
    # Category labels are cast to str for the axis; counts double as bar labels.
    fig.add_trace(go.Bar(x=value_counts.index.astype(str), y=value_counts.values, text=value_counts.values, textposition='auto'))
    fig.update_layout(title=f"Distribution of {col}", xaxis_title=col, yaxis_title='Count', template='plotly_white', height=400)
    fig.show()
    # value_counts() is sorted by frequency, so index[0] is the most common value.
    print(f"\n{col}: Unique values: {df_clean[col].nunique()} | Most common: {value_counts.index[0] if len(value_counts)>0 else 'N/A'}")
    print('-'*50)

In [None]:
# Box plot and IQR-based outlier count for the first four numeric columns.
# Relies on `numeric_cols` bound by an earlier cell.
for col in numeric_cols[:4]:
    fig = box(df_clean, y=col, title=f"Box Plot: {col}", points='outliers')
    fig.show()
    Q1, Q3 = df_clean[col].quantile(0.25), df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    # Rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are counted as outliers.
    outliers = df_clean[(df_clean[col] < Q1 - 1.5*IQR) | (df_clean[col] > Q3 + 1.5*IQR)]
    # max(..., 1) avoids dividing by zero for an empty frame.
    print(f"\n{col}: Outliers: {len(outliers)} ({(len(outliers)/max(len(df_clean),1)*100):.2f}%)")
    print('-'*50)

## 6. Bivariate Analysis

In [None]:
# Scatter of the target against every other numeric column, with the
# pairwise correlation printed underneath. `target_col` is reused later.
target_col = 'sales'
for col in numeric_cols:
    if col != target_col:
        # NOTE(review): trendline='ols' presumably needs an extra regression
        # dependency on the Plotly side — confirm it is installed.
        fig = scatter(df_clean, x=col, y=target_col, title=f"{target_col} vs {col}", trendline='ols')
        fig.show()
        # Off-diagonal entry of the 2x2 correlation matrix of the two columns.
        corr = df_clean[[col, target_col]].corr().iloc[0,1]
        print(f"Correlation ({col} vs {target_col}): {corr:.4f}")
        print('-'*50)

In [None]:
# Target distribution per category, followed by a one-way ANOVA across the
# categories. Relies on `categorical_cols` and `target_col` from earlier cells.
for col in categorical_cols:
    fig = box(df_clean, x=col, y=target_col, title=f"{target_col} by {col}", points='outliers')
    fig.show()
    # One sample of target values per distinct (non-missing) category.
    groups = [df_clean[df_clean[col]==cat][target_col].dropna() for cat in df_clean[col].dropna().unique()]
    # The test is only run with at least two groups of more than one value each.
    if len(groups) >= 2 and all(len(g)>1 for g in groups):
        f_stat, p_value = stats.f_oneway(*groups)
        print(f"ANOVA ({col} vs {target_col}): F={f_stat:.4f}, p={p_value:.4f} | ", '✅ Significant' if p_value<0.05 else '❌ Not significant')
    else:
        print(f"ANOVA skipped for {col} (insufficient groups)")
    print('-'*50)

In [None]:
# Heatmap of the Pearson correlations between the numeric columns.
fig = correlation_heatmap(df_clean, method='pearson', annotate=True, cluster=True, cmap='RdBu_r', title='Correlation Matrix (Hierarchical)')
fig.show()

## 7. Multivariate Analysis

In [None]:
# Seaborn pair plot of the first four numeric columns. `sns` is only bound
# when the optional seaborn import succeeded, hence the `_HAS_SNS` guard.
cols_subset = numeric_cols[:4]
if _HAS_SNS and len(cols_subset)>=2:
    g = sns.pairplot(df_clean[cols_subset], diag_kind='kde', plot_kws={'alpha':0.6}, height=2.5)
    # y slightly above 1 places the title over the grid of axes.
    plt.suptitle('Pairwise Relationships', y=1.02, fontsize=14)
    plt.show()
else:
    print('Pairplot skipped (seaborn not installed or insufficient numeric columns).')

In [None]:
# 3D scatter of the first three numeric columns, coloured by the first
# categorical column when one exists.
if len(numeric_cols) >= 3:
    fig = scatter_3d(df_clean, x=numeric_cols[0], y=numeric_cols[1], z=numeric_cols[2], color=categorical_cols[0] if categorical_cols else None,
                     title=f"3D Scatter: {numeric_cols[0]} vs {numeric_cols[1]} vs {numeric_cols[2]}")
    fig.show()

In [None]:
# Parallel-coordinates view of up to five numeric columns, coloured by the
# first categorical column (or by the first plotted column if there is none).
cols_for_parallel = numeric_cols[:5]
if len(cols_for_parallel)>=2:
    color_col = categorical_cols[0] if categorical_cols else cols_for_parallel[0]
    color_values = None
    if color_col in df_clean.columns:
        color_values = df_clean[color_col]
        if not pd.api.types.is_numeric_dtype(color_values):
            # Line colours in a parallel-coordinates plot are mapped on a
            # continuous scale, so label columns are converted to integer
            # category codes (one code per distinct label) first.
            color_values = color_values.astype('category').cat.codes
    fig = px.parallel_coordinates(df_clean, dimensions=cols_for_parallel, color=color_values,
                                  title='Parallel Coordinates Plot')
    fig.show()
else:
    print('Parallel coordinates skipped (insufficient columns).')

## 8. Advanced Visualizations

In [None]:
# Time-series view of sales: one chart with a 7-day smoothing overlay, and a
# second one comparing the raw series with 7- and 30-day moving averages.
if 'date' in df_clean.columns:
    fig = line(df_clean, x='date', y='sales', title='Sales Over Time', show_smoothing=True, smoothing_window=7)
    fig.show()
    # NOTE: these assignments add MA_7 / MA_30 as permanent columns of
    # df_clean, so later cells that summarise or export df_clean include them.
    # The first window-1 rows of each are NaN (rolling mean needs a full window).
    df_clean['MA_7'] = df_clean['sales'].rolling(window=7).mean()
    df_clean['MA_30'] = df_clean['sales'].rolling(window=30).mean()
    fig2 = go.Figure()
    # Raw series is drawn semi-transparent so the averages stand out.
    fig2.add_trace(go.Scatter(x=df_clean['date'], y=df_clean['sales'], name='Sales', mode='lines', opacity=0.5))
    fig2.add_trace(go.Scatter(x=df_clean['date'], y=df_clean['MA_7'], name='7-Day MA', mode='lines'))
    fig2.add_trace(go.Scatter(x=df_clean['date'], y=df_clean['MA_30'], name='30-Day MA', mode='lines'))
    fig2.update_layout(title='Sales with Moving Averages', xaxis_title='Date', yaxis_title='Sales', template='plotly_white', hovermode='x unified')
    fig2.show()

In [None]:
# Violin plots of the first two numeric columns, split by the first
# categorical column.
if len(categorical_cols)>0 and len(numeric_cols)>0:
    for num_col in numeric_cols[:2]:
        fig = violin(df_clean, x=categorical_cols[0], y=num_col, box=True, points='outliers',
                     title=f"{num_col} Distribution by {categorical_cols[0]}")
        fig.show()

In [None]:
# Sunburst of sales aggregated over the first two categorical columns.
# `agg_df` stays None when fewer than two categorical columns exist; the
# treemap cell checks for that before reusing it.
agg_df = None
if len(categorical_cols) >= 2:
    agg_df = df_clean.groupby(categorical_cols[:2])['sales'].agg(['sum', 'mean', 'count']).reset_index()
    # Sector size = total sales, sector colour = mean sales.
    fig = px.sunburst(agg_df, path=categorical_cols[:2], values='sum', title='Sales Distribution by Categories', color='mean', color_continuous_scale='RdYlGn')
    fig.show()

In [None]:
# Treemap of the same aggregation; skipped when the sunburst cell did not
# produce `agg_df` (it is None in that case).
if isinstance(agg_df, pd.DataFrame):
    fig = px.treemap(agg_df, path=categorical_cols[:2], values='sum', color='mean', title='Sales Treemap', color_continuous_scale='Viridis')
    fig.show()

In [None]:
# Filled 2D density contour of the first two numeric columns with marginal
# histograms on both axes.
if len(numeric_cols) >= 2:
    fig = px.density_contour(df_clean, x=numeric_cols[0], y=numeric_cols[1], title=f"Density Contour: {numeric_cols[0]} vs {numeric_cols[1]}",
                              marginal_x='histogram', marginal_y='histogram')
    # Switch the contour lines to filled bands and label the levels.
    fig.update_traces(contours_coloring='fill', contours_showlabels=True)
    fig.show()

## 9. Automated Profiling

In [None]:
# Generate the HTML profile and write it under data/exports/. The step is
# optional: any failure is reported and the notebook carries on.
try:
    profile_html = generate_profile(df_clean, title='Comprehensive Data Profile', dark_mode=True, minimal=False)
    output_path = Path('data/exports/eda_profile_report.html')
    # Create the export directory on first use.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(profile_html, encoding='utf-8')
    print(f"✅ Profile report saved to: {output_path}")
except Exception as e:
    print(f"⚠️ Could not generate profile report: {e}\nThis is optional - continuing...")

## 10. Dashboard Creation

In [None]:
# 2x2 dashboard: sales histogram, sales over time, sales per category of the
# first categorical column, and revenue against sales.
fig = make_subplots(rows=2, cols=2, subplot_titles=(
    'Sales Distribution', 'Sales Over Time', 'Sales by Region', 'Revenue vs Sales'
), specs=[[{"type":"histogram"}, {"type":"scatter"}], [{"type":"box"}, {"type":"scatter"}]])
fig.add_trace(go.Histogram(x=df_clean['sales'], name='Sales'), row=1, col=1)
if 'date' in df_clean.columns:
    fig.add_trace(go.Scatter(x=df_clean['date'], y=df_clean['sales'], mode='lines', name='Sales'), row=1, col=2)
if len(categorical_cols)>0:
    # One box per distinct value of the first categorical column.
    for category in df_clean[categorical_cols[0]].dropna().unique():
        data = df_clean[df_clean[categorical_cols[0]] == category]['sales']
        fig.add_trace(go.Box(y=data, name=str(category)), row=2, col=1)
# NOTE: unlike the 'date' panel, this one is not guarded — it requires a
# 'revenue' column to be present.
fig.add_trace(go.Scatter(x=df_clean['revenue'], y=df_clean['sales'], mode='markers', name='Revenue vs Sales'), row=2, col=2)
fig.update_layout(height=800, showlegend=False, title_text='Comprehensive EDA Dashboard', template='plotly_white')
fig.show()
print('✅ Combined dashboard created!')

## 11. Export & Reports

In [None]:
# Write standalone HTML versions of the key figures to the export directory.
export_dir = Path('data/exports/eda_notebook')
export_dir.mkdir(parents=True, exist_ok=True)
print('📤 Exporting visualizations...')
hist_fig = histogram(df_clean, col='sales', binning_method='freedman_diaconis', title='Sales Distribution')
hist_fig.write_html(str(export_dir / 'sales_histogram.html'))
corr_fig = correlation_heatmap(df_clean, method='pearson', annotate=True, title='Correlation Matrix')
corr_fig.write_html(str(export_dir / 'correlation_heatmap.html'))
# NOTE: `fig` is whatever figure was last bound to that global name; when the
# notebook is run top to bottom this is the dashboard from the previous section.
fig.write_html(str(export_dir / 'eda_dashboard.html'))
print(f"✅ Saved to: {export_dir}")

In [None]:
# Export the cleaned data as CSV and the summary tables as JSON.
# Relies on `export_dir` and `correlation_matrix` bound by earlier cells.
df_clean.to_csv(export_dir / 'cleaned_data.csv', index=False)
summary_stats = {
    'numeric_stats': describe_numeric(df_clean).to_dict(),
    'categorical_stats': describe_categorical(df_clean).to_dict(),
    'correlation_matrix': correlation_matrix.to_dict()
}
# NOTE(review): json.dump only handles built-in Python types — this assumes
# DataFrame.to_dict() yields JSON-serialisable values on the installed pandas; confirm.
with open(export_dir / 'summary_statistics.json', 'w') as f:
    json.dump(summary_stats, f, indent=2)
print('✅ Saved: cleaned_data.csv, summary_statistics.json')