In [None]:
# Exploratory Data Analysis (EDA) Script
# notebooks/eda.ipynb

# Ensure required packages are installed in the notebook environment
import os
import sys

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display
# Make the project's src/ directory importable so `data_processing` resolves.
# The path is resolved to an absolute one and added only when absent, so
# re-running this cell does not pile up duplicate sys.path entries and the
# entry keeps pointing at the same directory if the working directory changes.
SRC_DIR = os.path.abspath('../src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


# Robust import:
# First try the straightforward import. If that fails for any reason, import
# the module itself, list what it exposes, and bind the three expected names
# by attribute lookup (with a few alternative spellings for DataProcessor).
try:
    from data_processing import DataLoader, EDA, DataProcessor
except Exception:
    import importlib

    try:
        dp = importlib.import_module('data_processing')
    except Exception as ie:
        raise ImportError(f"Failed to import data_processing module from ../src: {ie}") from ie

    # Public names of the module, printed to help diagnose naming differences.
    available = [name for name in dir(dp) if not name.startswith('_')]
    print("Available in data_processing:", available)

    DataLoader = getattr(dp, 'DataLoader', None)
    EDA = getattr(dp, 'EDA', None)
    DataProcessor = getattr(dp, 'DataProcessor', None)

    if DataProcessor is None:
        # The first alternative name present on the module wins.
        fallback_names = ('Processor', 'DataProc', 'DataProcessorClass', 'Data_Processor')
        found_alt = next((candidate for candidate in fallback_names if hasattr(dp, candidate)), None)
        if found_alt is not None:
            DataProcessor = getattr(dp, found_alt)
            print(f"Using alternative DataProcessor name: {found_alt}")

    # Anything still unbound is a hard error; report every missing name at once.
    resolved = {'DataLoader': DataLoader, 'EDA': EDA, 'DataProcessor': DataProcessor}
    missing = [name for name, obj in resolved.items() if obj is None]
    if missing:
        raise ImportError(f"Could not import {missing} from data_processing. Check class names/exports. Available: {available}")

# Load raw data
data_path = '../data/raw/data.csv'  # Change filename as needed
loader = DataLoader(data_path)
raw_df = loader.load_data()

# Quick overview of the raw data
raw_eda = EDA(raw_df)
print("Raw data preview:")
print(raw_eda.dataset_overview()['preview'])
print()
print("Raw missing values (diagnostics):")
# `display` / `Markdown` are imported with the other dependencies at the top
# of the notebook rather than in the middle of this cell.
raw_miss = raw_eda.missing_values_table()

# Maps the short codes found in the table's 'Suggestion' column to a readable
# recommendation. Also reused by the processed-data diagnostics further down.
suggestion_map = {
    'none': 'No action needed',
    'median': 'Impute numeric values with median',
    'median_or_model': 'Impute with median or consider model-based imputation',
    'consider_drop_or_model': 'Consider dropping or using model-based imputation',
    'mode': 'Impute categorical values with mode',
    'mode_with_flag': 'Impute with mode and add missing flag',
    'consider_drop_or_new_category': 'Consider dropping column or create a new category'
}

if raw_miss.empty:
    display(Markdown('**Missing Values:** None detected — no NA values found in the raw dataset.'))
    # Keep `disp` defined so later code that expects it won't fail; it also
    # carries the 'Suggestion_Explanation' column that the populated branch adds.
    disp = pd.DataFrame(
        columns=['Missing', 'Percent', 'Dtype', 'Nunique', 'Sample', 'Suggestion', 'Suggestion_Explanation']
    )
else:
    disp = raw_miss.copy()
    disp['Percent'] = disp['Percent'].map(lambda x: f'{x:.1f}%')
    # List/tuple samples are flattened to a comma-separated string for display.
    disp['Sample'] = disp['Sample'].map(lambda s: ', '.join(map(str, s)) if isinstance(s, (list, tuple)) else str(s))
    # A code with no entry in suggestion_map is shown as-is; mapping with the
    # dict directly would silently turn it into NaN.
    disp['Suggestion_Explanation'] = disp['Suggestion'].map(lambda code: suggestion_map.get(code, code))

display(disp)


In [None]:

# 2. Summary Statistics
# Fetch the summary once, then print it under a heading.
summary_stats = raw_eda.summary_statistics()
print("\nSummary Statistics:\n", summary_stats)


In [None]:

# 3. Numerical Feature Distributions
# Collect the numeric columns of the raw frame and keep at most the first six
# so the set of plots stays readable.
numeric_frame = raw_df.select_dtypes(include=np.number)
num_cols = numeric_frame.columns.tolist()[:6]
print('Numeric columns to plot:', num_cols)
raw_eda.plot_numerical_distributions(num_cols=num_cols)


In [None]:

# 4. Categorical Feature Distributions
# Only the first six object-dtype columns of the raw frame are plotted.
cat_cols = raw_df.select_dtypes(include='object').columns.tolist()[:6]
print('Categorical columns to plot:', cat_cols)

# Columns with at most 50 distinct values go through the EDA helper; wider
# columns are reduced to a bar chart of their 20 most frequent values.
for column in cat_cols:
    distinct = raw_df[column].nunique(dropna=True)
    if distinct > 50:
        print(f'Column {column} has high cardinality ({distinct}). Showing top 20 values only.')
        most_frequent = raw_df[column].value_counts().head(20)
        ax = most_frequent.plot(kind='bar', figsize=(10,3))
        ax.set_title(f'Top 20 values for {column}')
        plt.tight_layout()
        plt.show()
        continue
    # reasonable cardinality -> full bar plot
    raw_eda.plot_categorical_distributions(cat_cols=[column], max_unique=50)


In [None]:

# 5. Correlation Analysis
# Ask the raw-data EDA helper for its correlation matrix and keep the result in `corr`.
# NOTE(review): whether correlation_matrix() also draws a plot, and what type it
# returns, is not visible from this notebook — confirm in data_processing.
corr = raw_eda.correlation_matrix()


In [None]:

# 6. Missing Values
# Plain-text dump of the raw missing-values table under its own heading.
missing_table_raw = raw_eda.missing_values_table()
print("\nMissing Values Table (raw):\n", missing_table_raw)


In [None]:

# 7. Outlier Detection
# Runs the EDA helper on `num_cols`, the numeric columns selected in section 3
# (at most six of them).
# NOTE(review): presumably boxplot_outliers renders box plots for those columns —
# confirm against data_processing.
raw_eda.boxplot_outliers(num_cols=num_cols)


In [None]:
# 8. Processing (Task 2): normalize missing values and create processed dataset
proc = DataProcessor(raw_df)
# Use default processing config; customize as needed:
cfg = DataProcessor.ProcessConfig()
processed_df = proc.process(cfg)
processed_eda = EDA(processed_df)

print('Processed data preview:')
print(processed_eda.dataset_overview()['preview'])

print('Missing values (processed):')
# `display` / `Markdown` come from the import block at the top of the notebook.
proc_miss = processed_eda.missing_values_table()
if proc_miss.empty:
    display(Markdown('**Missing Values (processed):** None detected — preprocessing handled NA values.'))
else:
    disp2 = proc_miss.copy()
    disp2['Percent'] = disp2['Percent'].map(lambda x: f'{x:.1f}%')
    # List/tuple samples are flattened to a comma-separated string for display.
    disp2['Sample'] = disp2['Sample'].map(lambda s: ', '.join(map(str, s)) if isinstance(s, (list, tuple)) else str(s))
    # Reuse suggestion_map from the raw-data cell when it exists. A code without
    # an entry (or every code, when the map is absent) is shown unchanged
    # instead of becoming NaN, and no exception is needed for control flow.
    explanations = globals().get('suggestion_map', {})
    disp2['Suggestion_Explanation'] = disp2['Suggestion'].map(lambda code: explanations.get(code, code))
    display(disp2)

# Compare missing counts before / after for visualization
raw_miss = raw_eda.missing_values_table()
proc_miss = processed_eda.missing_values_table()

if raw_miss.empty:
    # Nothing was missing in the raw data. Build the empty comparison directly
    # rather than indexing a 'Missing' column an empty table may not carry.
    cmp = pd.DataFrame(columns=['RawMissing', 'ProcMissing'])
else:
    cmp = raw_miss[['Missing']].rename(columns={'Missing': 'RawMissing'})
    if proc_miss.empty:
        cmp['ProcMissing'] = 0
    else:
        cmp = cmp.join(proc_miss['Missing'].rename('ProcMissing'), how='left')
    # Rows absent from proc_miss come back as NaN from the left join; count
    # them as 0. Reassigning (not inplace) keeps the cell safely re-runnable.
    cmp = cmp.fillna(0)

if not cmp.empty:
    # Show only the 20 columns with the most missing values in the raw data.
    ax = cmp.sort_values('RawMissing', ascending=False).head(20).plot(kind='bar', figsize=(10,4))
    ax.set_xlabel('Column')
    ax.set_ylabel('Missing count')
    ax.set_title('Missing values: raw vs processed (top 20)')
    plt.tight_layout()
    plt.show()


In [None]:

# Save processed and create a train/test split under repo-root data/processed/
processed_csv = '../data/processed/processed_data.csv'
out_path = proc.save_processed(processed_csv)

print('Processed data saved to', os.path.abspath(out_path))

# Provide explicit split paths that point to repo-root `data/processed/`
split_cfg = DataProcessor.SplitConfig(
    train_path='../data/processed/train.csv',
    test_path='../data/processed/test.csv',
)
train_path, test_path = proc.split_save(split_cfg)
# Report both outputs as absolute paths.
saved_split_paths = [os.path.abspath(p) for p in (train_path, test_path)]
print('Train/Test saved to', *saved_split_paths)


In [None]:
# Auto-generate Top Insights from processed data (data-driven)
# (IPython display helpers and numpy are imported once, at the top of the notebook.)

# Choose the frame to inspect, most reliable source first:
#   1. `processed_df` — the frame returned by proc.process(...) in the processing cell
#   2. `proc.df`      — if the processor object exposes one
#   3. the processed CSV written to disk
#   4. the raw frame, as a last resort
# NOTE(review): nothing visible here shows that `proc.df` holds the processed
# data, which is why the explicit `processed_df` result is tried first.
if isinstance(globals().get('processed_df'), pd.DataFrame):
    df_inspect = processed_df.copy()
elif 'proc' in globals() and getattr(proc, 'df', None) is not None:
    df_inspect = proc.df.copy()
elif os.path.exists('../data/processed/processed_data.csv'):
    df_inspect = pd.read_csv('../data/processed/processed_data.csv')
else:
    df_inspect = raw_df.copy()

# Basic shape and the column groups used by the insight sections below.
n_rows, n_cols = df_inspect.shape
numeric_cols = df_inspect.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df_inspect.select_dtypes(include=['object', 'category']).columns.tolist()

# 1) Data concentration in categorical columns
# A column counts as concentrated when its most frequent non-null value covers
# at least half of the non-null rows. Entries are (column, share, top value).
concentrated = []
for column in cat_cols:
    shares = df_inspect[column].value_counts(dropna=True, normalize=True)
    if shares.empty:
        # No non-null values at all: the share is treated as 0, never concentrated.
        continue
    top_share = shares.iloc[0]
    if top_share >= 0.5:
        concentrated.append((column, float(top_share), shares.index[0]))

# 2) Missing value summary
# Per-column NA count and percentage of rows, restricted to columns that
# actually have missing values and ordered from worst to best.
miss = df_inspect.isna().sum()
row_denominator = max(1, n_rows)  # avoids dividing by zero on an empty frame
miss_pct = miss.div(row_denominator).mul(100).round(1)
miss_table = (
    pd.DataFrame({'Missing': miss, 'Percent': miss_pct})
    .loc[lambda table: table['Missing'] > 0]
    .sort_values('Missing', ascending=False)
)
top_missing = miss_table.head(5)

# 3) Outlier detection (z-score heuristic)
def _zscore_outlier_count(values):
    """Count non-null entries whose |z-score| (population std, ddof=0) exceeds 3.

    Returns 0 when there are no non-null values or the spread is not positive.
    """
    observed = values.dropna()
    if observed.shape[0] == 0:
        return 0
    sigma = observed.std(ddof=0)
    if not sigma > 0:
        return 0
    z_scores = (observed - observed.mean()) / sigma
    return int((z_scores.abs() > 3).sum())


outlier_counts = {name: _zscore_outlier_count(df_inspect[name]) for name in numeric_cols}
outliers_df = pd.Series(outlier_counts).sort_values(ascending=False).rename('OutlierCount')
# Keep only columns that have outliers; show at most five.
top_outliers = outliers_df[outliers_df > 0].head(5)

# 4) Strong correlations among numeric features
# Collect every unordered pair of numeric columns whose absolute correlation is
# at least 0.7, strongest first. Entries are (feature_a, feature_b, |r|).
corr_pairs = []
if len(numeric_cols) >= 2:
    # Stored under its own name so the `corr` result from the correlation
    # section earlier in the notebook is not overwritten.
    abs_corr = df_inspect[numeric_cols].corr().abs()
    labels = abs_corr.columns
    # Walk the upper triangle by position; positional access always yields a
    # scalar, even if two columns share a label.
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            val = abs_corr.iat[i, j]
            if pd.notna(val) and val >= 0.7:
                corr_pairs.append((labels[i], labels[j], float(val)))
    corr_pairs.sort(key=lambda pair: pair[2], reverse=True)

# 5) Categorical diversity (high-cardinality columns)
# (column, number of distinct non-null values) for each categorical column,
# plus the same list ordered from most to least distinct values.
cat_card = []
for name in cat_cols:
    distinct_count = int(df_inspect[name].nunique(dropna=True))
    cat_card.append((name, distinct_count))
cat_card_sorted = sorted(cat_card, key=lambda pair: pair[1], reverse=True)

# Build human-friendly insights
def _format_cardinality(pairs):
    """Render (column, distinct-count) pairs as a comma-separated Markdown snippet."""
    return ", ".join([f"`{name}`({count})" for name, count in pairs])


# Every entry is one Markdown bullet; the dataset size always comes first.
insights_lines = [f"**Dataset:** {n_rows:,} rows × {n_cols:,} columns."]

# Concentration: one line per concentrated column, or a single "none" line.
if not concentrated:
    insights_lines.append("**Concentration:** No categorical column has a single value covering >=50% of non-null rows.")
else:
    insights_lines.extend(
        f"**Concentration:** `{name}` is concentrated: top value `{topv}` covers {share*100:.1f}% of non-null rows."
        for name, share, topv in concentrated
    )

# Missing values: the worst columns with count and percentage.
if top_missing.empty:
    insights_lines.append("**Missing values:** No missing values detected after processing.")
else:
    rows = []
    for idx, r in top_missing.iterrows():
        rows.append(f"`{idx}`: {int(r['Missing']):,} ({r['Percent']}%)")
    insights_lines.append("**Missing values (top columns):** " + "; ".join(rows))

# Outliers: numeric columns with the most |z| > 3 values.
if top_outliers.empty:
    insights_lines.append("**Outliers:** No extreme outliers (z-score > 3) found in numeric columns.")
else:
    out_rows = [f"`{idx}`: {int(cnt)} outliers" for idx, cnt in top_outliers.items()]
    insights_lines.append("**Outliers (z-score > 3), top numeric columns:** " + "; ".join(out_rows))

# Correlations: at most the five strongest pairs.
if not corr_pairs:
    insights_lines.append("**Correlations:** No strong correlations (|r|>=0.7) found among numeric features.")
else:
    top_corr = ", ".join(f"`{a}`↔`{b}`={val:.2f}" for a, b, val in corr_pairs[:5])
    insights_lines.append("**Strong correlations (|r|>=0.7):** " + top_corr)

# Cardinality: the three most and the three least diverse categorical columns.
if cat_card_sorted:
    top_high = cat_card_sorted[:3]
    top_low = sorted(cat_card_sorted, key=lambda x: x[1])[:3]
    insights_lines.append("**Categorical diversity:** top high-cardinality columns: " + _format_cardinality(top_high))
    insights_lines.append("**Categorical diversity:** top low-cardinality columns: " + _format_cardinality(top_low))

# Display
display(Markdown("### Top EDA Insights (data-driven)"))
# Each insight is rendered as its own Markdown bullet.
for bullet in (Markdown(f"- {line}") for line in insights_lines):
    display(bullet)

# Also show supporting small tables for quick inspection
# Sections are queued in display order and skipped when there is nothing to show.
supporting_tables = []
if not miss_table.empty:
    supporting_tables.append(('**Missing values (top 10)**', miss_table.head(10)))
if not top_outliers.empty:
    supporting_tables.append(('**Outlier counts (top numeric columns)**', top_outliers.head(10)))
if corr_pairs:
    pair_table = pd.DataFrame(corr_pairs, columns=['feature_a', 'feature_b', 'abs_corr'])
    supporting_tables.append(('**Strong correlation pairs (|r|>=0.7)**', pair_table.head(10)))

for heading, table in supporting_tables:
    display(Markdown(heading))
    display(table)
