In [None]:
# Data Exploration

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the parent directory importable so the `src` package can be resolved.
# NOTE(review): this is relative to the current working directory — confirm the
# notebook is always launched from its own folder.
sys.path.append(os.pardir)
from src.data.preprocessing import clean_data, load_data

# Plot appearance: the matplotlib 'ggplot' style is applied first, then the
# seaborn theme is set on top of it.
plt.style.use('ggplot')
sns.set_theme()

# Build a reproducible synthetic dataset for exploration.
# Every draw below uses the seeded global NumPy RNG, so the order of the
# np.random calls determines the generated values.
np.random.seed(42)
n_samples = 1000

# Raw features: two Gaussians, one uniform and one three-level categorical
data = dict(
    feature1=np.random.normal(0, 1, n_samples),
    feature2=np.random.normal(5, 2, n_samples),
    feature3=np.random.uniform(0, 10, n_samples),
    categorical=np.random.choice(['A', 'B', 'C'], n_samples),
)

# Regression target: linear combination of the numeric features plus Gaussian noise
data['target'] = (
    0.5 * data['feature1']
    - 0.2 * data['feature2']
    + 0.1 * data['feature3']
    + np.random.normal(0, 0.5, n_samples)
)
# Classification target: 1 where the regression target is positive, else 0
data['target_binary'] = np.greater(data['target'], 0).astype(int)

# Assemble everything into a DataFrame
df = pd.DataFrame(data)

# Blank out roughly 5% of the rows; the same row mask is applied to both
# columns, so their missing values always coincide.
mask = np.random.rand(n_samples) < 0.05
for _col in ('feature1', 'categorical'):
    df.loc[mask, _col] = np.nan

# Dataset overview: dimensions, per-column dtypes and missing-value counts
print("Dataset shape:", df.shape)
print("\nColumn types:")
print(df.dtypes)
print("\nMissing values per column:")
print(df.isna().sum())

# Display first few rows.
# Printed explicitly: a bare `df.head()` is only rendered when it is the last
# expression of a notebook cell; anywhere else (or when this file is run as a
# script) its result is silently discarded.
print("\nFirst rows:")
print(df.head())

# Summary statistics (include='all' requests every column, not only numeric
# ones). Printed explicitly for the same reason as above.
print("\nSummary statistics:")
print(df.describe(include='all'))

# Run the project's cleaning step on a copy so the raw `df` stays untouched.
# NOTE(review): what clean_data does with missing values is defined in
# src.data.preprocessing — the report below shows what remains afterwards.
df_clean = clean_data(df.copy())
print("Missing values after cleaning:", df_clean.isna().sum(), sep="\n")

# Distribution of every feature on a 2x2 grid: histograms with a KDE overlay
# for the three numeric features, a count plot for the categorical one.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for position, feature in enumerate(('feature1', 'feature2', 'feature3')):
    panel = axes[position]
    sns.histplot(df_clean[feature], kde=True, ax=panel)
    panel.set_title(f'Distribution of Feature {position + 1}')

# The last panel holds the category frequencies
sns.countplot(x='categorical', data=df_clean, ax=axes[3])
axes[3].set_title('Distribution of Categorical Feature')

plt.tight_layout()
plt.show()

# Pairwise correlations between the numeric features and both targets,
# drawn as an annotated heatmap on a fixed [-1, 1] colour scale.
numeric_columns = ['feature1', 'feature2', 'feature3', 'target', 'target_binary']
corr_matrix = df_clean[numeric_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Matrix')
plt.show()

# Scatter each numeric feature against the regression target, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for position, feature in enumerate(('feature1', 'feature2', 'feature3')):
    panel = axes[position]
    sns.scatterplot(x=feature, y='target', data=df_clean, ax=panel)
    panel.set_title(f'Feature {position + 1} vs. Target')

plt.tight_layout()
plt.show()

# Persist the raw dataset (`df`, i.e. the frame that still contains the
# injected missing values) as CSV without the index column.
output_dir = '../data/raw'
output_path = os.path.join(output_dir, 'synthetic_data.csv')

# Create the target folder if it does not exist yet, then write the file
os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"Dataset saved to {output_path}")