# Exploratory Data Analysis (EDA)
This notebook contains exploratory data analysis for the ML regression project.

## Objectives:
- Load and explore the dataset
- Analyze variable distributions
- Identify correlations
- Detect outliers and missing values
- Summarize findings and recommendations to prepare the data for modeling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults for every figure in this notebook.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the legacy 'seaborn' name. Pick whichever is
# available instead of failing on import-time setup; if neither exists the
# Matplotlib default style is kept.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

print("Libraries imported successfully")

## 1. Data Loading

In [None]:
# Load the dataset
# NOTE: Replace 'your_dataset.csv' with the actual path to your file
# Produces the global DataFrame `df`: either the CSV contents or, when the
# file does not exist, a synthetic 1000-row demonstration dataset.
file_path = 'your_dataset.csv'

try:
    df = pd.read_csv(file_path)
    print(f"Dataset loaded successfully: {df.shape[0]} rows and {df.shape[1]} columns")
except FileNotFoundError:
    # Only a missing file triggers the fallback; any other read error
    # (e.g. a parsing failure) propagates to the caller.
    print("File not found. Please adjust the 'file_path' variable with the correct path.")
    # Create example dataset for demonstration with 10 features
    # Fixed seed so the synthetic data is identical on every run. The values
    # depend on the order of the np.random calls below, so do not reorder them.
    np.random.seed(42)
    n_samples = 1000
    df = pd.DataFrame({
        # Each feature is drawn from a different distribution
        'feature_1': np.random.normal(50, 15, n_samples),
        'feature_2': np.random.exponential(2, n_samples),
        'feature_3': np.random.uniform(0, 100, n_samples),
        'feature_4': np.random.gamma(2, 2, n_samples),
        'feature_5': np.random.beta(2, 5, n_samples) * 100,
        'feature_6': np.random.lognormal(1, 0.5, n_samples),
        'feature_7': np.random.weibull(1.5, n_samples) * 50,
        'feature_8': np.random.poisson(5, n_samples),
        'feature_9': np.random.triangular(0, 50, 100, n_samples),
        'feature_10': np.random.pareto(3, n_samples) * 10,
        # Placeholder only: this column is overwritten right after the
        # DataFrame is built, but the draw still advances the random stream.
        'target': np.random.normal(75, 20, n_samples)
    })
    # Add artificial correlation with multiple features
    # The target is a weighted sum of features 1, 3, 4, 6, 7 and 9 plus
    # Gaussian noise; features 2, 5, 8 and 10 do not enter the formula.
    df['target'] = (0.25 * df['feature_1'] + 0.15 * df['feature_3'] + 0.1 * df['feature_4'] + 
                   0.08 * df['feature_6'] + 0.12 * df['feature_7'] + 0.05 * df['feature_9'] + 
                   np.random.normal(0, 10, n_samples))
    print(f"Example dataset created: {df.shape[0]} rows and {df.shape[1]} columns")

## 2. General Dataset Information

In [None]:
# Overview of the dataset: shape, per-column dtypes and memory footprint
BYTES_PER_MB = 1024**2

print("=== GENERAL INFORMATION ===")
print(f"Dataset shape: {df.shape}")
print("\nData types:")
print(df.dtypes)
# deep=True also counts the payload of object (e.g. string) columns
total_bytes = df.memory_usage(deep=True).sum()
print(f"\nMemory usage: {total_bytes / BYTES_PER_MB:.2f} MB")

In [None]:
# Preview both ends of the dataset (five rows each, the pandas default)
previews = (
    ("=== FIRST 5 ROWS ===", df.head()),
    ("\n=== LAST 5 ROWS ===", df.tail()),
)
for banner, frame in previews:
    print(banner)
    display(frame)

In [None]:
# Summary statistics as produced by DataFrame.describe()
print("=== DESCRIPTIVE STATISTICS ===")
summary_stats = df.describe()
display(summary_stats)

## 3. Missing Values Analysis

In [None]:
# Count missing values per column and express them as a share of all rows
missing_data = df.isnull().sum()
missing_percent = missing_data.div(len(df)).mul(100)

missing_df = pd.DataFrame({
    'Variable': df.columns,
    'Missing Values': missing_data,
    'Percentage': missing_percent
})

# Keep only the columns that actually have gaps, worst offenders first
has_gaps = missing_df['Missing Values'].gt(0)
missing_df = missing_df.loc[has_gaps].sort_values(by='Percentage', ascending=False)

if missing_df.empty:
    print("✅ No missing values found in the dataset")
else:
    print("=== MISSING VALUES ===")
    display(missing_df)

    # Heatmap of the null mask: one cell per (row, column) entry of the data
    plt.figure(figsize=(10, 6))
    sns.heatmap(
        df.isnull(),
        cbar=True,
        xticklabels=True,
        yticklabels=False,
        cmap='viridis',
    )
    plt.title('Missing Values Pattern')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 4. Distribution Analysis

In [None]:
# Distributions of numeric variables: one histogram (plus KDE curve) per
# numeric column, laid out on a grid with three plots per row.
numeric_cols = df.select_dtypes(include=[np.number]).columns
n_cols = len(numeric_cols)
n_rows = (n_cols + 2) // 3  # ceil(n_cols / 3)

if n_cols == 0:
    # plt.subplots rejects a zero-row grid, so there is nothing to draw
    print("No numeric columns available for distribution plots")
else:
    # squeeze=False always returns a 2-D array of Axes, whatever the grid
    # size, so a single flatten() is enough. (The grid always has three
    # columns, so even one numeric column yields three Axes.)
    fig, axes = plt.subplots(n_rows, 3, figsize=(18, 6*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, numeric_cols):
        # Histogram normalised to a density so it shares the KDE's scale
        df[col].hist(bins=30, density=True, alpha=0.7, ax=ax, color='skyblue')

        # Density curve; a KDE cannot be estimated for a column with fewer
        # than two distinct values, so skip it for such columns
        if df[col].nunique() > 1:
            df[col].plot.density(ax=ax, color='red', linewidth=2)

        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Density')
        ax.grid(True, alpha=0.3)

    # Hide the unused axes in the last grid row
    for ax in axes[n_cols:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Box plots to detect outliers: one plot per numeric column, three per row.
# Relies on numeric_cols, n_cols and n_rows from the distribution cell.
if n_cols == 0:
    # plt.subplots rejects a zero-row grid, so there is nothing to draw
    print("No numeric columns available for box plots")
else:
    # squeeze=False always returns a 2-D array of Axes, so flatten() gives a
    # flat sequence for any number of columns (including exactly one)
    fig, axes = plt.subplots(n_rows, 3, figsize=(18, 6*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, numeric_cols):
        df.boxplot(column=col, ax=ax)
        ax.set_title(f'Box Plot of {col}')
        ax.set_ylabel(col)

    # Hide the unused axes in the last grid row
    for ax in axes[n_cols:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlations between all numeric columns
correlation_matrix = df[numeric_cols].corr()

# The matrix is symmetric: mask the upper triangle (diagonal included) so
# each pair is shown only once
upper_triangle = np.triu(np.ones_like(correlation_matrix, dtype=bool))

heatmap_options = {
    'mask': upper_triangle,
    'annot': True,
    'cmap': 'RdBu_r',
    'center': 0,
    'square': True,
    'fmt': '.2f',
    'cbar_kws': {'shrink': 0.8},
}

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Rank every numeric column by the strength of its correlation with the target
if 'target' in df.columns:
    unsorted_corr = df[numeric_cols].corrwith(df['target'])
    # Order by magnitude so strong negative correlations rank as high as
    # strong positive ones
    target_correlations = unsorted_corr.sort_values(key=abs, ascending=False)

    # Red bars for negative correlations, green for everything else
    colors = np.where(target_correlations.values < 0, 'red', 'green').tolist()

    plt.figure(figsize=(10, 6))
    target_correlations.plot(kind='barh', color=colors)
    plt.title('Variable Correlations with Target')
    plt.xlabel('Correlation')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("=== CORRELATIONS WITH TARGET ===")
    # The target's correlation with itself is left out of the printed list
    feature_correlations = (
        (name, value)
        for name, value in target_correlations.items()
        if name != 'target'
    )
    for name, value in feature_correlations:
        print(f"{name}: {value:.3f}")

## 6. Outlier Detection

In [None]:
# Outlier detection using IQR method
def detect_outliers_iqr(data, column, multiplier=1.5):
    """Flag the rows of *data* whose *column* value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the column. Values exactly
    on a fence are not flagged, and missing values are never flagged because
    comparisons with NaN are false.

    Args:
        data: DataFrame holding the column to inspect.
        column: Label of the column to test.
        multiplier: Width of the fences in IQR units. The default of 1.5
            reproduces the original hard-coded behaviour.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)`` where ``outliers``
        is the subset of rows of *data* outside the fences.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    outliers = data[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

def _outlier_row(column_name):
    """Build one summary record (count, share, fences) for a numeric column."""
    flagged, low, high = detect_outliers_iqr(df, column_name)
    n_flagged = len(flagged)
    return {
        'Variable': column_name,
        'Outliers': n_flagged,
        'Percentage': (n_flagged / len(df)) * 100,
        'Lower Bound': low,
        'Upper Bound': high
    }


# One record per numeric column, collected into a table for display
outlier_summary = [_outlier_row(name) for name in numeric_cols]

outlier_df = pd.DataFrame(outlier_summary)
print("=== OUTLIERS SUMMARY (IQR Method) ===")
display(outlier_df)

## 7. Normality Analysis

In [None]:
# Normality tests: Shapiro-Wilk for up to 5000 observations, Anderson-Darling
# for larger samples. Produces normality_df with one row per numeric column.
normality_results = []

for col in numeric_cols:
    # Drop missing values once and reuse the result for every test
    values = df[col].dropna()

    if len(values) < 3 or values.nunique() < 2:
        # Shapiro-Wilk needs at least three observations, and a constant
        # column carries no distributional information; report the column
        # as not testable instead of raising or returning a meaningless result
        normality_results.append({
            'Variable': col,
            'Test': 'N/A',
            'Statistic': np.nan,
            'p-value': np.nan,
            'Normal': 'N/A'
        })
        continue

    if len(values) <= 5000:  # Shapiro-Wilk for small samples
        stat, p_value = stats.shapiro(values)
        test_name = 'Shapiro-Wilk'
    else:  # Anderson-Darling for large samples
        result = stats.anderson(values)
        stat = result.statistic
        # This code path has no exact p-value: critical_values[2] is the 5%
        # critical value for the normal case, so the statistic is mapped to
        # a coarse placeholder (0.05 = rejected at 5%, 0.1 = not rejected)
        p_value = 0.05 if stat > result.critical_values[2] else 0.1  # Approximation
        test_name = 'Anderson-Darling'

    is_normal = p_value > 0.05
    normality_results.append({
        'Variable': col,
        'Test': test_name,
        'Statistic': stat,
        'p-value': p_value,
        'Normal': 'Yes' if is_normal else 'No'
    })

normality_df = pd.DataFrame(normality_results)
print("=== NORMALITY TESTS ===")
display(normality_df)

## 8. Scatter Plots with Target

In [None]:
# Scatter plots of variables vs target, each with a fitted linear trend line,
# laid out on a grid with three plots per row.
if 'target' in df.columns:
    feature_cols = [col for col in numeric_cols if col != 'target']
    n_features = len(feature_cols)
    n_rows = (n_features + 2) // 3  # ceil(n_features / 3)

    if n_features == 0:
        # plt.subplots rejects a zero-row grid, so there is nothing to draw
        print("No numeric features available for scatter plots")
    else:
        # squeeze=False always returns a 2-D array of Axes, so flatten()
        # gives a flat sequence even when there is exactly one feature
        fig, axes = plt.subplots(n_rows, 3, figsize=(18, 6*n_rows), squeeze=False)
        axes = axes.flatten()

        for ax, col in zip(axes, feature_cols):
            ax.scatter(df[col], df['target'], alpha=0.6, s=20)

            # Trend line: fit only on rows where both values are present
            # (np.polyfit cannot handle NaN) and only when x actually varies
            # (a constant x has no defined slope)
            pairs = df[[col, 'target']].dropna()
            if len(pairs) >= 2 and pairs[col].nunique() > 1:
                z = np.polyfit(pairs[col], pairs['target'], 1)
                p = np.poly1d(z)
                ax.plot(pairs[col], p(pairs[col]), "r--", alpha=0.8, linewidth=2)

            ax.set_xlabel(col)
            ax.set_ylabel('Target')
            ax.set_title(f'{col} vs Target')
            ax.grid(True, alpha=0.3)

        # Hide the unused axes in the last grid row
        for ax in axes[n_features:]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

## 9. Preliminary Feature Importance Analysis

In [None]:
# Feature importance using absolute correlation and mutual information.
# Produces importance_df (one row per numeric feature) and a side-by-side
# bar chart of both scores.
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler

if 'target' in df.columns:
    feature_cols = [col for col in numeric_cols if col != 'target']
    X = df[feature_cols]
    y = df['target']

    # Remove rows with missing values
    mask = ~(X.isnull().any(axis=1) | y.isnull())
    X_clean = X[mask]
    y_clean = y[mask]

    if not feature_cols or X_clean.empty:
        # The scaler and the estimator both need at least one feature and
        # one complete row
        print("Not enough complete numeric data to estimate feature importance")
    else:
        # Normalize for mutual information
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_clean)

        # Calculate mutual information. The estimator is randomised, so a
        # fixed random_state keeps the scores stable from run to run
        mi_scores = mutual_info_regression(X_scaled, y_clean, random_state=42)

        # Create DataFrame with results
        importance_df = pd.DataFrame({
            'Variable': feature_cols,
            'Abs_Correlation': [abs(df[col].corr(df['target'])) for col in feature_cols],
            'Mutual_Information': mi_scores
        })

        # Normalize mutual information for comparison (best feature = 1.0).
        # If every score is zero, keep zeros instead of dividing by zero
        max_mi = importance_df['Mutual_Information'].max()
        if max_mi > 0:
            importance_df['MI_Normalized'] = importance_df['Mutual_Information'] / max_mi
        else:
            importance_df['MI_Normalized'] = 0.0

        importance_df = importance_df.sort_values('MI_Normalized', ascending=False)

        print("=== PRELIMINARY FEATURE IMPORTANCE ===")
        display(importance_df)

        # Visualization
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Absolute correlation
        importance_df.set_index('Variable')['Abs_Correlation'].plot(kind='barh', ax=ax1, color='lightblue')
        ax1.set_title('Importance by Absolute Correlation')
        ax1.set_xlabel('Absolute Correlation')

        # Mutual Information
        importance_df.set_index('Variable')['MI_Normalized'].plot(kind='barh', ax=ax2, color='lightgreen')
        ax2.set_title('Importance by Mutual Information')
        ax2.set_xlabel('Normalized MI')

        plt.tight_layout()
        plt.show()

## 10. Summary and Conclusions

In [None]:
# Text recap of the findings from the previous cells. Relies on missing_df,
# outlier_df, target_correlations and normality_df computed above.
print("=== EXPLORATORY DATA ANALYSIS SUMMARY ===")
print("\n📊 GENERAL INFORMATION:")
print(f"   • Dataset: {df.shape[0]} rows x {df.shape[1]} columns")
print(f"   • Numeric variables: {len(numeric_cols)}")
print(f"   • Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

if len(missing_df) > 0:
    print("\n❌ MISSING VALUES:")
    for _, row in missing_df.iterrows():
        print(f"   • {row['Variable']}: {row['Missing Values']} ({row['Percentage']:.1f}%)")
else:
    print("\n✅ MISSING VALUES: None")

print("\n🎯 DETECTED OUTLIERS:")
found_outliers = False
for _, row in outlier_df.iterrows():
    if row['Outliers'] > 0:
        found_outliers = True
        print(f"   • {row['Variable']}: {row['Outliers']} ({row['Percentage']:.1f}%)")
if not found_outliers:
    # Say so explicitly rather than leaving a bare header, mirroring the
    # missing-values section
    print("   • None")

if 'target' in df.columns:
    print("\n🔗 STRONGEST CORRELATIONS WITH TARGET:")
    # errors='ignore' keeps this from raising KeyError when 'target' is not
    # one of the correlated (numeric) columns
    top_corrs = target_correlations.drop('target', errors='ignore').head(3)
    for var, corr in top_corrs.items():
        print(f"   • {var}: {corr:.3f}")

print("\n📈 DISTRIBUTIONS:")
normal_vars = normality_df[normality_df['Normal'] == 'Yes']['Variable'].tolist()
if normal_vars:
    print(f"   • Variables with normal distribution: {', '.join(normal_vars)}")
else:
    print("   • No variables follow normal distribution")

print("\n🚀 RECOMMENDATIONS FOR MODELING:")
print("   • Consider transformations for non-normal variables")
print("   • Evaluate outlier treatment")
print("   • Use feature scaling for scale-sensitive algorithms")
print("   • Consider feature engineering based on correlations")
if len(missing_df) > 0:
    print("   • Implement imputation strategy for missing values")