In [None]:
# exploratory_analysis.ipynb

# -------------------------------
# 1. Setup & Import Libraries
# -------------------------------
!pip install pandas numpy seaborn matplotlib plotly scikit-learn -q

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split

# -------------------------------
# 2. Load Dataset
# -------------------------------
# Load the train and test splits from CSV.
# Replace with actual dataset paths
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Preview the first few rows of the training data. The preview is printed
# explicitly: a bare expression in the middle of a cell is evaluated and then
# discarded, because only a cell's final expression is auto-displayed.
print(train_df.head())

# -------------------------------
# 3. Dataset Overview
# -------------------------------
# Report the (rows, columns) shape of both splits and the training column names.
train_shape, test_shape = train_df.shape, test_df.shape
train_columns = train_df.columns.tolist()

print(f"Train Data Shape: {train_shape}")
print(f"Test Data Shape: {test_shape}")
print(f"Columns in Train Data: {train_columns}")

# -------------------------------
# 4. Summary Statistics
# -------------------------------
# Print the describe() summary of the training data under a heading.
print("Summary Statistics for Numerical Features:")
train_summary = train_df.describe()
print(train_summary)

# Count the missing values in every column of each split.
missing_train = train_df.isna().sum()
missing_test = test_df.isna().sum()

# Report only the columns that actually contain missing values.
train_gaps = missing_train.loc[missing_train > 0]
test_gaps = missing_test.loc[missing_test > 0]
print(f"Missing values in Train Data:\n{train_gaps}")
print(f"Missing values in Test Data:\n{test_gaps}")

# -------------------------------
# 5. Data Distribution and Visualization
# -------------------------------
# Bar chart of how many training rows carry each value of `target`
# (the notebook assumes a binary classification target).
target_ax = sns.countplot(data=train_df, x='target')
target_ax.set_title("Target Variable Distribution")
plt.show()

# Histogram with a KDE overlay for every numeric column of the training data.
# Columns are selected with the generic 'number' selector rather than the exact
# 'float64'/'int64' names, so numeric columns stored at another width
# (e.g. int32, float32) are not silently left out of the plots.
numerical_features = train_df.select_dtypes(include='number').columns

for feature in numerical_features:
    # One figure per feature, shown immediately so the plots do not overlap.
    plt.figure(figsize=(8, 4))
    sns.histplot(train_df[feature], kde=True, bins=30)
    plt.title(f"Distribution of {feature}")
    plt.show()

# -------------------------------
# 6. Correlation Matrix for Numerical Features
# -------------------------------
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap with two-decimal cell labels.
corr_matrix = train_df.loc[:, numerical_features].corr()

corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=corr_ax)
corr_ax.set_title("Correlation Matrix")
plt.show()

# -------------------------------
# 7. Categorical Feature Analysis (if applicable)
# -------------------------------
# Columns stored with the object dtype are treated as the categorical features.
categorical_features = train_df.select_dtypes(include='object').columns

# Draw one count plot per categorical column, with slanted x tick labels.
for feature in categorical_features:
    cat_fig, cat_ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=train_df, x=feature, ax=cat_ax)
    cat_ax.set_title(f"Distribution of {feature}")
    plt.xticks(rotation=45)
    plt.show()

# -------------------------------
# 8. Feature Engineering
# -------------------------------
# Example: one-hot encode the categorical columns of the training frame.
# drop_first=True omits the first level of each encoded column.
# NOTE(review): get_dummies is applied to the whole frame, so a non-numeric
# 'target' column would presumably be encoded as well and the 'target' lookups
# below would then fail - confirm the target is numeric.
train_df_encoded = pd.get_dummies(train_df, drop_first=True)

# Separate the feature matrix from the target column.
X = train_df_encoded.drop(columns=['target'])
y = train_df_encoded['target']

# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# class proportions the same in the training and validation parts, which a
# purely random split does not guarantee (the notebook assumes a
# classification target). random_state fixes the shuffle for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# 9. Insights and Next Steps
# -------------------------------
# Recap: overall missing-value count and the class balance of the target.
total_missing = train_df.isna().sum().sum()
class_counts = train_df['target'].value_counts()

print(f"Total Missing Values in Train Dataset: {total_missing}")
print(f"Target Class Distribution in Training Set: {class_counts}")
print("Feature Engineering: Completed basic encoding and feature engineering.")

# Write the encoded training frame to disk, without the index column, for
# further modeling.
train_df_encoded.to_csv(path_or_buf='data/processed_train.csv', index=False)

