#### Final Data Preparation and EDA Script
  - **AWS S3 Configuration**
    - Define bucket and prefixes
    - Set up local directories
    - Initialize S3 client

  - **Fetching Latest Data**
    - Retrieve latest train and API files from S3

  - **Downloading Data**
    - Load CSV files from S3 into Pandas DataFrames

  - **Data Preprocessing**
    - Encode Categorical Variables
    - Cap Outliers at the IQR Bounds (1.5 × IQR)

  - **Exploratory Data Analysis (EDA)**
    - Generate Statistical Summary
    - Save and Upload EDA Reports
    - Generate and Upload Visualizations
      - **Histograms**
        - Credit Score
        - Age
        - Balance
        - Number of Products
      - **Box Plots**
        - Credit Score
        - Age
        - Balance
      - **Correlation Heatmap**

  - **Saving and Uploading Processed Data**
    - Save processed data locally
    - Upload processed data to S3

  - **Main Execution Flow**
    - Process Train Data
    - Process API Data
    - Execute all steps sequentially


In [1]:
import os
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
from io import StringIO
from datetime import datetime

# ---------------------- AWS S3 Configuration ----------------------
# Bucket that holds the raw inputs as well as every artifact this script writes.
S3_BUCKET = "dmml-bank-churn-data"
# Key prefix that is listed when searching for the latest train/API files.
S3_PREFIX = "raw_data/"
# Key prefix under which the processed CSV files are uploaded.
OUTPUT_PREFIX = "processed_data/"
# Key prefix under which the EDA report and plot images are uploaded.
EDA_PREFIX = "eda_reports/"
# NOTE(review): absolute, machine-specific Windows path — the script is not
# portable to another machine until this is changed.
LOCAL_DIR = "C:/Users/LENOVO/Documents/Study/M.Tech Data Science Pilani/Sem-2/Data Management for Machine Learning-ZG529/Assignment-1/Files"
# Local folders that receive the EDA artifacts and the processed CSV files.
LOCAL_EDA_DIR = os.path.join(LOCAL_DIR, "EDA_Reports")
LOCAL_PROCESSED_DIR = os.path.join(LOCAL_DIR, "Processed_Data")

# Created at import time (including parents); already-existing folders are fine.
os.makedirs(LOCAL_PROCESSED_DIR, exist_ok=True)
os.makedirs(LOCAL_EDA_DIR, exist_ok=True)

# Shared S3 client used by every function below. No credentials or region are
# passed here, so they must come from boto3's own configuration lookup.
s3_client = boto3.client("s3")

def get_latest_s3_files():
    """Return the keys of the most recently modified train and API files.

    Every object under ``S3_PREFIX`` in ``S3_BUCKET`` is listed. The listing
    is paginated, because a single ``list_objects_v2`` call returns at most
    one page of keys and the newest file could otherwise be missed on a
    large prefix. Objects are ordered newest-first by ``LastModified`` and
    the first key containing "train" / "api" (case-insensitive, matched
    against the whole key) is selected.

    Returns:
        tuple: ``(train_key, api_key)``. Either element is ``None`` when no
        key matches; ``(None, None)`` when nothing is listed at all.
    """
    paginator = s3_client.get_paginator("list_objects_v2")

    objects = []
    for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_PREFIX):
        # Pages of an empty listing carry no "Contents" entry.
        objects.extend(page.get("Contents", []))

    if not objects:
        return None, None

    objects.sort(key=lambda obj: obj["LastModified"], reverse=True)

    def newest_key_containing(token):
        """Return the newest key whose lowercase form contains *token*."""
        return next(
            (obj["Key"] for obj in objects if token in obj["Key"].lower()),
            None,
        )

    return newest_key_containing("train"), newest_key_containing("api")

def download_s3_file(file_key):
    """Read the CSV object stored at *file_key* in ``S3_BUCKET``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when fetching or parsing
        fails for any reason (the failure is logged, not raised).
    """
    try:
        response = s3_client.get_object(Bucket=S3_BUCKET, Key=file_key)
        # The streaming body is handed to pandas directly.
        return pd.read_csv(response["Body"])
    except Exception as err:
        logging.error(f"Error downloading {file_key}: {err}")
    return None

def encode_categorical(df):
    """Return a copy of *df* with ``Gender`` and ``Geography`` encoded.

    ``Gender`` is mapped to integers ("Male" -> 0, "Female" -> 1) and
    ``Geography`` is one-hot encoded into ``Geography_<value>`` columns
    (no level is dropped). The input frame is left untouched, so calling
    this twice on the same raw frame gives the same result instead of
    turning the already-mapped column into NaN.

    Args:
        df: DataFrame containing ``Gender`` and ``Geography`` columns.

    Returns:
        A new DataFrame with the encoded columns.

    Raises:
        KeyError: if ``Gender`` or ``Geography`` is missing.
    """
    gender_codes = df["Gender"].map({"Male": 0, "Female": 1})

    # Labels outside the mapping become NaN; surface that instead of
    # letting it pass silently into the processed data.
    unmapped = gender_codes.isna() & df["Gender"].notna()
    if unmapped.any():
        logging.warning(
            "Gender has %d value(s) outside {'Male', 'Female'}; encoded as NaN",
            int(unmapped.sum()),
        )

    # assign() builds a new frame rather than writing into the caller's.
    encoded = df.assign(Gender=gender_codes)
    return pd.get_dummies(
        encoded, columns=["Geography"], prefix="Geography", drop_first=False
    )

def handle_outliers(df, num_cols=None, factor=1.5):
    """Cap outliers in numeric columns at their IQR fences.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, computed from that column's own values; anything
    outside is replaced by the nearest fence. Rows are never dropped and
    the input frame is not modified.

    Args:
        df: DataFrame containing the columns to cap.
        num_cols: Columns to process. Defaults to CreditScore, Age, Tenure,
            Balance and NumOfProducts.
        factor: IQR multiplier for the fences (1.5 by default).

    Returns:
        A new DataFrame in which the processed columns are float-typed and
        capped; all other columns are unchanged.

    Raises:
        KeyError: if a requested column is missing from *df*.
    """
    if num_cols is None:
        num_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts']

    capped = df.copy()
    for col in num_cols:
        q1 = capped[col].quantile(0.25)
        q3 = capped[col].quantile(0.75)
        margin = factor * (q3 - q1)
        # Cast to float first so the column dtype does not depend on whether
        # any value was actually clipped (the fences are floats).
        capped[col] = capped[col].astype(float).clip(
            lower=q1 - margin, upper=q3 + margin
        )

    return capped

def _upload_local_file(local_path, s3_key):
    """Upload the file at *local_path* to ``S3_BUCKET`` under *s3_key*."""
    with open(local_path, "rb") as fh:
        s3_client.put_object(Bucket=S3_BUCKET, Key=s3_key, Body=fh)


def _save_figure(draw, title, local_path, figsize):
    """Open a figure, render it with *draw*, title it and save it to disk.

    The figure is closed in ``finally`` so that a failing plot or save does
    not leave an open matplotlib figure behind.
    """
    plt.figure(figsize=figsize)
    try:
        draw()
        plt.title(title)
        plt.savefig(local_path)
    finally:
        plt.close()


def perform_eda(df):
    """Write summary statistics and plots for *df* locally and to S3.

    Produces, all tagged with one shared timestamp:
      * ``eda_report_<ts>.csv`` - the output of ``df.describe()``;
      * histograms (with KDE, 30 bins) of CreditScore, Age, Balance and
        NumOfProducts, and box plots of CreditScore, Age and Balance;
      * ``correlation_heatmap_<ts>.png`` - correlations of the numeric columns.

    Each artifact is saved under ``LOCAL_EDA_DIR`` and then uploaded to
    ``S3_BUCKET`` under ``EDA_PREFIX``. File names do not identify which
    dataset they describe; only the timestamp distinguishes successive runs.

    Args:
        df: DataFrame containing at least the plotted columns.

    Raises:
        KeyError: if one of the plotted columns is missing from *df*.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # ---- Statistical summary ----
    report_name = f"eda_report_{timestamp}.csv"
    report_file = os.path.join(LOCAL_EDA_DIR, report_name)
    stats = df.describe()
    stats.to_csv(report_file, index=True)
    logging.info(f"✅ EDA report saved locally: {report_file}")

    # Upload the file that was just written instead of serializing twice.
    s3_key = f"{EDA_PREFIX}{report_name}"
    _upload_local_file(report_file, s3_key)
    logging.info(f"✅ EDA report uploaded to S3: {s3_key}")

    # ---- Per-column plots ----
    plot_specs = [
        ("Distribution of Credit Score", "CreditScore", "hist"),
        ("Distribution of Age", "Age", "hist"),
        ("Distribution of Balance", "Balance", "hist"),
        ("Distribution of NumOfProducts", "NumOfProducts", "hist"),
        ("Boxplot of Credit Score", "CreditScore", "box"),
        ("Boxplot of Age", "Age", "box"),
        ("Boxplot of Balance", "Balance", "box"),
    ]
    drawers = {
        "hist": lambda series: sns.histplot(series, kde=True, bins=30),
        "box": lambda series: sns.boxplot(x=series),
    }

    for title, column, plot_type in plot_specs:
        file_name = f"{column}_{plot_type}_{timestamp}.png"
        plot_path = os.path.join(LOCAL_EDA_DIR, file_name)
        # The lambda runs inside this iteration, so it sees the current
        # column and plot_type.
        _save_figure(
            lambda: drawers[plot_type](df[column]), title, plot_path, (10, 6)
        )
        logging.info(f"✅ EDA plot saved locally: {plot_path}")

        _upload_local_file(plot_path, f"{EDA_PREFIX}{file_name}")
        logging.info(f"✅ EDA plot uploaded to S3: {EDA_PREFIX}{file_name}")

    # ---- Correlation heatmap (numeric columns only) ----
    corr_matrix = df.select_dtypes(include=[np.number]).corr()
    heatmap_name = f"correlation_heatmap_{timestamp}.png"
    heatmap_path = os.path.join(LOCAL_EDA_DIR, heatmap_name)
    _save_figure(
        lambda: sns.heatmap(
            corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5
        ),
        "Feature Correlation Heatmap",
        heatmap_path,
        (12, 8),
    )
    logging.info(f"✅ Correlation heatmap saved locally: {heatmap_path}")

    _upload_local_file(heatmap_path, f"{EDA_PREFIX}{heatmap_name}")
    logging.info(f"✅ Correlation heatmap uploaded to S3: {EDA_PREFIX}{heatmap_name}")

def save_and_upload(df, filename_prefix):
    """Persist *df* as a timestamped CSV, locally and in S3.

    The file is named ``<filename_prefix>_<YYYYmmdd_HHMMSS>.csv``. It is
    written (without the index) to ``LOCAL_PROCESSED_DIR`` and the same CSV
    text is uploaded to ``S3_BUCKET`` under ``OUTPUT_PREFIX``.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_name = f"{filename_prefix}_{stamp}.csv"

    # Local copy first.
    local_path = os.path.join(LOCAL_PROCESSED_DIR, csv_name)
    df.to_csv(local_path, index=False)
    logging.info(f"✅ Processed data saved locally: {local_path}")

    # With no target given, to_csv returns the CSV text for the S3 upload.
    s3_key = f"{OUTPUT_PREFIX}{csv_name}"
    csv_text = df.to_csv(index=False)
    s3_client.put_object(Bucket=S3_BUCKET, Key=s3_key, Body=csv_text)
    logging.info(f"✅ Processed data uploaded to S3: {s3_key}")

def _process_dataset(file_key, label, output_name):
    """Download, profile, transform and publish one dataset.

    Returns:
        bool: ``True`` when the dataset was processed, ``False`` when the
        download failed (the reason has already been logged).
    """
    logging.info(f"📂 Found latest {label} file: {file_key}")
    df = download_s3_file(file_key)
    if df is None:
        # download_s3_file returns None on failure; stop here rather than
        # crash later inside the EDA step.
        logging.error(f"❌ Could not load {label} data from {file_key}.")
        return False

    perform_eda(df)  # Run EDA before encoding
    df = encode_categorical(df)
    df = handle_outliers(df)
    save_and_upload(df, output_name)
    return True


def main():
    """Run the full preparation pipeline for the train and API datasets.

    The latest train and API files are located in S3 and processed in that
    order (EDA, categorical encoding, outlier capping, save and upload).
    The run stops at the first dataset that is missing or cannot be
    downloaded, so the API data is not processed when the train data is
    unavailable.
    """
    # The root logger defaults to WARNING, which would hide every progress
    # message below. basicConfig does nothing if handlers already exist.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    train_file, api_file = get_latest_s3_files()
    datasets = (
        ("train", train_file, "prepared_train_data"),
        ("API", api_file, "prepared_api_data"),
    )

    for label, file_key, output_name in datasets:
        if not file_key:
            logging.error(f"❌ No {label} data found.")
            return
        if not _process_dataset(file_key, label, output_name):
            return
    
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
