### Code Overview: Data Validation Pipeline

#### Key Tasks

- **AWS S3 Configuration**
- **Fetching Latest S3 Files (`get_latest_s3_files`)**
- **Downloading Data from S3 (`download_s3_file`)**
- **Validating Data (`validate_data`)**
  - **Missing Values Check**
  - **Data Type Validation**
  - **Duplicate Rows Detection**
  - **Outlier Detection**
    - **Credit Score Check**
    - **Age Range Check**
    - **Tenure Range Check**
    - **Balance Check**
    - **Number of Products Check**
- **Generating Validation Report (`generate_vertical_report`)**
- **Main Execution (`main` function)**
- **Logging & Error Handling**


In [5]:
# import boto3

# s3_client = boto3.client("s3")
# bucket_name = "dmml-bank-churn-data"

# try:
#     response = s3_client.list_objects_v2(Bucket=bucket_name)
#     if "Contents" in response:
#         print("✅ Successfully listed files!")
#         for obj in response["Contents"]:
#             print(obj["Key"])
#     else:
#         print("⚠ No files found.")
# except Exception as e:
#     print(f"❌ Error: {e}")


In [4]:
# Data validation pipeline: fetch the latest train/API CSVs from S3, validate them, and save a report.
import os
import logging
import pandas as pd
import boto3
from io import StringIO
from datetime import datetime

# AWS S3 Configuration
# Bucket used for both the raw input CSVs and the uploaded validation reports.
S3_BUCKET = "dmml-bank-churn-data"
# Key prefix that is listed when looking for the latest train/API files.
S3_PREFIX = "raw_data/"
# Key prefix under which validation reports are uploaded.
REPORT_PREFIX = "reports/"
# Local directory that receives a copy of every report.
# NOTE(review): machine-specific absolute path; presumably needs adjusting on other hosts.
LOCAL_REPORT_DIR = "C:/Users/LENOVO/Documents/Study/M.Tech Data Science Pilani/Sem-2/Data Management for Machine Learning-ZG529/Assignment-1/Files/logs"

# Single client shared by every S3 call in this module; created at import time.
s3_client = boto3.client("s3")

def get_latest_s3_files():
    """Return the keys of the most recently modified train and API files.

    Every object under ``S3_PREFIX`` in ``S3_BUCKET`` is listed; for each
    kind the newest object (by ``LastModified``) whose key contains
    ``"train"`` or ``"api"`` (case-insensitive) is selected.

    Returns:
        tuple: ``(train_key, api_key)``. An element is ``None`` when no
        matching object exists; both are ``None`` when nothing is listed.
    """
    # One list_objects_v2 response holds at most 1000 keys, so walk every
    # page; otherwise the newest file can be missed in a large prefix.
    paginator = s3_client.get_paginator("list_objects_v2")
    objects = []
    for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_PREFIX):
        objects.extend(page.get("Contents", []))

    if not objects:
        return None, None

    # Newest first, so the first key that matches a marker is the latest one.
    objects.sort(key=lambda obj: obj["LastModified"], reverse=True)

    def newest_key_containing(marker):
        return next(
            (obj["Key"] for obj in objects if marker in obj["Key"].lower()),
            None,
        )

    return newest_key_containing("train"), newest_key_containing("api")

def download_s3_file(file_key):
    """Read the CSV stored under *file_key* in S3 into a DataFrame.

    Any failure while fetching or parsing is logged and reported to the
    caller as ``None`` instead of being raised.
    """
    try:
        body = s3_client.get_object(Bucket=S3_BUCKET, Key=file_key)["Body"]
        return pd.read_csv(body)
    except Exception as exc:
        logging.error(f"Error downloading {file_key}: {exc}")
    return None

def validate_data(df, is_train=True):
    """Run the validation checks on *df* and return the findings.

    Args:
        df: DataFrame with the records to validate.
        is_train: When true, the ``Exited`` column is type-checked as well.

    Returns:
        dict: Four categories, each mapping a feature/check name to a result:

        * ``"Missing Values"`` - null count per column.
        * ``"Invalid Data Types"`` - actual dtype of every expected column
          that is present but does not have the expected dtype.
        * ``"Duplicate Rows"`` - ``{"Count": <number of duplicated rows>}``.
        * ``"Outliers"`` - number of out-of-range values per range check, or
          ``"Column missing"`` when the checked column is absent.
    """
    report = {}

    # Missing values
    report["Missing Values"] = df.isnull().sum().to_dict()

    # Expected column data types
    expected_types = {
        "CreditScore": "int64",
        "Age": "int64",
        "Tenure": "int64",
        "Balance": "float64",
        "NumOfProducts": "int64",
        "HasCrCard": "int64",
        "IsActiveMember": "int64",
        "EstimatedSalary": "float64",
    }
    if is_train:
        expected_types["Exited"] = "int64"

    # Invalid data types (only columns that are actually present)
    report["Invalid Data Types"] = {
        col: str(df[col].dtype)
        for col, expected in expected_types.items()
        if col in df and str(df[col].dtype) != expected
    }

    # Duplicate rows (plain int so the value serialises like the other counts)
    report["Duplicate Rows"] = {"Count": int(df.duplicated().sum())}

    # Outliers: (report label, column, lowest valid value, highest valid value)
    outlier_rules = (
        ("CreditScore (<300 or >850)", "CreditScore", 300, 850),
        ("Age (<18 or >100)", "Age", 18, 100),
        ("Tenure (<0 or >10)", "Tenure", 0, 10),
        ("Balance (<0)", "Balance", 0, None),
        ("NumOfProducts (<1 or >4)", "NumOfProducts", 1, 4),
    )
    report["Outliers"] = {
        label: _count_out_of_range(df, column, low, high)
        for label, column, low, high in outlier_rules
    }

    return report


def _count_out_of_range(df, column, low, high):
    """Count values of *column* below *low* or above *high* (``None`` = no upper bound)."""
    # A file can lack a column entirely; report that instead of raising
    # KeyError and losing every other check.
    if column not in df:
        return "Column missing"

    # Columns read as object dtype cannot be compared with numbers. Coerce
    # them: unparseable entries become NaN and are not counted here (the dtype
    # mismatch itself is reported under "Invalid Data Types").
    values = pd.to_numeric(df[column], errors="coerce")
    out_of_range = values < low
    if high is not None:
        out_of_range = out_of_range | (values > high)
    return int(out_of_range.sum())

def generate_vertical_report(train_report, api_report):
    """Build the side-by-side validation report and save it to S3 and to disk.

    Each row of the report is ``Category, Feature, Train Data, API Data``.
    A result that one of the two reports does not contain is written as
    ``"N/A"``.

    Args:
        train_report: Mapping of category -> {feature: result} for the train
            data; may be empty.
        api_report: Same structure for the API data; may be empty.

    The CSV is uploaded to ``S3_BUCKET`` under ``REPORT_PREFIX`` and also
    written to ``LOCAL_REPORT_DIR``, both with a timestamped file name.
    """
    structured_report = []

    # Walk the ordered union of categories and features (train order first),
    # so API-only findings are kept and an empty train report still yields rows.
    for category in dict.fromkeys([*train_report, *api_report]):
        train_values = train_report.get(category, {})
        api_values = api_report.get(category, {})

        for feature in dict.fromkeys([*train_values, *api_values]):
            structured_report.append([
                category,
                feature,
                train_values.get(feature, "N/A"),
                api_values.get(feature, "N/A"),
            ])

    report_df = pd.DataFrame(structured_report, columns=["Category", "Feature", "Train Data", "API Data"])

    # Append timestamp for versioning
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_filename = f"validation_report_{timestamp}.csv"
    report_path = f"{REPORT_PREFIX}{report_filename}"
    local_report_path = os.path.join(LOCAL_REPORT_DIR, report_filename)

    # Save to S3
    csv_buffer = StringIO()
    report_df.to_csv(csv_buffer, index=False)
    s3_client.put_object(Bucket=S3_BUCKET, Key=report_path, Body=csv_buffer.getvalue())
    logging.info(f"✅ Validation report saved to S3: {report_path}")

    # Save locally; create the target directory first so a fresh machine
    # does not fail after the S3 upload already succeeded.
    os.makedirs(LOCAL_REPORT_DIR, exist_ok=True)
    report_df.to_csv(local_report_path, index=False)
    logging.info(f"✅ Validation report saved locally: {local_report_path}")

def main():
    """Validate the latest train and API files and publish the report.

    Looks up the newest train/API keys, validates whichever of them can be
    downloaded, and generates the combined report when at least one
    validation produced results; otherwise an error is logged.
    """
    # The root logger defaults to WARNING, which silently drops the info-level
    # progress messages. basicConfig does nothing if handlers already exist.
    logging.basicConfig(level=logging.INFO)

    train_file, api_file = get_latest_s3_files()

    train_report = _validate_latest(train_file, "train", is_train=True)
    api_report = _validate_latest(api_file, "API", is_train=False)

    if train_report or api_report:
        generate_vertical_report(train_report, api_report)
    else:
        logging.error("❌ No valid data found for validation.")


def _validate_latest(file_key, label, is_train):
    """Download and validate one file; return ``{}`` when it is absent or unreadable."""
    if not file_key:
        return {}

    logging.info(f"📂 Found latest {label} file: {file_key}")
    df = download_s3_file(file_key)
    if df is None:
        return {}
    return validate_data(df, is_train=is_train)

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
