In [None]:
###### Model Deployment in Sagemaker #########
##############################################
import boto3
from sagemaker.model import Model
from sagemaker import get_execution_role, Session 

role = get_execution_role()
session = Session()

ecr_image = '491085388405.dkr.ecr.us-east-1.amazonaws.com/fraud-ecr-28:latest'

model = Model(
    image_uri=ecr_image,
    role=role,
    sagemaker_session=session
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name='fraud-ml-endpoint'
)

----!

In [None]:
######## Inference using one row of Sample Data #########
#########################################################
import boto3
import json

# Create SageMaker runtime client
client = boto3.client('sagemaker-runtime', region_name='us-east-1')  # update region if needed

# Sample input — should match model's expected feature shape and order
payload = {
    "inputs": [[27.0, 1.0, 500987.0, 820870.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0]]
}

# Invoke the deployed endpoint
response = client.invoke_endpoint(
    EndpointName='fraud-ml-endpoint',  # Make sure this matches exactly
    Body=json.dumps(payload),
    ContentType='application/json'
)

# Read and decode the prediction
result = response['Body'].read().decode('utf-8')
print("Prediction:", result)


Prediction: {"predictions":[0]}



In [4]:
!pip install psycopg2-binary boto3 sqlalchemy 



In [5]:
!pip install supabase

Collecting supabase
  Downloading supabase-2.17.0-py3-none-any.whl.metadata (11 kB)
Collecting gotrue==2.12.3 (from supabase)
  Downloading gotrue-2.12.3-py3-none-any.whl.metadata (6.5 kB)
Collecting httpx<0.29,>=0.26 (from supabase)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting postgrest==1.1.1 (from supabase)
  Downloading postgrest-1.1.1-py3-none-any.whl.metadata (3.5 kB)
Collecting realtime==2.6.0 (from supabase)
  Downloading realtime-2.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting storage3==0.12.0 (from supabase)
  Downloading storage3-0.12.0-py3-none-any.whl.metadata (1.9 kB)
Collecting supafunc==0.10.1 (from supabase)
  Downloading supafunc-0.10.1-py3-none-any.whl.metadata (1.2 kB)
Collecting deprecation<3.0.0,>=2.1.0 (from postgrest==1.1.1->supabase)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting strenum<0.5.0,>=0.4.9 (from postgrest==1.1.1->supabase)
  Downloading StrEnum-0.4.15-py3-none-any.whl.metadata (5.3 

In [6]:
###### Fetching the Data from Supabase and storing in S3 ########
#################################################################
# print("Supabase raw response:", response)
# print("Supabase data:", response.data)


# print("DataFrame shape:", df.shape)
# print("DataFrame head:\n", df.head())


from supabase import create_client, Client
import pandas as pd
import boto3

# Supabase config
SUPABASE_URL = "https://mcgzvjzuqnjstptfrsuj.supabase.co"
SUPABASE_KEY = "sb_publishable_4vx6B3HjwnEQVwI9SgrJkQ_S32tu5l5"

# Use the service_role key, not anon

# Create Supabase client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

# Fetch data
response = supabase.table("loan_fraud_analytics").select("*").execute()
df = pd.DataFrame(response.data)

# Save as CSV
csv_path = "/tmp/loan_fraud_data.csv"
df.to_csv(csv_path, index=False)

# Upload to S3
s3 = boto3.client("s3")
bucket_name = "manas-bucket100"
object_key = "inputfile/loan_fraud_data.csv"

s3.upload_file(csv_path, bucket_name, object_key)
print("✅ Upload to S3 complete")


✅ Upload to S3 complete


In [7]:
######## Dropping the target variable ########
##############################################
import boto3
import io

# Drop target column
df_features = df.drop(columns=['loan_default'])

# Convert to CSV buffer
csv_buffer = io.StringIO()
df_features.to_csv(csv_buffer, index=False)

# Upload to S3
s3 = boto3.client('s3')
bucket_name = 'manas-bucket100'  # 🔁 Your bucket name
object_key = 'inputfile/loan_fraud_data.csv'

s3.put_object(Bucket=bucket_name, Key=object_key, Body=csv_buffer.getvalue())

print(f"✅ Features uploaded to s3://{bucket_name}/{object_key}")


✅ Features uploaded to s3://manas-bucket100/inputfile/loan_fraud_data.csv


In [8]:
####### Code for Data processing ########
import pandas as pd
import numpy as np
import os
import boto3
from io import StringIO
bucket_name = "manas-bucket100"
s3_key = "inputfile/loan_fraud_data.csv"

# Step 1: Define S3 download
def load_csv_from_s3(bucket_name, file_key):
    s3 = boto3.client('s3')
    response = s3.get_object(Bucket=bucket_name, Key=file_key)
    content = response['Body'].read().decode('utf-8')
    df = pd.read_csv(StringIO(content))
    print(f"✅ Loaded data from s3://{bucket_name}/{file_key}")
    return df

# Step 2: Handle missing values
def handle_missing_values(df):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            df[col].fillna(df[col].median(), inplace=True)
    return df

# Step 3: Handle outliers
def handle_outliers(df):
    df = df.copy()
    for col in df.select_dtypes(include=np.number).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df[col] = np.where(df[col] < lower, lower,
                  np.where(df[col] > upper, upper, df[col]))
    return df

# Step 4: Create dummies
def create_dummies(df):
    df = pd.get_dummies(df, drop_first=True, dtype=int)
    return df

# Step 5: Drop low-variance and ID-like columns
def drop_low_variance_and_id_columns(df, threshold=0.95):
    df = df.copy()
    drop_cols = []

    for col in df.columns:
        if df[col].nunique() <= 1:
            drop_cols.append(col)
        else:
            top_freq_ratio = df[col].value_counts(normalize=True).values[0]
            if top_freq_ratio >= threshold:
                drop_cols.append(col)

    id_like_cols = [col for col in df.columns if col.lower() == 'id'
                    or col.lower().startswith('id')
                    or col.lower().endswith('id')
                    or '_id' in col.lower()
                    or 'id_' in col.lower()]
    
    drop_cols = list(set(drop_cols + id_like_cols))

    if drop_cols:
        print(f"🧹 Dropping low-variance/id-like columns: {drop_cols}")
        df.drop(columns=drop_cols, inplace=True)

    return df

# Step 6: Run all preprocessing
def preprocess_data(df):
    df = drop_low_variance_and_id_columns(df, threshold=0.95)
    df = handle_missing_values(df)
    df = handle_outliers(df)
    df = create_dummies(df)
    return df

# MAIN execution inside SageMaker
def main():
    bucket_name = 'manas-bucket100'
    file_key = 'inputfile/loan_fraud_data.csv'  # full path in S3 bucket

    df = load_csv_from_s3(bucket_name, file_key)
    print(f"📥 Original shape: {df.shape}")

    df = preprocess_data(df)
    print(f"📊 Processed shape: {df.shape}")

    # Save to local file or upload to S3 if needed
    processed_file = "processed_test_data.csv"
    df.to_csv(processed_file, index=False)
    print(f"✅ Preprocessed test data saved: {processed_file}")

    return df

# Run if in script mode
if __name__ == "__main__":
    main()


✅ Loaded data from s3://manas-bucket100/inputfile/loan_fraud_data.csv
📥 Original shape: (1000, 9)
🧹 Dropping low-variance/id-like columns: ['loan_id', 'last_delinq_none']
📊 Processed shape: (1000, 10)
✅ Preprocessed test data saved: processed_test_data.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [10]:
######### Prediction with Processed bulk Data #########
#######################################################
import boto3
import pandas as pd
import json

# Load your CSV
df = pd.read_csv('processed_test_data.csv')  # Each row should have feature values

# Convert the data to list of lists (assuming features are in columns)
data = df.values.tolist()

# Initialize SageMaker client
client = boto3.client('sagemaker-runtime')

# Predict in a loop (or you can do it in batches)
predictions = []
for row in data:
    payload = json.dumps({'inputs': [row]})
    response = client.invoke_endpoint(
        EndpointName='fraud-ml-endpoint',
        Body=payload,
        ContentType='application/json'
    )
    result = json.loads(response['Body'].read().decode()) 
    predictions.append(result)

# Save predictions to CSV
df['prediction_fraudulent'] = predictions
df.to_csv('predictions_fraudulent.csv', index=False)


In [11]:
####### Fetching the inference log from Sagemaker endpoint and saving it in S3 and locally ########
###################################################################################################
import boto3
import pandas as pd 
import json
import uuid
from datetime import datetime

# --- Config ---
endpoint_name = 'fraud-ml-endpoint'
s3_bucket = 'manas-bucket100'
s3_key = f'logs/inference_logs_{datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")}.csv'

# --- Load data ---
df = pd.read_csv('processed_test_data.csv')  # Each row should have feature values
data = df.values.tolist()
columns = df.columns.tolist()

# --- Initialize clients ---
client = boto3.client('sagemaker-runtime')
s3 = boto3.client('s3')

# --- Prediction + Logging ---
log_rows = []
predictions = []

for row in data:
    payload = json.dumps({'inputs': [row]})
    try:
        response = client.invoke_endpoint(
            EndpointName=endpoint_name,
            Body=payload,
            ContentType='application/json'
        )
        result = json.loads(response['Body'].read().decode())
        prediction = result["predictions"][0] if "predictions" in result else result
    except Exception as e:
        prediction = None
        print(f"Prediction error: {e}")

    log_entry = {
        'uuid': str(uuid.uuid4()),
        'timestamp': datetime.utcnow().isoformat(),
        **{f: v for f, v in zip(columns, row)},
        'prediction': prediction
    }

    log_rows.append(log_entry)
    predictions.append(prediction)

# --- Save to CSV ---
log_df = pd.DataFrame(log_rows)
log_df.to_csv('inference_logs.csv', index=False)
print("✅ Inference logs saved locally as 'inference_logs.csv'")

# --- Upload to S3 ---
with open("inference_logs.csv", "rb") as f:
    s3.upload_fileobj(f, Bucket=s3_bucket, Key=s3_key)
    print(f"✅ Log file uploaded to s3://{s3_bucket}/{s3_key}")


✅ Inference logs saved locally as 'inference_logs.csv'
✅ Log file uploaded to s3://manas-bucket100/logs/inference_logs_2025-07-28_08-07-58.csv


In [19]:
#######  Drift detection Analysis ##### Drift analysis and charts ######
########################################################################
import pandas as pd
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import os
import numpy as np

# --- Load Data ---
reference_df = pd.read_csv("processed_test_data.csv")
inference_df = pd.read_csv("inference_logs.csv")

# --- Feature Columns ---
feature_columns = [col for col in reference_df.columns if col in inference_df.columns and col not in ['prediction', 'timestamp', 'uuid']]

# --- Create output folder ---
os.makedirs("drift_plots", exist_ok=True)

# --- Drift Detection ---
drift_results = []

for col in feature_columns:
    try:
        ref_values = reference_df[col].dropna()
        inf_values = inference_df[col].dropna()

        # Ensure numeric and non-empty
        if ref_values.empty or inf_values.empty or not np.issubdtype(ref_values.dtype, np.number) or not np.issubdtype(inf_values.dtype, np.number):
            print(f"⚠️ Skipping '{col}' (non-numeric or empty)")
            continue

        # KS-Test
        statistic, p_value = ks_2samp(ref_values, inf_values)
        drift_results.append({
            'feature': col,
            'ks_statistic': statistic,
            'p_value': p_value,
            'drift_detected': p_value < 0.05
        })

        # Plot (side-by-side, independent y-scale)
        fig, axs = plt.subplots(1, 2, figsize=(10, 4), sharey=False)

        axs[0].hist(ref_values, bins=30, color='skyblue', edgecolor='black')
        axs[0].set_title(f'Reference: {col}')
        axs[0].set_xlabel(col)
        axs[0].set_ylabel('Frequency')

        axs[1].hist(inf_values, bins=30, color='salmon', edgecolor='black')
        axs[1].set_title(f'Inference: {col}')
        axs[1].set_xlabel(col)
        axs[1].set_ylabel('Frequency')

        plt.suptitle(f'Distribution Comparison - {col}')
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f'drift_plots/drift_plot_{col}.png')
        plt.close()

    except Exception as e:
        print(f"❌ Error processing feature '{col}': {str(e)}")

# --- Save Report ---
drift_df = pd.DataFrame(drift_results)
drift_df.to_csv('drift_detection_report.csv', index=False)

print("\n✅ Drift detection complete.")
print("📄 Report: drift_detection_report.csv")
print("📊 Plots: saved in 'drift_plots/' folder")



✅ Drift detection complete.
📄 Report: drift_detection_report.csv
📊 Plots: saved in 'drift_plots/' folder


In [20]:
#######  Drift detection Analysis ##### Drift analysis and charts ###### Saving Report in S3 ##### 
############################## Email Notification ##########################################
import pandas as pd
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import os
import numpy as np
import boto3
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication

# --- Configuration ---
BUCKET_NAME = "manas-bucket100"
DRIFT_FOLDER = "drift_reports/"
EMAIL_SENDER = "mmohanty335@gmail.com"
EMAIL_RECEIVER = "mother.manas15@gmail.com"
EMAIL_PASSWORD = "dawl cfoq onpw lpec"
SMTP_SERVER = "smtp.gmail.com"
SMTP_PORT = 587

# --- Load Data ---
reference_df = pd.read_csv("processed_test_data.csv")
inference_df = pd.read_csv("inference_logs.csv")

# --- Feature Columns ---
feature_columns = [col for col in reference_df.columns if col in inference_df.columns and col not in ['prediction', 'timestamp', 'uuid']]

# --- Create output folder ---
os.makedirs("drift_plots", exist_ok=True)

# --- Drift Detection ---
drift_results = []

for col in feature_columns:
    try:
        ref_values = reference_df[col].dropna()
        inf_values = inference_df[col].dropna()

        if ref_values.empty or inf_values.empty or not np.issubdtype(ref_values.dtype, np.number) or not np.issubdtype(inf_values.dtype, np.number):
            continue

        statistic, p_value = ks_2samp(ref_values, inf_values)
        drift_results.append({
            'feature': col,
            'ks_statistic': statistic,
            'p_value': p_value,
            'drift_detected': p_value < 0.05
        })

        fig, axs = plt.subplots(1, 2, figsize=(10, 4), sharey=False)

        axs[0].hist(ref_values, bins=30, color='skyblue', edgecolor='black')
        axs[0].set_title(f'Reference: {col}')
        axs[0].set_xlabel(col)
        axs[0].set_ylabel('Frequency')

        axs[1].hist(inf_values, bins=30, color='salmon', edgecolor='black')
        axs[1].set_title(f'Inference: {col}')
        axs[1].set_xlabel(col)
        axs[1].set_ylabel('Frequency')

        plt.suptitle(f'Distribution Comparison - {col}')
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plot_path = f'drift_plots/drift_plot_{col}.png'
        plt.savefig(plot_path)
        plt.close()

    except Exception as e:
        print(f"❌ Error on {col}: {str(e)}")

# --- Save Report Locally ---
drift_df = pd.DataFrame(drift_results)
report_path = "drift_detection_report.csv"
drift_df.to_csv(report_path, index=False)

# --- Upload to S3 ---
s3 = boto3.client('s3')

# Upload report
s3.upload_file(report_path, BUCKET_NAME, f"{DRIFT_FOLDER}drift_detection_report.csv")

# Upload plots
for file in os.listdir("drift_plots"):
    if file.endswith(".png"):
        s3.upload_file(f"drift_plots/{file}", BUCKET_NAME, f"{DRIFT_FOLDER}plots/{file}")

print("✅ Drift report and plots uploaded to S3")

# --- Email Notification ---
def send_email(subject, body):
    msg = MIMEMultipart()
    msg['From'] = EMAIL_SENDER
    msg['To'] = EMAIL_RECEIVER
    msg['Subject'] = subject

    msg.attach(MIMEText(body, 'plain'))

    with smtplib.SMTP(SMTP_SERVER, SMTP_PORT) as server:
        server.starttls()
        server.login(EMAIL_SENDER, EMAIL_PASSWORD)
        server.send_message(msg)

# Determine message
if drift_df['drift_detected'].any():
    subject = "⚠️ Drift Detected in Model Monitoring"
    body = "⚠️ Attention - Drift detected. Please return to Action mode!"
else:
    subject = "✅ No Drift Detected"
    body = "✅ No drift detected. Just chill 😎."

send_email(subject, body)
print(f"📧 Email sent: {subject}")


✅ Drift report and plots uploaded to S3
📧 Email sent: ✅ No Drift Detected


In [21]:
#######  Drift detection Analysis ##### Drift analysis and charts ###### Saving Report in S3 ##### 
############################## Email Notification with attached Report ############################

import pandas as pd
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import os
import numpy as np
import boto3
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders

# --------------------
# CONFIGURATION
# --------------------
# AWS S3
S3_BUCKET_NAME = 'manas-bucket100'
S3_REPORT_PATH = 'drift_reports/drift_detection_report.csv'
S3_PLOTS_DIR = 'drift_reports/plots/'

# Email
EMAIL_SENDER = 'mmohanty335@gmail.com'
EMAIL_RECEIVER = 'mother.manas15@gmail.com'
EMAIL_PASSWORD = 'dawl cfoq onpw lpec'  # e.g., Gmail App Password
SMTP_SERVER = 'smtp.gmail.com'
SMTP_PORT = 587

# --------------------
# LOAD DATA
# --------------------
reference_df = pd.read_csv("processed_test_data.csv")
inference_df = pd.read_csv("inference_logs.csv")
feature_columns = [col for col in reference_df.columns if col in inference_df.columns and col not in ['prediction', 'timestamp', 'uuid']]

os.makedirs("drift_plots", exist_ok=True)

# --------------------
# DRIFT DETECTION
# --------------------
drift_results = []
for col in feature_columns:
    try:
        ref_values = reference_df[col].dropna()
        inf_values = inference_df[col].dropna()

        if ref_values.empty or inf_values.empty or not np.issubdtype(ref_values.dtype, np.number) or not np.issubdtype(inf_values.dtype, np.number):
            print(f"⚠️ Skipping '{col}' (non-numeric or empty)")
            continue

        statistic, p_value = ks_2samp(ref_values, inf_values)
        drift_results.append({
            'feature': col,
            'ks_statistic': statistic,
            'p_value': p_value,
            'drift_detected': p_value < 0.05
        })

        fig, axs = plt.subplots(1, 2, figsize=(10, 4), sharey=False)
        axs[0].hist(ref_values, bins=30, color='skyblue', edgecolor='black')
        axs[0].set_title(f'Reference: {col}')
        axs[1].hist(inf_values, bins=30, color='salmon', edgecolor='black')
        axs[1].set_title(f'Inference: {col}')
        plt.suptitle(f'Distribution Comparison - {col}')
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        local_plot_path = f'drift_plots/drift_plot_{col}.png'
        plt.savefig(local_plot_path)
        plt.close()

    except Exception as e:
        print(f"❌ Error processing feature '{col}': {str(e)}")

# --------------------
# SAVE REPORT LOCALLY
# --------------------
drift_df = pd.DataFrame(drift_results)
drift_report_path = 'drift_detection_report.csv'
drift_df.to_csv(drift_report_path, index=False)

# --------------------
# UPLOAD TO S3
# --------------------
s3 = boto3.client('s3')

# Upload report
s3.upload_file(drift_report_path, S3_BUCKET_NAME, S3_REPORT_PATH)

# Upload plots
for filename in os.listdir("drift_plots"):
    if filename.endswith(".png"):
        local_path = os.path.join("drift_plots", filename)
        s3_key = f"{S3_PLOTS_DIR}{filename}"
        s3.upload_file(local_path, S3_BUCKET_NAME, s3_key)

print("✅ Drift report and plots uploaded to S3")

# --------------------
# SEND EMAIL
# --------------------
def send_email_with_attachment(subject, body, attachment_path):
    msg = MIMEMultipart()
    msg['From'] = EMAIL_SENDER
    msg['To'] = EMAIL_RECEIVER
    msg['Subject'] = subject

    msg.attach(MIMEText(body, 'plain'))

    # Attach CSV
    part = MIMEBase('application', 'octet-stream')
    with open(attachment_path, 'rb') as f:
        part.set_payload(f.read())
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', f'attachment; filename={os.path.basename(attachment_path)}')
    msg.attach(part)

    with smtplib.SMTP(SMTP_SERVER, SMTP_PORT) as server:
        server.starttls()
        server.login(EMAIL_SENDER, EMAIL_PASSWORD)
        server.send_message(msg)

# Email logic
if drift_df['drift_detected'].any():
    subject = "⚠️ Drift Detected in Model Monitoring"
    body = "⚠️ Attention — Drift detected. Please return to Action mode!"
else:
    subject = "✅ All Clear - No Drift Detected"
    body = "🎉 Hooray! No drift detected — just chill 😎"

send_email_with_attachment(subject, body, drift_report_path)
print("📧 Email sent with drift report attached.")


✅ Drift report and plots uploaded to S3
📧 Email sent with drift report attached.


In [None]:
#######  Mixed Drift Detection Method ########
##############################################
from scipy.stats import ks_2samp, chi2_contingency 
import numpy as np
import pandas as pd

def detect_drift_numerical(ref_series, new_series):
    stat, p_value = ks_2samp(ref_series, new_series)
    return p_value < 0.05

def detect_drift_categorical(ref_series, new_series):
    ref_counts = ref_series.value_counts().sort_index()
    new_counts = new_series.value_counts().sort_index()
    # Align indexes
    all_categories = sorted(set(ref_counts.index).union(set(new_counts.index)))
    ref_freq = [ref_counts.get(cat, 0) for cat in all_categories]
    new_freq = [new_counts.get(cat, 0) for cat in all_categories]
    stat, p, _, _ = chi2_contingency([ref_freq, new_freq])
    return p < 0.05

def detect_drift_all(ref_df, new_df, numerical_cols, categorical_cols, ordinal_cols): 
    drift_results = {}
    
    for col in numerical_cols:
        drift_results[col] = detect_drift_numerical(ref_df[col], new_df[col])
    
    for col in categorical_cols:
        drift_results[col] = detect_drift_categorical(ref_df[col], new_df[col])
    
    for col in ordinal_cols:
        # Treat ordinal as numeric (simple alternative)
        drift_results[col] = detect_drift_numerical(ref_df[col], new_df[col])
    
    return drift_results
