## Fraud Detection API: Complete Project Documentation
### From Local Development to Production CI/CD Pipeline

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # Interactive plots (optional)
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# EDA & profiling

# Interactive EDA
# import pandasgui
# from pandasgui import show
# show(your_dataframe)  # Opens an interactive GUI

# Auto-EDA
# from dataprep.eda import create_report
# create_report(your_dataframe).show()

# Lightweight Profiling 
import sweetviz as sv
# sv.analyze(your_dataframe).show_html()

# from pandas_profiling import ProfileReport  # Auto-EDA (install: `pip install ydata-profiling`)
import missingno as msno  # Missing data visualization (install: `pip install missingno`)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set(style="darkgrid",font_scale=1.5)
pd.set_option("display.max.columns",None)
pd.set_option("display.max.rows",None)
# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)  # Suppress user warnings
warnings.filterwarnings("ignore", category=FutureWarning)  # Suppress future warnings

# Suppress specific warnings for LGBMClassifier and CatBoostClassifier
import logging
logging.getLogger("catboost").setLevel(logging.ERROR)  # Suppress CatBoost logs
logging.getLogger("lightgbm").setLevel(logging.ERROR)  # Suppress LightGBM logs


In [None]:
# Scaling/normalization
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    LabelEncoder,
    OneHotEncoder,
    PowerTransformer,
    label_binarize
)

# Splitting data
from sklearn.model_selection import train_test_split, StratifiedKFold

# Imputation (handling missing values)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Outlier detection
from sklearn.ensemble import IsolationForest
from scipy import stats  # For Z-score, IQR


In [None]:
# Classic ML
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,  # Faster alternative to CatBoost
    AdaBoostClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    VotingClassifier,
    StackingClassifier
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier  # Works on Python 3.13.2
from lightgbm import LGBMClassifier  # Works on Python 3.13.2
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Neural Networks (optional)
# import tensorflow as tf  # or `pip install tensorflow-cpu`
# from keras.models import Sequential
# from keras.layers import Dense, Dropout

In [None]:
# Metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    mean_squared_error,
    roc_curve
)


In [None]:
# Cross-validation
from sklearn.model_selection import cross_val_score, GridSearchCV

# Calibration
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

# Parallel processing
from joblib import Parallel, delayed


from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

# Warnings (to clean output)
import warnings
warnings.filterwarnings("ignore")

# Time tracking
import time
from tqdm import tqdm  # Progress bars (install: `pip install tqdm`)

In [None]:
# pd.set_option("display.max_columns", None)
# Machine-specific location of the dataset -- change this to your own path.
FRAUD_CSV_PATH = "C:/Projects/Fraud Transaction Prediction/Fraud.csv"
df = pd.read_csv(FRAUD_CSV_PATH)
df.head(10)

In [None]:
# Cross-validation
from sklearn.model_selection import cross_val_score, GridSearchCV

# Calibration
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

# Parallel processing
from joblib import Parallel, delayed


from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

# Warnings (to clean output)
import warnings
warnings.filterwarnings("ignore")

# Time tracking
import time
from tqdm import tqdm  # Progress bars (install: `pip install tqdm`)

## EDA

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.info()

In [None]:
unique_values = df['step'].unique()
print(unique_values) 

#### Checking for Missing Values

In [None]:
# Count the nulls per column and show them as a one-column table.
missing_per_column = df.isnull().sum()
missing_per_column.to_frame(name="Total No. of Missing Values")

#### Checking for Duplicates 

In [None]:
# Number of rows that are exact copies of an earlier row.
duplicate_rows = df.duplicated().sum()
print("Duplicate Values =", duplicate_rows)

#### Checking for numeric data columns 

In [None]:
numeric_data = df.select_dtypes(include=[np.number])
numeric_data.head()

#### Checking for categorical data columns

In [None]:
# df.select_dtypes('object').columns 
# or you can use below
categorical_data = df.select_dtypes(exclude=[np.number])
categorical_data.head()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation_matrix(df, size=8):
    """
    Draw an annotated heatmap of the pairwise correlations between numeric columns.

    Parameters:
    df : pandas.DataFrame - non-numeric columns are ignored
    size : int - width and height of the (square) figure, in inches
    """
    # Correlations are only defined for the numeric columns
    correlations = df.select_dtypes(include=[np.number]).corr()

    heatmap_options = dict(
        annot=True,       # print the coefficient inside every cell
        fmt=".2f",
        cmap='coolwarm',
        center=0,         # zero correlation sits at the middle of the colour scale
        square=True,
        linewidths=.5,
    )

    plt.figure(figsize=(size, size))
    sns.heatmap(correlations, **heatmap_options)
    plt.title("Correlation Matrix")
    plt.tight_layout()
    plt.show()

##### it looks like we have categorical data that we need to take care of

### further exploration and visualizations

In [None]:
# Use the helper defined above on the loaded dataset. The previous call referenced
# `plotCorrelationMatrix` and `df1`, neither of which exists in this notebook.
plot_correlation_matrix(df, 8)

In [None]:
numeric_data.corr()

In [None]:
# Interactive version of the correlation heatmap (numeric columns only).
corr_matrix = numeric_data.corr()
fig = px.imshow(corr_matrix, text_auto=True, aspect="auto")
fig.show()

In [None]:
df['type'].value_counts(ascending=False)

In [None]:
print(df.columns.tolist())

In [None]:
df.describe()

In [None]:
# One histogram per numeric column. The figure size is given to df.hist itself;
# calling plt.figure() after plotting only opened a second, empty figure.
df.hist(figsize=(15, 10))
plt.suptitle("Data Distribution", fontsize=16)
plt.show()

In [None]:
# Create the figure first so the plot is drawn on it, and save *before* show():
# after show() the current figure is empty, so the saved image would be blank.
plt.figure(figsize=(5, 5))
sns.countplot(x='isFraud', data=df, palette='hls')
plt.title("Imbalanced Fraud vs. Non-Fraud Distribution", fontsize=14)
plt.savefig('count_plot')
plt.show()

In [None]:
# Row count per class, shown as a donut chart.
ratio_df = df[['isFraud', 'amount']].groupby(['isFraud']).count()
pie_options = dict(
    figsize=(5, 5),
    labels=['0: nonfraud', '1: fraud'],
    autopct='%.1f%%',
    startangle=120,
    wedgeprops={'width': 0.75},
    title='Imbalanced Fraud vs. Non-Fraud Distribution',
)
ratio_df.plot.pie(y='amount', **pie_options)

In [None]:
#boxplot
# The figure has to exist before drawing; creating it afterwards left the plot at the
# default size and added an empty extra figure.
boxplot_columns = ["step", "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest"]
plt.figure(figsize=(5, 5))
df.boxplot(column=boxplot_columns)
plt.title("Box Plot Analysis", fontsize=16)
plt.xticks(rotation=90, fontsize=12)
plt.show()

In [None]:
# Association tests between the quantitative variable 'oldbalanceOrg' and the binary target
import scipy.stats as stats
from sklearn.metrics import roc_auc_score

target = df['isFraud']
balance = df['oldbalanceOrg']

# 1. Point-biserial correlation (binary target vs continuous feature)
corr, p_value = stats.pointbiserialr(target, balance)
print(f"Point-Biserial Correlation: {corr:.3f} (p-value: {p_value:.3e})")

# 2. Welch's t-test on the two class means (does not assume equal variances)
fraud_bal = df.loc[target == 1, 'oldbalanceOrg']
nonfraud_bal = df.loc[target == 0, 'oldbalanceOrg']
t_stat, p_val = stats.ttest_ind(fraud_bal, nonfraud_bal, equal_var=False)
mean_gap = fraud_bal.mean() - nonfraud_bal.mean()
print(f"t-test: Mean difference = {mean_gap:.1f} (p-value: {p_val:.3e})")

# 3. AUC when the raw balance is used directly as the fraud score
auc = roc_auc_score(target, balance)
print(f"AUC: {auc:.3f} (0.5 = random, 1 = perfect)")

In [None]:
# One-way ANOVA of the starting balance between the fraud (1) and non-fraud (0) groups.
balance_by_class = [df['oldbalanceOrg'][df['isFraud'] == label] for label in (1, 0)]
A_test = stats.f_oneway(*balance_by_class)
print(A_test)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Starting balance per class, outliers hidden, on explicit axes.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='isFraud', y='oldbalanceOrg', data=df, showfliers=False, ax=box_ax)
box_ax.set_yscale('log')  # Use if data is highly skewed
box_ax.set_title("Distribution of oldbalanceOrg by Fraud Status")
box_ax.set_xlabel("Is Fraud? (0=No, 1=Yes)")
box_ax.set_ylabel("Original Balance (log scale)")
plt.show()

In [None]:
# Compare the starting-balance quartiles (plus the 90th percentile) of both classes
balance_quantiles = [0.25, 0.5, 0.75, 0.9]

print("Fraudulent transactions balance percentiles:")
print(df.loc[df['isFraud'] == 1, 'oldbalanceOrg'].quantile(balance_quantiles))

print("\nNon-fraudulent transactions balance percentiles:")
print(df.loc[df['isFraud'] == 0, 'oldbalanceOrg'].quantile(balance_quantiles))

In [None]:
# Summary statistics of 'amount', including tail percentiles, for each class
amount_percentiles = [.25, .5, .75, .9, .95, .99]
reported_rows = ['min', '25%', '50%', '75%', '90%', '95%', '99%', 'max']

fraud_stats = df.loc[df['isFraud'] == 1, 'amount'].describe(percentiles=amount_percentiles)
nonfraud_stats = df.loc[df['isFraud'] == 0, 'amount'].describe(percentiles=amount_percentiles)

print("Fraudulent Transactions Amount Stats:")
print(fraud_stats[reported_rows])

print("\nNon-Fraudulent Transactions Amount Stats:")
print(nonfraud_stats[reported_rows])

In [None]:
def get_dynamic_thresholds(df, feature, is_fraud, upper_percentile=0.95, lower_percentile=0.05):
    """Summarise one feature for the rows of a single class.

    Rows are selected where ``df['isFraud'] == is_fraud``. Returns a dict with the
    keys 'upper' and 'lower' (the requested quantiles of ``feature``) and 'mean'.
    """
    class_mask = df['isFraud'] == is_fraud
    class_values = df.loc[class_mask, feature]

    thresholds = {}
    thresholds['upper'] = class_values.quantile(upper_percentile)
    thresholds['lower'] = class_values.quantile(lower_percentile)
    thresholds['mean'] = class_values.mean()
    return thresholds

# Example usage: amount thresholds for each class, keyed by a readable label
amount_thresholds = {
    label: get_dynamic_thresholds(df, 'amount', is_fraud=class_flag)
    for label, class_flag in (('fraud', 1), ('nonfraud', 0))
}
print("\nAmount Thresholds:")
print(amount_thresholds)

In [None]:
# Thresholds taken from the non-fraud population (adjust the percentiles as needed):
#   - amount:  95th percentile of non-fraud amounts (from nonfraud_stats above)
#   - balance: 90th percentile of non-fraud *starting balances* (oldbalanceOrg)
# The cell previously repeated this whole computation twice; once is enough.
high_amount_thresh = nonfraud_stats['95%']
balance_change_thresh = df[df['isFraud']==0]['oldbalanceOrg'].quantile(0.9)

# Engineered features with dynamic thresholds
df['high_amount_flag'] = (df['amount'] > high_amount_thresh).astype(int)
# NOTE(review): the amount debited from the origin account is compared against a
# percentile of balance *levels*, not of balance changes -- confirm this is intended.
df['suspicious_balance_change'] = (
    (df['oldbalanceOrg'] - df['newbalanceOrig']) > balance_change_thresh
).astype(int)

# Ratio-based feature with smoothing
df['amount_to_balance_ratio'] = df['amount'] / (df['oldbalanceOrg'] + 1)  # +1 prevents divide-by-zero

# Time-sensitive features (if 'step' is in hours) are kept commented out below.

# # Time features 
# # Feature 1: Hour of day (1-24, where 1 = 00:00-00:59 of any day)
# df['hour_of_day'] = ((df['step'] - 1) % 24) + 1  # Converts to 1-24 range

# # Feature 2: Day of simulation (1-31, since 743 hours ≈ 30.96 days)
# df['day'] = ((df['step'] - 1) // 24) + 1  # 1-based day count

# # Feature 3: Day of week (0=Monday to 6=Sunday)
# df['day_of_week'] = ((df['step'] - 1) // 24) % 7  # 0-based weekday

# # Feature 4: Weekend flag (1 if Saturday/Sunday)
# df['is_weekend'] = ((df['day_of_week'] == 5) | (df['day_of_week'] == 6)).astype(int)

# # Feature 5: Transaction burst (1 if same user transacts again within 1 hour)
# df['txn_burst'] = (df.groupby('nameOrig')['step'].diff() == 1).astype(int)

# # Feature 6: Hours since last transaction (NaN for first txn per user)
# df['hours_since_last_txn'] = df.groupby('nameOrig')['step'].diff()

# # Feature 7: Transactions per user in last 24 hours (rolling window)
# df['txn_count_24h'] = df.groupby('nameOrig')['step'].transform(
#     lambda x: x.rolling(24, min_periods=1).count()
# )

In [None]:
# Check fraud rates in new features
print("\nFraud Rates by Engineered Features:")
print(df.groupby('high_amount_flag')['isFraud'].mean())
print(df.groupby('suspicious_balance_change')['isFraud'].mean())

# Visual confirmation
import matplotlib.pyplot as plt

# Explicit axes are required: DataFrame.boxplot(by=...) opens its own figure when no
# `ax` is passed, so the previous plt.subplot(121) panel stayed empty.
fig, (ax_ratio, ax_hour) = plt.subplots(1, 2, figsize=(12, 4))
df.boxplot(column='amount_to_balance_ratio', by='isFraud', showfliers=False, ax=ax_ratio)
ax_ratio.set_ylim(0, 5)  # Focus on 0-500% range

# The 'hour_of_day' column is only added to df in a later cell, so derive it from
# 'step' here instead of failing with a KeyError.
hour_of_day = (((df['step'] - 1) % 24) + 1).rename('hour_of_day')
df['isFraud'].groupby(hour_of_day).mean().plot(ax=ax_hour)
ax_hour.set_title("Fraud Rate by Hour of Day")
fig.tight_layout()
plt.show()

In [None]:
def engineer_features(df, amount_percentile=0.95, balance_percentile=0.9):
    """Add the engineered fraud features to ``df`` in place and return it.

    The amount and balance cut-offs are quantiles taken over the non-fraud rows
    (``isFraud == 0``). Columns added, in order: amount_to_balance,
    high_amount_flag, balance_change_abs, balance_change_ratio,
    suspicious_withdrawal, large_cashout.
    """
    # Cut-offs come from legitimate transactions only
    legit_rows = df.loc[df['isFraud'] == 0]
    amount_cutoff = legit_rows['amount'].quantile(amount_percentile)
    balance_cutoff = legit_rows['oldbalanceOrg'].quantile(balance_percentile)

    # +1 keeps the ratios finite for accounts that start at a zero balance
    smoothed_balance = df['oldbalanceOrg'] + 1
    amount_ratio = df['amount'] / smoothed_balance
    debited = df['oldbalanceOrg'] - df['newbalanceOrig']

    # Transaction features
    df['amount_to_balance'] = amount_ratio
    df['high_amount_flag'] = df['amount'].gt(amount_cutoff).astype(int)

    # Balance features
    df['balance_change_abs'] = debited
    df['balance_change_ratio'] = debited / smoothed_balance
    is_suspicious = debited.gt(balance_cutoff) & amount_ratio.gt(0.5)
    df['suspicious_withdrawal'] = is_suspicious.astype(int)

    # Type-specific features
    is_large_cashout = df['type'].eq('CASH_OUT') & amount_ratio.gt(0.7)
    df['large_cashout'] = is_large_cashout.astype(int)

    return df

# Apply to your dataframe
df = engineer_features(df)

In [None]:

# Time features derived from 'step'.
# NOTE(review): all of these assume 'step' counts hours starting at 1 -- confirm against the dataset description.
# Feature 1: Hour of day (1-24, where 1 = 00:00-00:59 of any day)
df['hour_of_day'] = ((df['step'] - 1) % 24) + 1  # Converts to 1-24 range

# Feature 2: Day of simulation (1-31, since 743 hours ≈ 30.96 days)
df['day'] = ((df['step'] - 1) // 24) + 1  # 1-based day count

# Feature 3: Day of week, 0-6. Treating 0 as Monday assumes the first step falls
# on a Monday -- the data carries no calendar date, so this is only a convention.
df['day_of_week'] = ((df['step'] - 1) // 24) % 7  # 0-based weekday

# Feature 4: Weekend flag (1 for day_of_week 5 or 6)
df['is_weekend'] = ((df['day_of_week'] == 5) | (df['day_of_week'] == 6)).astype(int)

# Feature 5: Transaction burst. 1 only when the step difference to the same
# nameOrig's previous row is exactly 1; a repeat within the same step (diff 0)
# is NOT flagged. Relies on the rows being ordered by step within each account.
df['txn_burst'] = (df.groupby('nameOrig')['step'].diff() == 1).astype(int)

# Feature 6: Steps since the account's previous transaction (NaN for first txn per user).
# The NaNs need imputing before any estimator/resampler that rejects missing values.
df['hours_since_last_txn'] = df.groupby('nameOrig')['step'].diff()

# Feature 7: Per-account rolling count. An integer window counts ROWS, so this is
# the number of the account's transactions among its last 24 rows (capped at 24),
# not the number of transactions in the last 24 hours.
df['txn_count_24h'] = df.groupby('nameOrig')['step'].transform(
    lambda x: x.rolling(24, min_periods=1).count()
)

In [None]:
df.head(10)

#### The features below are for exploration purposes only; the purpose of this exercise is to learn, get things running, and keep it simple without going into extreme detail

In [None]:
# # Feature engineering
# # Set threshold at 90th percentile of non-fraud transactions
# balance_threshold = df[df['isFraud']==0]['oldbalanceOrg'].quantile(0.9)
# txn_ratio_threshold = 0.5  # 50% of balance

# df['high_risk_balance'] = (df['oldbalanceOrg'] > balance_threshold).astype(int)
# df['suspicious_withdrawal'] = (
#     (df['oldbalanceOrg'] > balance_threshold) & 
#     (df['amount'] > txn_ratio_threshold * df['oldbalanceOrg'])
# )

In [None]:
# # Flag transactions in top 5% of amounts
# amount_threshold = df['amount'].quantile(0.95)
# df['large_txn_flag'] = (df['amount'] > amount_threshold).astype(int)

# # Combined flag
# df['high_risk_combo'] = (
#     df['high_risk_balance'] | 
#     df['large_txn_flag']
# ).astype(int)

In [None]:
# # Feature Engineering 
# # High-risk balance threshold (adjust based on quartiles)  
# df['high_risk_balance'] = (df['oldbalanceOrg'] > 1_000_000).astype(int)  

# # Interaction with transaction amount  
# df['large_balance_large_txn'] = (df['oldbalanceOrg'] > 500_000) & (df['amount'] > 0.9 * df['oldbalanceOrg'])  

In [None]:
# # List of all features we created 
# engineered_features = [
#     'high_risk_balance',
#     'large_balance_large_txn', 
#     'large_txn_flag',
#     'high_risk_combo',
#     'suspicious_withdrawal'
# ]

# # Safely remove columns
# df = df.drop(columns=[col for col in engineered_features if col in df.columns], errors='ignore')

# # Verify removal
# print("Current columns:", df.columns.tolist())

#### First: Encode Categorical Variables
(Convert text → numbers before handling imbalance)

In [None]:
df = pd.get_dummies(df, columns=['type'], prefix='type') # handling the column 'type' first 
# Verify encoding
print(df.columns)

#### Handling High-Cardinality ID Columns (nameOrig, nameDest)
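These columns are identifiers (apparently the originating and destination account IDs) rather than descriptive features. Since they have extremely high cardinality (potentially millions of unique values), do NOT one-hot encode them. Instead, drop them from the feature matrix: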

In [None]:
# df = df.drop(['nameOrig', 'nameDest'], axis=1)  # Remove ID columns
# Everything except the target and the two ID columns is a model feature
non_feature_columns = ['isFraud', 'nameOrig', 'nameDest']
X = df.drop(columns=non_feature_columns)
y = df['isFraud']

# Check all remaining features are numeric
print(X.dtypes)

In [None]:
# Randomly sample up to 100K rows for feature analysis.
# - A seeded generator makes the sample (and the importances built on it) reproducible.
# - The size is capped at len(X): choice(..., replace=False) raises if asked for
#   more rows than exist.
sample_size = min(100_000, len(X))
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(X), size=sample_size, replace=False)
X_sample = X.iloc[sample_idx]
y_sample = y.iloc[sample_idx]

rf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
rf.fit(X_sample, y_sample)

In [None]:
# A deliberately small forest so that fitting on the full dataset stays fast
light_rf_params = {
    'n_estimators': 30,        # Reduced from 50
    'max_depth': 5,            # Shallower trees
    'min_samples_leaf': 100,   # Larger leaf nodes
    'n_jobs': -1,              # Use all CPU cores
    'random_state': 42,
}
rf = RandomForestClassifier(**light_rf_params)
rf.fit(X, y)  # Now runs much faster

In [None]:
# Permutation importance of the fitted forest, measured on the sampled rows only
from sklearn.inspection import permutation_importance

result = permutation_importance(rf, X_sample, y_sample, n_repeats=3, random_state=42, n_jobs=-1)

# Most important first; plot the leading 15
sorted_idx = result.importances_mean.argsort()[::-1]
top_idx = sorted_idx[:15]
plt.barh(X.columns[top_idx], result.importances_mean[top_idx])
plt.title("Permutation Importance")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit on the full numeric feature matrix
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X, y)  # Now works with numeric-only data

# Plot the 15 largest impurity-based importances
importances = pd.Series(rf.feature_importances_, index=X.columns)
importances.nlargest(15).plot(kind='barh')
plt.title("Top Predictive Features")
plt.show()

### Check if the data is imbalanced

In [None]:
fraud_count = df['isFraud'].value_counts()
fraud_count

In [None]:
fraud_percentage = df['isFraud'].value_counts(normalize=True) * 100
fraud_percentage

#### it looks like the data is highly imbalanced

In [None]:
df.head(10)

#### First, Split Your Data (Critical!)

In [None]:
# Separate the features from the target. The account-ID strings (nameOrig / nameDest)
# have to be dropped as well: PowerTransformer cannot fit text columns.
# NOTE(review): 'hours_since_last_txn' is NaN for each account's first transaction;
# impute or drop it before SMOTE, which is expected to reject missing values.
x = df.drop(columns=['isFraud', 'nameOrig', 'nameDest'])
y = df['isFraud']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42,stratify=y)

In [None]:
# Shapes of the four pieces produced by the split
split_parts = (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test ", x_test),
    ("y_test ", y_test),
)
for part_label, part in split_parts:
    print(f"{part_label} - >  ", part.shape)

#### Scale/transform

In [None]:
pt = PowerTransformer(method='yeo-johnson')

In [None]:
x_train_scaled = pt.fit_transform(x_train)
x_test_scaled = pt.transform(x_test)

#### handle Imbalance 

In [None]:
# encoder = {}
# for i in df.select_dtypes('object').columns:
#     encoder[i] = LabelEncoder()
#     df[i] = encoder[i].fit_transform(df[i])

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy=0.3, random_state=42)  # 3:10 fraud/non-fraud ratio
# Resample the *transformed* training matrix. Every model is evaluated on
# x_test_scaled, so it must also be trained in the transformed feature space;
# resampling the raw x_train trained and tested on different scales.
X_res, y_res = smote.fit_resample(x_train_scaled, y_train)

print("Resampled class counts:", pd.Series(y_res).value_counts())

### Train and Predict

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

def evaluate_model(model, X_train, y_train, X_test, y_test, name):
    """Fit ``model``, plot its confusion matrix and return its fraud metrics.

    Returns a dict (model name, recall/precision/F1 of class '1', ROC AUC, and the
    false-positive / false-negative counts). ROC AUC is None when the model has no
    ``predict_proba``. Any exception is printed and turned into a None result, so
    one failing model does not stop the comparison.
    """
    try:
        model.fit(X_train, y_train)

        # Hard labels always; fraud probabilities only when the model offers them
        y_pred = model.predict(X_test)
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_test)[:, 1]
        else:
            y_proba = None

        report = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred)

        # Confusion-matrix heatmap with the two error cells outlined
        column_labels = ['Predicted Non-Fraud', 'Predicted Fraud']
        row_labels = ['Actual Non-Fraud', 'Actual Fraud']
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=column_labels, yticklabels=row_labels)
        plt.title(f'{name}\nFP: {cm[0,1]} | FN: {cm[1,0]}')
        # (x, y) corner of the cell: false positives in red, false negatives in orange
        for cell_corner, outline in (((1, 0), 'red'), ((0, 1), 'orange')):
            plt.gca().add_patch(
                plt.Rectangle(cell_corner, 1, 1, fill=False, edgecolor=outline, lw=2)
            )
        plt.show()

        fraud_row = report['1']
        metrics_row = {
            'Model': name,
            'Recall (Fraud)': fraud_row['recall'],
            'Precision (Fraud)': fraud_row['precision'],
            'F1 (Fraud)': fraud_row['f1-score'],
            'ROC AUC': roc_auc_score(y_test, y_proba) if y_proba is not None else None,
            'Type I (FP)': cm[0, 1],
            'Type II (FN)': cm[1, 0]
        }
        return metrics_row
    except Exception as e:
        print(f"❌ Failed for {name}: {str(e)}")
        return None

In [None]:
results = []
from xgboost import XGBClassifier

# scale_pos_weight should be the negative/positive ratio of the data the model is
# actually fitted on. That is the SMOTE-resampled y_res, not the original y_train:
# using y_train would compensate for the imbalance a second time.
negative_count = (y_res == 0).sum()
positive_count = (y_res == 1).sum()

xgb = XGBClassifier(
    scale_pos_weight=float(negative_count / positive_count),
    n_estimators=100,
    max_depth=6,
    tree_method='hist',  # Faster than exact
    eval_metric='aucpr',
    n_jobs=-1,
    random_state=42
)
results.append(evaluate_model(xgb, X_res, y_res, x_test_scaled, y_test, "XGBoost (Fast)"))
del xgb

In [None]:
from lightgbm import LGBMClassifier

# 2. LightGBM with balanced class weights
lgbm_params = {
    'class_weight': 'balanced',
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'n_jobs': -1,
    'random_state': 42,
}
lgbm = LGBMClassifier(**lgbm_params)
lgbm_metrics = evaluate_model(lgbm, X_res, y_res, x_test_scaled, y_test, "LightGBM")
results.append(lgbm_metrics)
del lgbm

In [None]:
# 3. Random Forest (Balanced)
# ==============================================
from sklearn.ensemble import RandomForestClassifier

rf_params = {
    'class_weight': 'balanced',
    'n_estimators': 50,
    'max_depth': 7,
    'max_samples': 0.8,
    'n_jobs': -1,
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_params)
rf_metrics = evaluate_model(rf, X_res, y_res, x_test_scaled, y_test, "Random Forest")
results.append(rf_metrics)
del rf

In [None]:
# 4. Gradient Boosting (Lightweight)
# ==============================================
from sklearn.ensemble import GradientBoostingClassifier

gb_params = {
    'n_estimators': 50,
    'learning_rate': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'random_state': 42,
}
gb = GradientBoostingClassifier(**gb_params)
gb_metrics = evaluate_model(gb, X_res, y_res, x_test_scaled, y_test, "Gradient Boosting")
results.append(gb_metrics)
del gb

In [None]:
# 5. Logistic Regression (Fast)
# ==============================================
from sklearn.linear_model import LogisticRegression

# n_jobs is not passed: the 'liblinear' solver ignores it, so n_jobs=-1 bought no
# parallelism and only triggered a warning.
logreg = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    penalty='l1',
    max_iter=200,
    random_state=42
)
results.append(evaluate_model(logreg, X_res, y_res, x_test_scaled, y_test, "Logistic Reg"))
del logreg

In [None]:
# # 3. Support Vector Classifier - SVC (Caution)
# # ==============================================
# from sklearn.svm import SVC
# svc = SVC(
#     class_weight='balanced',
#     probability=True,
#     kernel='rbf',
#     gamma='scale',
#     random_state=42,
#     cache_size=1000  # Helps with memory
# )
# svc_results = evaluate_model(svc, X_res, y_res, x_test_scaled, y_test, "SVC")
# if svc_results: results.append(svc_results)
# del svc

In [None]:
# 6. AdaBoost (Quick)
# ==============================================
from sklearn.ensemble import AdaBoostClassifier

ada_params = {'n_estimators': 50, 'learning_rate': 0.5, 'random_state': 42}
ada = AdaBoostClassifier(**ada_params)
ada_metrics = evaluate_model(ada, X_res, y_res, x_test_scaled, y_test, "AdaBoost")
results.append(ada_metrics)
del ada

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# 7. Extra Trees with balanced class weights and bootstrapped samples
et_params = {
    'class_weight': 'balanced',
    'n_estimators': 50,
    'max_depth': 7,
    'bootstrap': True,
    'n_jobs': -1,
    'random_state': 42,
}
et = ExtraTreesClassifier(**et_params)
et_metrics = evaluate_model(et, X_res, y_res, x_test_scaled, y_test, "Extra Trees")
results.append(et_metrics)
del et

#### Combine and Rank Results

In [None]:
# Build the results table -- THIS MUST COME AFTER ALL MODELS RUN.
# Models whose evaluation failed left a None entry behind; skip those.
successful_runs = list(filter(lambda run: run is not None, results))
results_df = pd.DataFrame(successful_runs)

# Best fraud detector first: highest recall, then highest F1, then fewest misses
ranking_keys = ['Recall (Fraud)', 'F1 (Fraud)', 'Type II (FN)']
ranked_df = results_df.sort_values(by=ranking_keys, ascending=[False, False, True])

# Add rank column (1 = best)
ranked_df['Rank'] = range(1, len(ranked_df)+1)

# Highlight top 3 models
def highlight_top3(s):
    """Return one CSS string per cell, highlighting the three largest values of ``s``.

    Meant for ``Styler.apply``, which passes each selected column as a Series.
    Membership is tested on the Series' index *labels*: the ranked table is
    sorted, so its labels no longer equal the row positions, and comparing
    positions to labels highlighted the wrong rows.
    """
    top3 = set(s.nlargest(3).index)
    return ['background-color: #FFFF00' if label in top3 else '' for label in s.index]

ranked_df.style.apply(highlight_top3, subset=['Recall (Fraud)', 'F1 (Fraud)'])

### Now that the model is complete, here are the steps to modularize the code, broken down as follows

### Step-1 Testing the Model Locally

In [None]:
fraud_prediction/
├── src/
│   ├── config.py
│   ├── preprocess.py
│   ├── feature_engineer.py
│   ├── predict.py
│   ├── train.py
│   ├── evaluate.py
│   ├── app.py
│   └── requirements.txt
├── data/
│   └── Fraud.csv
├── models/ (will be created)
└── logs/ (will be created)

#### Modularized Code Breakdown

##### 1- Config.py

In [None]:
# config.py
from pathlib import Path
from datetime import datetime

# Project setup
# PROJECT_ROOT is the directory one level above the one containing this file.
PROJECT_ROOT = Path(__file__).parent.parent
DATA_PATH = PROJECT_ROOT / 'data' / 'Fraud.csv'  # input CSV
MODEL_PATH = PROJECT_ROOT / 'models' / 'fraud_model.joblib'  # serialized model artifacts

# Data loading
N_ROWS = 100000  # number of CSV rows to load; intended for pd.read_csv(nrows=...)

# Feature engineering
# Quantiles used to derive the amount / balance thresholds.
AMOUNT_PERCENTILE = 0.95
BALANCE_PERCENTILE = 0.9

# Model training
RANDOM_STATE = 42  # shared seed
TEST_SIZE = 0.3  # fraction of rows held out for testing
SMOTE_RATIO = 0.3  # SMOTE sampling_strategy

class AppConfig:
    """Settings for the prediction API and its monitoring output."""

    # API Settings
    HOST = "0.0.0.0"  # listen on all network interfaces
    PORT = 8080
    DEBUG = False

    # Model Monitoring
    PREDICTION_LOGS = PROJECT_ROOT / "logs" / "predictions.log"
    DRIFT_THRESHOLD = 0.15

    @classmethod
    def ensure_dirs_exist(cls):
        """Create the directories that hold the prediction log and the model file.

        The directories are derived from the configured paths, so changing
        PREDICTION_LOGS or MODEL_PATH cannot leave this method creating the wrong
        folders; ``parents=True`` also covers missing intermediate directories.
        """
        for directory in (cls.PREDICTION_LOGS.parent, MODEL_PATH.parent):
            directory.mkdir(parents=True, exist_ok=True)

# Initialize directories
AppConfig.ensure_dirs_exist()

##### 2- Preprocess.py

In [None]:
import pandas as pd
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from config import DATA_PATH, N_ROWS, TEST_SIZE, RANDOM_STATE, SMOTE_RATIO
from feature_engineer import engineer_features
import config  

def load_and_preprocess():
    """Build the training and test matrices from the raw CSV.

    Steps: read the first N_ROWS rows, engineer features, drop the target and the
    two ID columns, make a stratified train/test split, fit a Yeo-Johnson
    PowerTransformer on the training part, and oversample the transformed
    training data with SMOTE.

    Returns (X_res, y_res, X_test_scaled, y_test, pt), where ``pt`` is the fitted
    transformer.
    """
    frame = engineer_features(pd.read_csv(DATA_PATH, nrows=N_ROWS))

    target = frame['isFraud']
    features = frame.drop(columns=['isFraud', 'nameOrig', 'nameDest'])

    # Stratify so both parts keep the original fraud rate
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=target,
    )

    # The transformer is fitted on the training part only
    pt = PowerTransformer(method='yeo-johnson')
    X_train_scaled = pt.fit_transform(X_train)
    X_test_scaled = pt.transform(X_test)

    # Oversample the training data only; the test set stays untouched
    resampler = SMOTE(sampling_strategy=SMOTE_RATIO, random_state=RANDOM_STATE)
    X_res, y_res = resampler.fit_resample(X_train_scaled, y_train)

    return X_res, y_res, X_test_scaled, y_test, pt

##### 3- feature_engineer.py

In [None]:
import pandas as pd
import numpy as np
from config import AMOUNT_PERCENTILE, BALANCE_PERCENTILE

def engineer_features(df):
    """Feature engineering pipeline.

    Adds the transaction and time features to ``df`` in place, then returns a new
    frame in which 'type' is replaced by one-hot 'type_*' columns. The thresholds
    are quantiles of the non-fraud rows, so ``df`` must contain 'isFraud'.
    """
    # Thresholds from legitimate transactions only
    legit_rows = df.loc[df['isFraud'] == 0]
    amt_thresh = legit_rows['amount'].quantile(AMOUNT_PERCENTILE)
    bal_thresh = legit_rows['oldbalanceOrg'].quantile(BALANCE_PERCENTILE)

    # Transaction features (+1 avoids dividing by a zero balance)
    amount_ratio = df['amount'] / (df['oldbalanceOrg'] + 1)
    debited = df['oldbalanceOrg'] - df['newbalanceOrig']
    df['amount_to_balance'] = amount_ratio
    df['high_amount_flag'] = df['amount'].gt(amt_thresh).astype(int)
    df['balance_change_abs'] = debited
    df['suspicious_withdrawal'] = (debited.gt(bal_thresh) & amount_ratio.gt(0.5)).astype(int)

    # Time features
    zero_based_step = df['step'] - 1
    df['hour_of_day'] = (zero_based_step % 24) + 1
    df['day_of_week'] = (zero_based_step // 24) % 7
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Categorical encoding
    return pd.get_dummies(df, columns=['type'], prefix='type')

##### 4- Predict.py

In [None]:
# predict.py
from joblib import load
import pandas as pd
from config import MODEL_PATH, AppConfig, AMOUNT_PERCENTILE, BALANCE_PERCENTILE 
import json
from datetime import datetime
import logging

class FraudPredictor:
    """Serve fraud predictions from the saved model artifacts.

    The artifact file at MODEL_PATH is expected to hold a dict with the keys
    'model' (fitted classifier) and 'transformer' (fitted feature transformer).
    """

    # Feature order used when the fitted transformer does not record its own.
    DEFAULT_FEATURE_ORDER = [
        'step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
        'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud',
        'amount_to_balance', 'high_amount_flag', 'balance_change_abs',
        'suspicious_withdrawal', 'hour_of_day', 'day_of_week', 'is_weekend',
        'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER'
    ]

    # NOTE(review): these are fixed cut-offs. If training derives its thresholds
    # from data percentiles (see AMOUNT_PERCENTILE / BALANCE_PERCENTILE), the two
    # flags below can be computed differently at inference time -- consider
    # persisting the training thresholds with the model artifacts.
    HIGH_AMOUNT_THRESHOLD = 10000
    BALANCE_CHANGE_THRESHOLD = 5000

    VALID_TYPES = ('CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER')

    REQUIRED_FIELDS = frozenset({
        'amount', 'oldbalanceOrg', 'newbalanceOrig',
        'oldbalanceDest', 'newbalanceDest', 'step',
        'isFlaggedFraud', 'type'
    })

    def __init__(self):
        self._load_model()
        self._init_logging()

    def _load_model(self):
        """Load the model and transformer and settle the feature order.

        Raises:
            RuntimeError: if the artifacts cannot be loaded (chained to the
                original error).
        """
        try:
            artifacts = load(MODEL_PATH)
            self.model = artifacts['model']
            self.pt = artifacts['transformer']
        except Exception as e:
            raise RuntimeError(f"Model loading failed: {str(e)}") from e

        # MUST match EXACTLY what was used in training. Prefer the column names
        # the transformer recorded when it was fitted on a DataFrame; fall back
        # to the hand-maintained list otherwise.
        fitted_names = getattr(self.pt, 'feature_names_in_', None)
        if fitted_names is not None:
            self.feature_order = [str(column) for column in fitted_names]
        else:
            self.feature_order = list(self.DEFAULT_FEATURE_ORDER)
        print("✅ Model loaded with features:", self.feature_order)

    def _init_logging(self):
        """Set up prediction logging"""
        # basicConfig configures the root logger and does nothing if the root
        # logger already has handlers.
        logging.basicConfig(
            filename=AppConfig.PREDICTION_LOGS,
            format='%(asctime)s - %(message)s',
            level=logging.INFO
        )
        self.logger = logging.getLogger(__name__)

    def _validate_input(self, data: dict) -> None:
        """Ensure minimum required fields exist.

        Raises:
            ValueError: naming the required fields missing from ``data``.
        """
        missing = self.REQUIRED_FIELDS - set(data.keys())
        if missing:
            raise ValueError(f"Missing required fields: {missing}")

    def log_prediction(self, data: dict, prediction: int):
        """Log prediction with context (one JSON document per line)."""
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            # The 'type' field is left out of the logged input
            "input": {k: v for k, v in data.items() if k != 'type'},  # Sanitize
            "prediction": prediction,
            "model_version": "1.0.0"
        }
        self.logger.info(json.dumps(log_entry))

    def preprocess(self, transaction_data: dict):
        """Turn one transaction dict into the transformed feature row.

        Returns:
            The output of the fitted transformer for the single-row frame, with
            columns in ``self.feature_order``.

        Raises:
            ValueError: if a feature expected by the model cannot be built.
        """
        df = pd.DataFrame([transaction_data])

        # Feature engineering (MUST match training)
        df['amount_to_balance'] = df['amount'] / (df['oldbalanceOrg'] + 1)
        df['high_amount_flag'] = (df['amount'] > self.HIGH_AMOUNT_THRESHOLD).astype(int)
        df['balance_change_abs'] = df['oldbalanceOrg'] - df['newbalanceOrig']
        df['suspicious_withdrawal'] = (
            (df['balance_change_abs'] > self.BALANCE_CHANGE_THRESHOLD) &
            (df['amount_to_balance'] > 0.5)
        ).astype(int)

        # Time features
        df['hour_of_day'] = ((df['step'] - 1) % 24) + 1
        df['day_of_week'] = ((df['step'] - 1) // 24) % 7
        df['is_weekend'] = ((df['day_of_week'] == 5) | (df['day_of_week'] == 6)).astype(int)

        # Transaction type handling: one-hot columns for the known types. An
        # unknown or absent type leaves all of them at 0.
        for t in self.VALID_TYPES:
            df[f'type_{t}'] = 0
        if 'type' in df and df['type'].iloc[0] in self.VALID_TYPES:
            df[f'type_{df["type"].iloc[0]}'] = 1

        # Verify feature match
        missing = set(self.feature_order) - set(df.columns)
        if missing:
            raise ValueError(f"Missing features after processing: {missing}")

        return self.pt.transform(df[self.feature_order])

    def predict(self, transaction_data: dict) -> int:
        """Validate, preprocess and classify one transaction.

        Returns:
            The model's predicted label as an int.

        Raises:
            RuntimeError: for any failure. The original exception is attached as
                ``__cause__`` and its traceback is written to the log.
        """
        try:
            self._validate_input(transaction_data)
            processed = self.preprocess(transaction_data)
            prediction = int(self.model.predict(processed)[0])
            self.log_prediction(transaction_data, prediction)
            return prediction
        except Exception as e:
            self.logger.exception("Prediction failed: %s", e)
            raise RuntimeError(f"Prediction failed: {str(e)}") from e


##### 5-Train 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from joblib import dump
from preprocess import load_and_preprocess
from config import MODEL_PATH, RANDOM_STATE
from sklearn.metrics import classification_report
import pandas as pd  # Added for feature order logging

def train_model():
    """Fit the Random Forest on the preprocessed data and persist it.

    Loads the splits and the fitted transformer from ``load_and_preprocess``,
    trains the classifier, prints a classification report for the test split,
    prints the training column names when they are available, and dumps the
    model together with the transformer to ``MODEL_PATH``.
    """
    print("🚀 Starting model training...")

    # Data: resampled training split, held-out test split, fitted transformer
    print("🔍 Loading and preprocessing data...")
    X_res, y_res, X_test, y_test, pt = load_and_preprocess()

    # Classifier configuration
    print("🤖 Initializing Random Forest model...")
    forest_params = dict(
        class_weight='balanced',
        n_estimators=50,
        max_depth=7,
        max_samples=0.8,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    model = RandomForestClassifier(**forest_params)

    # Fit
    print("⚡ Training model...")
    model.fit(X_res, y_res)

    # Report on the held-out split
    print("🧪 Evaluating model...")
    test_predictions = model.predict(X_test)
    print(classification_report(y_test, test_predictions))

    # Column names are only known when the features arrive as a DataFrame
    print("\n=== FEATURE ORDER USED IN TRAINING ===")
    if not isinstance(X_res, pd.DataFrame):
        print("Warning: Features are numpy arrays - cannot display names")
    else:
        print(X_res.columns.tolist())

    # Persist model and transformer together so inference loads both
    artifacts = {'model': model, 'transformer': pt}
    dump(artifacts, MODEL_PATH)
    print(f"\n✅ Model successfully saved to {MODEL_PATH}")
    
if __name__ == "__main__":
    train_model()

##### 6-Evaluate.py

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate(y_true, y_pred):
    """Print a classification report and show the confusion matrix heatmap.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
    """
    report = classification_report(y_true, y_pred)
    print(report)

    # Integer-annotated heatmap of the confusion matrix
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6,4))
    sns.heatmap(matrix, cmap='Blues', fmt='d', annot=True)
    plt.show()

#### Now Let's Build the Flask API (app.py)

In [None]:
from flask import Flask, request, jsonify
from predict import FraudPredictor
import traceback
from datetime import datetime
from config import AppConfig

app = Flask(__name__)

# Build the predictor once at import time; a failure here is reported on
# stdout and then re-raised so the application does not start without it.
try:
    predictor = FraudPredictor()
except Exception as init_error:
    print(f"❌ Failed to initialize predictor: {str(init_error)}")
    raise

@app.route('/predict', methods=['POST'])
def predict():
    """Score one transaction posted as a JSON object.

    Responses:
        200: prediction plus request metadata and model info.
        400: missing or unparseable body, a JSON body that is not an object,
            or a ``ValueError`` raised while predicting.
        500: any other failure.
    """
    try:
        # silent=True makes Flask return None for a malformed or non-JSON
        # body instead of raising, so the client gets the 400 below rather
        # than falling into the generic 500 handler.
        data = request.get_json(silent=True)

        if not data:
            return jsonify({"error": "No JSON provided"}), 400

        # The predictor reads fields by key, so only JSON objects are valid.
        if not isinstance(data, dict):
            return jsonify({
                "error": "JSON body must be an object",
                "status": "input_error"
            }), 400

        # Add request metadata
        request_meta = {
            "timestamp": datetime.now().isoformat(),
            "endpoint": "predict",
            "client_ip": request.remote_addr
        }

        # Make prediction
        prediction = predictor.predict(data)

        return jsonify({
            "fraud_prediction": prediction,
            "meta": request_meta,
            "model_info": {
                "version": "1.0.0",
                "type": "RandomForest"
            },
            "status": "success"
        })

    except ValueError as e:
        return jsonify({
            "error": str(e),
            "status": "input_error"
        }), 400

    except Exception as e:
        return jsonify({
            "error": str(e),
            "status": "server_error"
        }), 500

@app.route('/health', methods=['GET'])
def health():
    """Return a static health payload stamped with the current time."""
    payload = {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "model_loaded": True
    }
    return jsonify(payload)

if __name__ == '__main__':
    app.run(host=AppConfig.HOST, port=AppConfig.PORT, debug=AppConfig.DEBUG)

print(" * Application started successfully!")

#### Open your CLI in your folder location (in my case I am using cmd) and here are the steps along with the following commands

##### 1-Install dependencies:

In [None]:
# pip install -r src\requirements.txt

##### 2- Run the training  

In [None]:
# python src\train.py

##### 3- Start the API 

In [None]:
# python src\app.py

##### Expected output should look like this 

In [None]:
C:\Projects\fraud_detection\src>python app.py
✅ Model loaded with features: ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud', 'amount_to_balance', 'high_amount_flag', 'balance_change_abs', 'suspicious_withdrawal', 'hour_of_day', 'day_of_week', 'is_weekend', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
 * Serving Flask app 'app'
 * Debug mode: off


##### Now let's put this Model to Test! Copy the features in exact order and give it values. Now in a new CMD window execute the following

In [None]:
curl -X POST http://localhost:8080/predict ^
-H "Content-Type: application/json" ^
-d "{^
\"step\": 1,^
\"amount\": 9839.64,^
\"oldbalanceOrg\": 170136.0,^
\"newbalanceOrig\": 160296.36,^
\"oldbalanceDest\": 0.0,^
\"newbalanceDest\": 9839.64,^
\"isFlaggedFraud\": 0,^
\"type\": \"CASH_OUT\",^
\"amount_to_balance\": 0.0578,^
\"high_amount_flag\": 1,^
\"balance_change_abs\": 9839.64,^
\"suspicious_withdrawal\": 0,^
\"hour_of_day\": 1,^
\"day_of_week\": 0,^
\"is_weekend\": 0^
}"

### Step-2 Dockerizing the app

#### Create your serve.py file. It serves two main purposes:

#### 1- Replaces Flask's Development Server

##### Flask's built-in server (app.run()) is not suitable for production (slow, insecure, single-threaded).

##### waitress is a production-ready WSGI server that handles multiple requests efficiently.

#### 2- Standardizes the Startup Process

##### Provides a consistent entry point for Docker to launch your app.

##### Ensures directories exist and logging is configured before starting.

In [None]:
# Serve.py
from waitress import serve
from app import app  # Import your Flask app
from config import PROJECT_ROOT
import os
import logging

# Production configuration
# Make sure the models directory under the project root exists before serving.
MODEL_DIR = PROJECT_ROOT / 'models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Configure logging
# NOTE(review): `app` is imported above; if that import already configured the
# root logger, this basicConfig call has no effect — confirm which one wins.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('waitress')
# Runs at import time, not only when this file is executed as a script.
logger.info('Starting server...')

if __name__ == '__main__':
    # The banner says localhost, while the server binds to all interfaces
    # (0.0.0.0) on the same port.
    print(f"🚀 Serving fraud detection API on http://localhost:8080")
    serve(app, host='0.0.0.0', port=8080)  # Production-ready server

#### Create a Dockerfile in your project root, and make sure the file has no .txt extension

#### Core Purpose of the Dockerfile

##### Your Dockerfile is a blueprint that:

##### - Creates a reproducible environment for your app

##### - Packages all dependencies, code, and models into a single deployable unit

##### - Standardizes how your app runs across different machines

In [None]:
fraud_detection/
├── src/
│   ├── app.py               # Flask app
│   ├── config.py            # Config
│   ├── ...                  # Other Python files
│   ├── requirements.txt     # Dependencies
│   └── Dockerfile           # Your Dockerfile; that's where it would be created
├── models/
│   └── fraud_model.joblib   # Your trained model
└── data/
    └── Fraud.csv            # Dataset

In [2]:
# Dockerfile
# Image for the fraud detection API: Python 3.10 slim base, dependencies from
# requirements.txt, application code under /app, served by Waitress on 8080.
FROM python:3.10-slim
WORKDIR /app

# Copy model file (ensure it's in your project directory)
# NOTE(review): this places the model in /models, outside WORKDIR /app, while
# the docker run example mounts the host models folder at /app/models —
# confirm which path the application's MODEL_PATH points to.
COPY fraud_model.joblib /models/

# Copy requirements first for caching
COPY requirements.txt .
RUN pip install --upgrade pip setuptools wheel
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the app
COPY . .

# Verify files (debugging)
# These listings only print during the build; they do not change the image contents.
RUN ls -l /app/
RUN ls -l /models/

# Use Waitress for production WSGI server
# Imports `app` from app.py in /app and serves it on all interfaces, port 8080.
CMD ["python", "-c", "from waitress import serve; from app import app; print('Launching app...'); serve(app, host='0.0.0.0', port=8080)"]

#### Now Open CMD in the folder location and execute this command to build the docker

#### 1. Local Docker Testing (Dev Environment)

##### Step 1: Build the Docker Image

In [None]:
# Navigate to your project directory
# docker build --no-cache -t fraud-detection-api .

##### Step 2: Verify the Image

In [3]:
# docker images  <-- run this after the build completes successfully; it should return something like this
C:\Projects\fraud_detection\src>docker images
REPOSITORY            TAG       IMAGE ID       CREATED         SIZE
fraud-detection-api   latest    5e4c6603ae57   4 minutes ago   686MB

In [None]:
# If needed commands
docker ps -a  # List all containers
docker stop <container-id>  # Stop the one using port
docker rm <container-id>  # Remove it

##### Step 3: Run the Container

In [None]:
docker run -it -p 8080:8080 ^
  -v "%cd%\models":/app/models ^
  -v "%cd%\logs":/app/logs ^
  --name fraud-container ^
  fraud-detection-api

In [None]:
Results should look something like this
C:\Projects\fraud_detection\src>docker run -it -p 8080:8080 ^
More?   -v "%cd%\models":/app/models ^
More?   -v "%cd%\logs":/app/logs ^
More?   --name fraud-container ^
More?   fraud-detection-api
✅ Model loaded with features: ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud', 'amount_to_balance', 'high_amount_flag', 'balance_change_abs', 'suspicious_withdrawal', 'hour_of_day', 'day_of_week', 'is_weekend', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
 * Application started successfully!
Launching app...


##### In a NEW CMD Window, verify it is running

In [5]:
# docker ps

##### Step 4: Test the API 

In [None]:
curl -X POST http://localhost:8080/predict ^
-H "Content-Type: application/json" ^
-d "{\"step\":1,\"amount\":1000,\"oldbalanceOrg\":5000,\"newbalanceOrig\":4000,\"oldbalanceDest\":0,\"newbalanceDest\":1000,\"isFlaggedFraud\":0,\"amount_to_balance\":0.2,\"high_amount_flag\":0,\"balance_change_abs\":1000,\"suspicious_withdrawal\":0,\"hour_of_day\":10,\"day_of_week\":2,\"is_weekend\":0,\"type\":\"CASH_OUT\",\"type_CASH_IN\":0,\"type_CASH_OUT\":1,\"type_DEBIT\":0,\"type_PAYMENT\":0,\"type_TRANSFER\":0}"


In [None]:
Your output should look something like this 
C:\Projects\fraud_detection\src>curl -X POST http://localhost:8080/predict ^
More? -H "Content-Type: application/json" ^
More? -d "{\"step\":1,\"amount\":1000,\"oldbalanceOrg\":5000,\"newbalanceOrig\":4000,\"oldbalanceDest\":0,\"newbalanceDest\":1000,\"isFlaggedFraud\":0,\"amount_to_balance\":0.2,\"high_amount_flag\":0,\"balance_change_abs\":1000,\"suspicious_withdrawal\":0,\"hour_of_day\":10,\"day_of_week\":2,\"is_weekend\":0,\"type\":\"CASH_OUT\",\"type_CASH_IN\":0,\"type_CASH_OUT\":1,\"type_DEBIT\":0,\"type_PAYMENT\":0,\"type_TRANSFER\":0}"
{"fraud_prediction":0,"meta":{"client_ip":"172.17.0.1","endpoint":"predict","timestamp":"2025-04-18T01:14:44.901829"},"model_info":{"type":"RandomForest","version":"1.0.0"},"status":"success"}

##### View Real-Time Logs, Validation:

In [None]:
docker logs -f fraud-container

In [None]:
Additional details: in case you need to reset
docker stop fraud-container
docker rm fraud-container
docker rmi fraud-detection-api
docker build -t fraud-detection-api .

#### Transition to Cloud Deployment: The Model is Now Portable
Now the fraud detection model is fully containerized and ready for deployment to any cloud platform (AWS, Azure, GCP) or on-premises Kubernetes cluster.

##### By Dockerizing the model, I’ve achieved:
✅ Reproducibility: Anyone with my Docker image can run the exact same environment (dependencies, code, and configurations).

✅ Portability: The same image runs identically on any cloud or local machine.

✅ Scalability: Ready to deploy on Kubernetes for high availability and load balancing.

#### 2- Local Kubernetes (Docker Desktop)
this will mimic how it would work if you deploy it using any of the cloud platforms (AWS, GCP, AZURE) using kubernetes 

#### Step: 1 In the folder location, Create YAML files 

#### deployment.yaml

In [None]:
# Deployment for the fraud detection API on a local (Docker Desktop) cluster.
# Runs one pod from the locally built image and mounts the host's models and
# logs folders into the container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: fraud-detection-api
spec:
  replicas: 1  # Single replica for local testing
  selector:
    matchLabels:
      app: fraud-detection
  template:
    metadata:
      labels:
        app: fraud-detection  # Must match the Service selector
    spec:
      containers:
      - name: fraud-api
        image: fraud-detection-api:latest  # Uses local Docker image
        # A ":latest" tag defaults to imagePullPolicy "Always", which makes the
        # kubelet try to pull from a registry; IfNotPresent lets it use the
        # image that was built locally.
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 8080  # Port Waitress listens on inside the container
        volumeMounts:
        - mountPath: /app/models
          name: models-volume
        - mountPath: /app/logs
          name: logs-volume
      volumes:
      - name: models-volume
        hostPath:
          path: /run/desktop/mnt/host/c/Projects/fraud_detection/src/models  # WSL 2 host path
          type: Directory  # The folder must already exist on the host
      - name: logs-volume
        hostPath:
          path: /run/desktop/mnt/host/c/Projects/fraud_detection/src/logs
          type: Directory  # The folder must already exist on the host

#### service.yaml

In [None]:
# Service exposing the fraud detection pods outside the local cluster.
apiVersion: v1
kind: Service
metadata:
  name: fraud-detection-service
spec:
  type: NodePort  # For local access
  selector:
    app: fraud-detection  # Matches the pod label set in deployment.yaml
  ports:
    - protocol: TCP
      port: 80  # Port of the Service inside the cluster
      targetPort: 8080  # containerPort of the API container
      nodePort: 30080  # Fixed port for easy access

#### Step 2: Enable Kubernetes 
1- Open Docker Desktop → Settings → Kubernetes → Enable.

2- Verify:

cmd

kubectl cluster-info

#### Step 3: Deploy to Local Kubernetes

In [None]:
Apply Configurations
kubectl apply -f deployment.yaml
kubectl apply -f service.yaml

In [None]:
Verify Deployment
kubectl get pods -w  # Watch pod status (should transition to "Running")
NAME                                   READY   STATUS    RESTARTS   AGE
fraud-detection-api-5f7d8c6c58-abcde   1/1     Running   0          30s
kubectl get svc      # Check service details
NAME                      TYPE       CLUSTER-IP      PORT(S)        AGE
fraud-detection-service   NodePort   10.96.XXX.XXX   80:30080/TCP   XXs

In [None]:
Make a prediction request
in CMD Window execute
curl -X POST http://localhost:30080/predict ^
-H "Content-Type: application/json" ^
-d "{\"step\":1, \"amount\":1000, \"oldbalanceOrg\":5000, \"newbalanceOrig\":4000, \"oldbalanceDest\":1000, \"newbalanceDest\":2000, \"isFlaggedFraud\":0, \"type\":\"TRANSFER\"}"

In [None]:
Alternative way to test fraud detection cases (use PowerShell, as it is more reliable for JSON)
$fraudTest = {
    $body = @{
        step = 1
        amount = 999999
        oldbalanceOrg = 5000
        newbalanceOrig = 0
        oldbalanceDest = 0
        newbalanceDest = 999999
        isFlaggedFraud = 1
        type = "TRANSFER"
    } | ConvertTo-Json
    Invoke-RestMethod -Uri "http://localhost:30080/predict" -Method Post -Body $body -ContentType "application/json"
}

1..5 | ForEach-Object { Start-Job -ScriptBlock $fraudTest } | Wait-Job | Receive-Job

In [None]:
Parallel Testing Command (PowerShell)
# 1. Create a test function
$testRequest = {
    $body = @{
        step = 1
        amount = 1000
        oldbalanceOrg = 5000
        newbalanceOrig = 4000
        oldbalanceDest = 1000
        newbalanceDest = 2000
        isFlaggedFraud = 0
        type = "TRANSFER"
    } | ConvertTo-Json

    try {
        $response = Invoke-RestMethod -Uri "http://localhost:30080/predict" -Method Post -Body $body -ContentType "application/json"
        [PSCustomObject]@{
            Success = $true
            Prediction = $response.fraud_prediction
            Timestamp = $response.meta.timestamp
        }
    } catch {
        [PSCustomObject]@{
            Success = $false
            Error = $_.Exception.Message
        }
    }
}

# 2. Run 10 parallel requests
$jobs = 1..10 | ForEach-Object {
    Start-Job -ScriptBlock $testRequest
}

# 3. Wait for completion and show results
$results = $jobs | Wait-Job | Receive-Job
$jobs | Remove-Job

# 4. Display formatted results
$results | Format-Table -AutoSize

In [None]:
Add Monitoring (In powershell)
# Install Kubernetes metrics server (if not installed)
kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml

# Set up dashboard
kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.7.0/aio/deploy/recommended.yaml
kubectl proxy


In [None]:
Access at: 
http://localhost:8001/api/v1/namespaces/kubernetes-dashboard/services/https:kubernetes-dashboard:/proxy/

### Stage 4: Building a Complete CI/CD Pipeline for the Fraud detection API using GitHub Actions, Docker, and Kubernetes

#### Prerequisites
Before we begin, ensure you have:

1- Git installed on your local machine

2- Docker installed

3- Kubernetes (via Docker Desktop) enabled

4- A GitHub account

5- Your code ready

#### Step 1: Initialize Your Local Repository

In [None]:
# Create a new directory for your project (if starting fresh)
mkdir fraud-detection-cicd
cd fraud-detection-cicd

# Initialize git repository
git init

# Create basic directory structure
mkdir -p src/config src/preprocess src/feature_engineer src/train src/predict src/evaluate
mkdir tests deployments

# Create README, sample below 
echo "# Fraud Detection ML API with CI/CD Pipeline" > README.md

#### Step 2: Add Your Existing Files


In [None]:
fraud-detection-cicd/
├── src/
│   ├── config/
│   ├── preprocess/
│   ├── feature_engineer/
│   ├── train/
│   ├── predict/
│   ├── evaluate/
│   └── app.py (your Flask app)
├── tests/
├── deployments/
│   ├── deployment.yaml
│   └── service.yaml
├── Dockerfile
├── requirements.txt
└── README.md

#### Step 3: Create a .gitignore File

In [None]:
# Create .gitignore file
echo "venv/
*.pyc
__pycache__/
*.swp
.env
*.pkl
*.model
.DS_Store
.ipynb_checkpoints/
*.egg-info/
dist/
build/" > .gitignore

#### Step 4: Connect to GitHub Repository

In [None]:
# Stage all files
git add .

# Authorize 

git config --global user.email "your_email@example.com"
git config --global user.name "Your Name"

# Commit initial files
git commit -m "Initial commit with basic structure"

# Verify the remote was added correctly
git remote -v

should show:
origin  https://github.com/moeyahya/your_repository.git (fetch)
origin  https://github.com/moeyahya/your_repository.git (push)

# Then connect your local repository to GitHub (if not connected)
git remote add origin https://github.com/moeyahya/fraud-detection-ml-api-aws-cicd.git

# Push your code
git push -u origin main

#### Step 5: Set Up GitHub Actions for CI/CD
mkdir -p .github/workflows

In [None]:
Create a CI Workflow File
Create a file named .github/workflows/ci.yml with the following content:

In [None]:
# CI workflow: installs the package and runs the test suite with coverage on
# every push and pull request targeting main.
name: Continuous Integration

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  test:
    runs-on: ubuntu-latest
    timeout-minutes: 10

    steps:
    - uses: actions/checkout@v4

    - name: Set up Python 3.9
      uses: actions/setup-python@v4
      with:
        python-version: '3.9'
        cache: 'pip'

    # No trained model is available in CI; only the folder is created.
    - name: Create dummy model directory
      run: |
        mkdir -p models
        echo "Dummy model directory for CI tests" > models/README.md

    # "pip install -e ." installs the project through setup.py.
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -e .
        pip install pytest pytest-cov

    - name: Run tests
      run: |
        pytest tests/ \
          --cov=src \
          --cov-report=xml \
          --cov-report=term-missing \
          -v
      env:
        PYTHONPATH: ${{ github.workspace }}
        # Deliberately points at a file that does not exist.
        # NOTE(review): presumably read by src/config.py so the app starts in test mode — confirm.
        MODEL_PATH: "./models/nonexistent.joblib"

    # Uploads the coverage.xml produced by --cov-report=xml above.
    - name: Upload coverage
      uses: codecov/codecov-action@v3
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        files: coverage.xml
        flags: unittests

In [None]:
Create a CD Workflow File
Create a file named .github/workflows/cd.yml with the following content:

In [None]:
# CD workflow: on pushes to main that touch the app, its image or its
# manifests, build and push the Docker image, then apply the Kubernetes
# manifests and wait for the rollout to finish.
name: Continuous Deployment

on:
  push:
    branches: [ "main" ]
    paths:
      - 'src/**'
      - 'Dockerfile'
      - 'requirements.txt'
      - 'deployments/**'

jobs:
  build-and-deploy:
    runs-on: ubuntu-latest
    environment: production

    steps:
    - uses: actions/checkout@v3

    - name: Log in to Docker Hub
      uses: docker/login-action@v2
      with:
        username: ${{ secrets.DOCKER_HUB_USERNAME }}
        password: ${{ secrets.DOCKER_HUB_TOKEN }}

    # NOTE(review): deployment.yaml references "fraud-detection-api:latest",
    # not this pushed "moeyahya/..." image — confirm which image the cluster
    # is meant to run.
    - name: Build and push Docker image
      run: |
        docker build -t moeyahya/fraud-detection-api:latest .
        docker push moeyahya/fraud-detection-api:latest

    - name: Install kubectl
      uses: azure/setup-kubectl@v3

    - name: Deploy to Kubernetes
      run: |
        echo "${{ secrets.KUBE_CONFIG }}" > kubeconfig.yaml
        export KUBECONFIG=kubeconfig.yaml

        kubectl apply -f deployments/deployment.yaml
        kubectl apply -f deployments/service.yaml

        # The Deployment is named "fraud-detection-api" in deployment.yaml;
        # rollout status must use that exact name or the step fails with NotFound.
        kubectl rollout status deployment/fraud-detection-api
        kubectl get services

#### Step 6: Set Up GitHub Secrets
For the CD pipeline to work, you need to set up secrets in your GitHub repository:

1- Go to your GitHub repository

2- Click on "Settings" > "Secrets and variables" > "Actions"

3- Click "New repository secret"

Add these secrets:

-DOCKER_HUB_USERNAME: Your Docker Hub username

-DOCKER_HUB_TOKEN: Your Docker Hub access token (create in Docker Hub account settings)

-KUBE_CONFIG: Your Kubernetes config file content (get this from ~/.kube/config on your local machine)

##### 🚀 Your CI/CD pipeline should now authenticate with Docker Hub and Kubernetes automatically!

In [None]:
# Add __init__.py Files
Create these empty files to make Python treat directories as packages:

powershell
# In your project root:
New-Item -ItemType File src/__init__.py
New-Item -ItemType File tests/__init__.py

#### Ensure your project looks like this:

In [None]:
fraud-detection-ml-api-aws-cicd/
├── .github/
│   └── workflows/
│       ├── ci.yml
│       └── cd.yml
├── src/
│   ├── __init__.py
│   ├── app.py
│   └── ... (other source files)
├── tests/
│   ├── __init__.py
│   └── test_app.py
├── setup.py
└── requirements.txt

#### Step 7: Add Tests (Optional but Recommended)
test_app.py

In [None]:
import unittest
from src.app import app

class TestAPI(unittest.TestCase):
    """Smoke tests for the Flask app's /health and /predict endpoints."""

    def setUp(self):
        # Fresh test client per test, with Flask's TESTING flag switched on.
        app.config.update(TESTING=True)
        self.client = app.test_client()

    def test_health_check(self):
        """/health answers 200, reports 'healthy' and exposes model_loaded."""
        reply = self.client.get('/health')
        self.assertEqual(reply.status_code, 200)
        body = reply.json
        self.assertEqual(body['status'], 'healthy')
        self.assertIn('model_loaded', body)

    def test_predict_endpoint(self):
        """/predict accepts a minimal transaction and reports test mode."""
        transaction = dict(
            amount=100,
            oldbalanceOrg=1000,
            newbalanceOrig=900,
            oldbalanceDest=500,
            newbalanceDest=600,
            step=1,
            isFlaggedFraud=0,
            type="TRANSFER",
        )
        reply = self.client.post('/predict', json=transaction)
        self.assertEqual(reply.status_code, 200)
        body = reply.json
        self.assertIn('fraud_prediction', body)
        # This test expects the app to be running without a real model.
        self.assertEqual(body['model_info']['test_mode'], True)

if __name__ == '__main__':
    unittest.main()

#### Create setup.py (Recommended)
Create this file in your project root:

In [None]:
from setuptools import setup, find_packages

# Runtime dependencies, taken from the project's requirements file.
RUNTIME_REQUIREMENTS = [
    'pandas>=1.5.0',
    'scikit-learn>=1.2.0',
    'Flask>=2.0.0',
    'joblib>=1.0.0',
    'imbalanced-learn>=0.10.0',
    'scipy>=1.7.0',
    'numpy>=1.21.0',
    'waitress>=2.1.0',
]

# Package every module under src* so "pip install -e ." makes `src` importable.
setup(
    name="fraud-detection",
    version="0.1",
    packages=find_packages(include=['src*']),
    install_requires=RUNTIME_REQUIREMENTS,
)

In [None]:
Correct any imports in your files accordingly 
for example
# In src/app.py:
from src.predict import FraudPredictor

# In src/predict.py:
from src.app import some_helper_function

##### for example src/predict.py and src/app.py need to be modified with correct source imports

In [None]:
#predict.py
from joblib import load
import pandas as pd
from src.config import MODEL_PATH, AppConfig, AMOUNT_PERCENTILE, BALANCE_PERCENTILE 
import json
from datetime import datetime
import logging
import os

class FraudPredictor:
    """Loads the persisted fraud model and scores single transactions.

    In ``test_mode`` no model artifacts are loaded and ``predict`` returns a
    fixed dummy label, so the API can be exercised without a model file.
    """

    def __init__(self, test_mode=False):
        """Set up logging and, unless ``test_mode`` is true, load the model.

        Args:
            test_mode: when true, skip loading artifacts from ``MODEL_PATH``.
        """
        self.test_mode = test_mode
        self.model = None
        self.pt = None
        self.feature_order = []
        self._init_logging()
        if not test_mode:
            self._load_model()

    def _load_model(self):
        """Load the model and transformer from ``MODEL_PATH``.

        Also sets ``self.feature_order`` to the column order passed to the
        transformer in ``preprocess``.

        Raises:
            RuntimeError: if loading fails outside test mode; the original
                error is attached as the cause.
        """
        try:
            if not os.path.exists(MODEL_PATH):
                raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")

            artifacts = load(MODEL_PATH)
            self.model = artifacts['model']
            self.pt = artifacts['transformer']

            self.feature_order = [
                'step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
                'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud',
                'amount_to_balance', 'high_amount_flag', 'balance_change_abs',
                'suspicious_withdrawal', 'hour_of_day', 'day_of_week', 'is_weekend',
                'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER'
            ]
            print("✅ Model loaded with features:", self.feature_order)
        except Exception as e:
            print(f"⚠️ Model loading failed: {str(e)}")
            if not self.test_mode:
                raise RuntimeError(f"Model loading failed: {str(e)}") from e

    def _init_logging(self):
        """Set up prediction logging to ``AppConfig.PREDICTION_LOGS``."""
        logging.basicConfig(
            filename=AppConfig.PREDICTION_LOGS,
            format='%(asctime)s - %(message)s',
            level=logging.INFO
        )
        self.logger = logging.getLogger(__name__)

    def _validate_input(self, data: dict) -> None:
        """Ensure the minimum required fields exist.

        Raises:
            ValueError: listing every required key missing from ``data``.
        """
        required_fields = {
            'amount', 'oldbalanceOrg', 'newbalanceOrig',
            'oldbalanceDest', 'newbalanceDest', 'step',
            'isFlaggedFraud', 'type'
        }
        missing = required_fields - set(data.keys())
        if missing:
            raise ValueError(f"Missing required fields: {missing}")

    def log_prediction(self, data: dict, prediction: int):
        """Write one JSON log line with the input (minus 'type') and the result."""
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "input": {k: v for k, v in data.items() if k != 'type'},
            "prediction": prediction,
            "model_version": "1.0.0"
        }
        self.logger.info(json.dumps(log_entry))

    def preprocess(self, transaction_data: dict):
        """Recreate features EXACTLY as during training.

        Returns:
            ``self.pt.transform`` applied to the one-row frame restricted to
            ``self.feature_order``.

        Raises:
            ValueError: if a feature in ``self.feature_order`` is missing
                after feature engineering.
        """
        df = pd.DataFrame([transaction_data])

        # Feature engineering
        df['amount_to_balance'] = df['amount'] / (df['oldbalanceOrg'] + 1)
        df['high_amount_flag'] = (df['amount'] > 10000).astype(int)
        df['balance_change_abs'] = df['oldbalanceOrg'] - df['newbalanceOrig']
        df['suspicious_withdrawal'] = (
            (df['balance_change_abs'] > 5000) &
            (df['amount_to_balance'] > 0.5)
        ).astype(int)

        # Time features
        df['hour_of_day'] = ((df['step'] - 1) % 24) + 1
        df['day_of_week'] = ((df['step'] - 1) // 24) % 7
        df['is_weekend'] = ((df['day_of_week'] == 5) | (df['day_of_week'] == 6)).astype(int)

        # Transaction type handling: unknown types leave every indicator at 0
        valid_types = ['CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER']
        for t in valid_types:
            df[f'type_{t}'] = 0
        if 'type' in df and df['type'].iloc[0] in valid_types:
            df[f'type_{df["type"].iloc[0]}'] = 1

        # Verify feature match
        missing = set(self.feature_order) - set(df.columns)
        if missing:
            raise ValueError(f"Missing features after processing: {missing}")

        return self.pt.transform(df[self.feature_order])

    def predict(self, transaction_data: dict) -> int:
        """Validate, featurize and classify a single transaction.

        Returns:
            The predicted label as ``int``; always ``0`` in test mode.

        Raises:
            ValueError: raised by ``_validate_input`` for bad input. It is
                propagated unchanged so the API layer's ``except ValueError``
                branch can answer 400 instead of 500.
            RuntimeError: any other failure, chained to the original error.
        """
        if self.test_mode:
            return 0  # Dummy prediction in test mode

        validated = False
        try:
            self._validate_input(transaction_data)
            validated = True
            processed = self.preprocess(transaction_data)
            prediction = int(self.model.predict(processed)[0])
            self.log_prediction(transaction_data, prediction)
            return prediction
        except Exception as e:
            self.logger.error(f"Prediction failed: {str(e)}")
            # Only validation errors are the client's fault; keep their type.
            if isinstance(e, ValueError) and not validated:
                raise
            raise RuntimeError(f"Prediction failed: {str(e)}") from e

In [None]:
# src/app.py
from flask import Flask, request, jsonify
from src.predict import FraudPredictor
from datetime import datetime
from src.config import AppConfig
import os

app = Flask(__name__)

# Initialize predictor; fall back to test mode when the model file that
# FraudPredictor would load (MODEL_PATH) does not exist. Checking MODEL_PATH
# instead of a hard-coded relative path keeps this decision consistent with
# the predictor regardless of the current working directory.
from src.config import MODEL_PATH  # same path src/predict.py loads from

try:
    predictor = FraudPredictor(test_mode=not os.path.exists(MODEL_PATH))
    print("ℹ️ Predictor initialized in test mode" if predictor.test_mode else "✅ Predictor initialized with model")
except Exception as e:
    print(f"❌ Failed to initialize predictor: {str(e)}")
    raise

@app.route('/predict', methods=['POST'])
def predict():
    """Score one transaction posted as a JSON object.

    Responses:
        200: prediction plus request metadata and model info, including
            whether the predictor is running in test mode.
        400: missing or unparseable body, a JSON body that is not an object,
            or a ``ValueError`` raised while predicting.
        500: any other failure.
    """
    try:
        # silent=True makes Flask return None for a malformed or non-JSON
        # body instead of raising, so the client gets the 400 below rather
        # than falling into the generic 500 handler.
        data = request.get_json(silent=True)

        if not data:
            return jsonify({"error": "No JSON provided"}), 400

        # The predictor reads fields by key, so only JSON objects are valid.
        if not isinstance(data, dict):
            return jsonify({"error": "JSON body must be an object", "status": "input_error"}), 400

        request_meta = {
            "timestamp": datetime.now().isoformat(),
            "endpoint": "predict",
            "client_ip": request.remote_addr
        }

        prediction = predictor.predict(data)

        return jsonify({
            "fraud_prediction": prediction,
            "meta": request_meta,
            "model_info": {
                "version": "1.0.0",
                "type": "RandomForest",
                "test_mode": predictor.test_mode
            },
            "status": "success"
        })

    except ValueError as e:
        return jsonify({"error": str(e), "status": "input_error"}), 400
    except Exception as e:
        return jsonify({"error": str(e), "status": "server_error"}), 500

@app.route('/health', methods=['GET'])
def health():
    """Return a health payload; model_loaded is false while in test mode."""
    payload = {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "model_loaded": not predictor.test_mode
    }
    return jsonify(payload)

# Start the Flask server only when this file is executed directly;
# host, port and debug flag are read from AppConfig.
if __name__ == '__main__':
    app.run(
        host=AppConfig.HOST,
        port=AppConfig.PORT,
        debug=AppConfig.DEBUG,
    )

#### Step 8: Commit and Push Changes

In [None]:
git add .
git commit -m "Add CI/CD workflows and basic tests"
git push origin main

#### Step 9: Monitor the Workflows
1- Go to your GitHub repository

2- Click on the "Actions" tab

3- You should see your workflows running

4- Click on each workflow to see detailed logs

#### Step 10: Verify Deployment
Once the CD pipeline completes:

In [None]:
1- Check your Kubernetes cluster:

powershell
kubectl get pods
kubectl get services

PS C:\Projects\fraud-detection-cicd> kubectl get pods
NAME                                   READY   STATUS    RESTARTS   AGE
fraud-detection-api-7875d449bb-xl6tz   1/1     Running   0          2d3h
test-api-5f8664f4cb-9nqd7              1/1     Running   0          2d6h

PS C:\Projects\fraud-detection-cicd> kubectl get services
NAME                      TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)        AGE
fraud-detection-service   NodePort    10.102.207.162   <none>        80:30080/TCP   3d
kubernetes                ClusterIP   10.96.0.1        <none>        443/TCP        3d

In [None]:
2- Find the external IP (if using LoadBalancer) or port-forward to access your service:

powershell
kubectl port-forward service/fraud-detection-service 5000:80
# Forwards local port 5000 to the service port 80 shown in the `kubectl get services` output above

In [None]:
3- Test the API endpoint:

powershell
curl http://localhost:5000/health

CI Pipeline:

✅ Runs on every push

✅ Creates sample data

✅ Runs tests

✅ Uploads coverage

CD Pipeline:

✅ Builds Docker image only when relevant paths change (src/, Dockerfile, etc.)

✅ Pushes to Docker Hub

✅ Conditionally deploys to Kubernetes (if KUBE_CONFIG exists)

#### Step 4: Access the Service

#### 2. On-Prem Kubernetes Deployment (Staging/Prod)

#### Push Image to Private Registry 

In [None]:
Authenticate with AWS ECR
aws ecr get-login-password --region ca-central-1 | docker login --username AWS --password-stdin YOUR_ID_HERE.dkr.ecr.ca-central-1.amazonaws.com

In [None]:
Tag Image
docker tag fraud-detection-api:latest YOUR_ID_HERE.dkr.ecr.ca-central-1.amazonaws.com/fraud-detection-api:latest

In [None]:
Push Image
docker push YOUR_ID_HERE.dkr.ecr.ca-central-1.amazonaws.com/fraud-detection-api:latest

In [None]:
Verify the push was successful
aws ecr list-images --repository-name fraud-detection-api --region ca-central-1

## Detailed Plan: Deploying Fraud Detection Model on AWS with CI/CD Pipelines

### 📌 Phase 1: AWS Free Tier Setup

### Step 1: Create an AWS Account & Set Up CLI
##### 1- Sign up for the AWS Free Tier (aws.amazon.com/free). Make sure to save your Access Key ID and Secret Access Key, as they will be needed when configuring the AWS CLI.

##### * Avoid services that aren’t free (check pricing before deploying).

##### 2- Install AWS CLI (AWS CLI Install Guide)

##### * Verify installation:
##### aws --version
##### 3- Configure AWS CLI
##### aws configure
##### Enter your Access Key ID, Secret Access Key, Default Region (us-east-1), and Output Format (json).


### Step 2: Set Up IAM (Identity & Access Management)

#### 1- Go to IAM Console → Users → Add User

##### Name: fraud-detection-deployer

#### Under Permissions, use the search bar to find and attach the following policies:

##### AmazonEC2ContainerRegistryFullAccess

##### AmazonECS_FullAccess (if using ECS)

##### AWSCodePipeline_FullAccess

##### AWSCodeBuildAdminAccess

##### AWSCloudFormationFullAccess (optional, for Infrastructure as Code)
#### 2- Generate Access Keys (for CI/CD later)

### 📌 Phase 2: Docker & AWS ECR (Elastic Container Registry)

#### Step 1: Push Docker Image to AWS ECR

In [None]:
1- Login to AWS ECR
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin YOUR_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com

To get your AWS account ID and other identity info 
aws sts get-caller-identity


In [None]:
2- Create a Repository
aws ecr create-repository --repository-name fraud-detection --region us-east-1
Note: replace us-east-1 with whatever region you are working from.

In [None]:
3- Tag & Push Docker Image
docker tag fraud-detection:latest YOUR_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com/fraud-detection:latest
docker push YOUR_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com/fraud-detection:latest

In the AWS Console, open ECR → Repositories → fraud-detection and check that the image tagged latest is listed,
or you can verify from the CLI:
aws ecr describe-images --repository-name fraud-detection --region us-east-1


In [None]:
4- Push to AWS ECR:
docker push YOUR_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com/fraud-detection:latest

### 📌 Phase 3: Deploying the Model

#### Option A: Deploying with AWS ECS (Elastic Container Service)
ECS is a container orchestration service that runs Docker containers. We’ll use Fargate (serverless) to avoid managing servers.

#### Step 1: Create a Task Definition (JSON Config for Container)
A Task Definition tells ECS how to run your Docker container.

#### 1- Go to AWS ECS Console:

 - Open AWS ECS Console

 - Click Task Definitions → Create new Task Definition

#### 2- Select Launch Type:

 - Choose Fargate (serverless) → Click Next Step

#### 3- Configure Task Definition:

 - Task Definition Name: fraud-detection-task

 - Task Role: None (for now, unless you need AWS permissions)

 - Network Mode: awsvpc (required for Fargate)

 - Task Execution Role:

If none exists, click Create new role (AWS will auto-generate one).

#### 4- Set Task Size (Free Tier Limits):

 - CPU: 0.25 vCPU

 - Memory: 0.5 GB

#### 5- Add Container:

 - Click Add Container

 - Container Name: fraud-detection-container

 - Image: Paste your ECR image URI (e.g., YOUR_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com/fraud-detection:latest)

Port Mappings:

 - If your app runs on port 5000 (Flask default), add:

   Container Port: 5000

   Protocol: TCP

#### Click Add → Create

### Step 2: Create an ECS Cluster (Fargate, Serverless)
A Cluster is a logical group of tasks/services.

Go to ECS Console → Clusters → Create Cluster

Select Template:

Choose "Networking only (Fargate)" → Click Next Step

Configure Cluster:

Cluster Name: fraud-detection-cluster

Leave VPC & Subnets as default (AWS picks for you)

Click Create

#### Step 3: Create an ECS Service (Runs Your Task)
A Service ensures your task keeps running (it automatically restarts the task if it crashes).

Inside your Cluster → Click Create Service

Configure Service:

Launch Type: FARGATE

Task Definition: Select fraud-detection-task (latest revision)

Cluster: fraud-detection-cluster

Service Name: fraud-detection-service

Number of Tasks: 1 (Free Tier allows only 1)

Networking:

VPC & Subnets: Default is fine

Security Group:

Create a new one (e.g., fraud-detection-sg)

Add rule: Allow TCP Port 5000 (or your app’s port)

Load Balancer (Optional):

If you want a single stable endpoint for the API (instead of the task's changing public IP), attach an Application Load Balancer (ALB)

Click Create Service

### Option B: Deploying with AWS Lambda (Serverless)
Lambda runs code without managing servers. Good for APIs, but has size limits.

#### Step 1: Package Model for Lambda
Lambda has a 250 MB limit on the unzipped deployment package (and 50 MB for a zip uploaded directly). If your model is large:

- Use Lambda Layers (stores dependencies separately).

- Or trim unnecessary files (e.g., remove unused libraries).

1- Zip Your Lambda Code:

- Your Python script (e.g., lambda_function.py)

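- requirements.txt (if using extra libraries). Note: Lambda does not install packages from this file; install them into the folder before zipping (`pip install -r requirements.txt -t .`) or provide them through a Lambda Layer.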

Example structure:

fraud-detection-lambda/  
├── lambda_function.py  
├── requirements.txt  

Zip it:

cmd
cd fraud-detection-lambda
zip -r lambda_package.zip .

2- Upload to S3 (Optional, if package is large):

Go to AWS S3 Console

Create a bucket → Upload lambda_package.zip

#### Step 2: Create Lambda Function
1- Go to AWS Lambda Console → Create Function

2- Configure Function:

Function Name: fraud-detection-lambda

Runtime: Python 3.9

Architecture: x86_64

Permissions:

Create a new role with basic Lambda permissions.

3- Upload Code:

Upload .zip file (or paste code inline if small).

4- Set Memory & Timeout:

Memory: 512MB

Timeout: 1 min

#### Step 3: Set Up API Gateway (REST API for /predict)
1- Go to API Gateway Console → Create API → REST API

2- Configure API:

- API Name: fraud-detection-api

- Endpoint Type: Regional

3- Create Resource & Method:

- Click Actions → Create Resource → Name: predict

- Click Actions → Create Method → POST

4- Integrate with Lambda:

Select your fraud-detection-lambda function.

5- Deploy API:

Click Actions → Deploy API

Stage Name: prod

✅ API is live! Test it at the provided URL (e.g., https://xxxx.execute-api.us-east-1.amazonaws.com/prod/predict).