In [133]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
import numpy as np


In [156]:
df=pd.read_csv('data.csv')

In [158]:
df=df.drop_duplicates()
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])
# Create new columns
df['Transaction Hour'] = df['TransactionStartTime'].dt.hour
df['Transaction Day'] = df['TransactionStartTime'].dt.day
df['Transaction Month'] = df['TransactionStartTime'].dt.month
df['Transaction Year'] = df['TransactionStartTime'].dt.year

In [159]:
df['CountryCode'] = df['CountryCode'].astype(str)  # Ensure CountryCode is treated as a string

In [None]:
categorical_df = df.select_dtypes(include=['object', 'category'])
categorical_df

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId'],
      dtype='object')

In [161]:
numerical_df = df.select_dtypes(include=['number'])
numerical_df.columns

Index(['Amount', 'Value', 'PricingStrategy', 'FraudResult', 'Transaction Hour',
       'Transaction Day', 'Transaction Month', 'Transaction Year'],
      dtype='object')

In [162]:
# Define the columns for numerical and categorical features
numerical_features = ['Amount', 'Value', 'PricingStrategy', 'FraudResult', 'Transaction Hour','Transaction Day', 'Transaction Month', 'Transaction Year']
categorical_features = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId','CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId','ProductCategory', 'ChannelId']

# Create the preprocessing pipeline for numerical features
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', MinMaxScaler())                    # Normalize
])

# Create the preprocessing pipeline for categorical features
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))  # Impute missing values with mode
])

# Combine both pipelines into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Fit and transform the data
df_transformed = preprocessor.fit_transform(df)

# Convert the transformed data back to a DataFrame
df_transformed = pd.DataFrame(df_transformed, columns=numerical_features + categorical_features)

In [163]:
df_transformed

Unnamed: 0,Amount,Value,PricingStrategy,FraudResult,Transaction Hour,Transaction Day,Transaction Month,Transaction Year,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId
0,0.092004,0.000101,0.5,0.0,0.086957,0.466667,0.909091,0.0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3
1,0.09191,0.000002,0.5,0.0,0.086957,0.466667,0.909091,0.0,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2
2,0.091958,0.00005,0.5,0.0,0.086957,0.466667,0.909091,0.0,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3
3,0.09375,0.002206,0.5,0.0,0.130435,0.466667,0.909091,0.0,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3
4,0.091853,0.000065,0.5,0.0,0.130435,0.466667,0.909091,0.0,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,0.09182,0.000101,0.5,0.0,0.391304,0.4,0.090909,1.0,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2
95658,0.092004,0.000101,0.5,0.0,0.391304,0.4,0.090909,1.0,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3
95659,0.09191,0.000002,0.5,0.0,0.391304,0.4,0.090909,1.0,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2
95660,0.092188,0.000303,0.5,0.0,0.434783,0.4,0.090909,1.0,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,ChannelId_3


In [164]:
le = LabelEncoder()
# Fit and transform the specified columns
columns_to_encode = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId','CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId','ProductCategory', 'ChannelId']
# Apply label encoding to each column individually
for column in columns_to_encode:
    df_transformed[column] = le.fit_transform(df_transformed[column])

In [165]:
# Normalization
min_max_scaler = MinMaxScaler()
df_normalized = pd.DataFrame(min_max_scaler.fit_transform(df_transformed), columns=df_transformed.columns)

In [167]:
df_normalized.to_csv('Final_preprocessed_data')

In [None]:
import pandas as pd
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

# --- 1. Generate Sample Transaction Data ---
# In a real scenario, you would load your transaction data here.
df_transactions = pd.read_csv('Final_preprocessed_data.csv')

np.random.seed(42) # for reproducibility of synthetic data

num_customers = 500
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 12, 31)

data = []
for customer_id in range(1, num_customers + 1):
    # Simulate varying transaction frequencies and amounts
    num_transactions = np.random.randint(1, 30) # Customers can have 1 to 30 transactions
    for _ in range(num_transactions):
        random_days = np.random.randint(0, (end_date - start_date).days)
        transaction_date = start_date + timedelta(days=random_days)
        amount = np.random.uniform(10, 1000) # Transaction amounts between 10 and 1000
        data.append({'CustomerId': customer_id, 'TransactionDate': transaction_date, 'Amount': amount})

df_transactions = pd.DataFrame(data)

print("--- Sample Transaction Data ---")
print(df_transactions.head())
print(f"\nTotal transactions: {len(df_transactions)}")
print(f"Unique customers: {df_transactions['CustomerId'].nunique()}")

# --- 2. Calculate RFM Metrics ---

# Define a snapshot date: The day after the latest transaction in the dataset
snapshot_date = df_transactions['TransactionDate'].max() + timedelta(days=1)
print(f"\nSnapshot Date for Recency calculation: {snapshot_date}")

# Calculate RFM for each customer
rfm_df = df_transactions.groupby('CustomerId').agg(
    Recency=('TransactionDate', lambda date: (snapshot_date - date.max()).days),
    Frequency=('TransactionDate', 'count'),
    Monetary=('Amount', 'sum')
).reset_index()

print("\n--- Calculated RFM Metrics (Raw) ---")
print(rfm_df.head())
print(f"\nRFM DataFrame shape: {rfm_df.shape}")


# Initialize StandardScaler
scaler = StandardScaler()

# Select RFM features for scaling
rfm_features = rfm_df[['Recency', 'Frequency', 'Monetary']]

# Scale the features
scaled_rfm_features = scaler.fit_transform(rfm_features)
scaled_rfm_df = pd.DataFrame(scaled_rfm_features, columns=rfm_features.columns, index=rfm_df.index)

print("\n--- Scaled RFM Features ---")
print(scaled_rfm_df.head())

# --- 4. Cluster Customers using K-Means ---

# Set random_state for reproducibility
random_state = 42
kmeans = KMeans(n_clusters=3, random_state=random_state, n_init=10) # n_init for robust centroid initialization

# Fit K-Means to the scaled data
kmeans.fit(scaled_rfm_df)

# Add cluster labels to the RFM DataFrame
rfm_df['Cluster'] = kmeans.labels_

print("\n--- RFM Data with Cluster Labels ---")
print(rfm_df.head())
print(f"\nCluster distribution:\n{rfm_df['Cluster'].value_counts()}")

cluster_analysis = rfm_df.groupby('Cluster').agg(
    AvgRecency=('Recency', 'mean'),
    AvgFrequency=('Frequency', 'mean'),
    AvgMonetary=('Monetary', 'mean'),
    Count=('CustomerId', 'count')
).sort_values(by=['AvgRecency', 'AvgFrequency', 'AvgMonetary'], ascending=[False, True, True]) # Sort to find high-risk

print("\n--- Cluster Analysis (Mean RFM Values) ---")
print(cluster_analysis)
high_risk_cluster_id = cluster_analysis.index[0]
print(f"\nIdentified High-Risk Cluster ID: {high_risk_cluster_id}")

# Create the new binary target column 'is_high_risk'
rfm_df['is_high_risk'] = rfm_df['Cluster'].apply(lambda x: 1 if x == high_risk_cluster_id else 0)

print("\n--- RFM Data with 'is_high_risk' Label ---")
print(rfm_df.head())
print(f"\nHigh-risk customer count: {rfm_df['is_high_risk'].sum()}")

# --- 6. Integrate the Target Variable ---

main_data = []
for customer_id in range(1, num_customers + 1):
    age = np.random.randint(20, 70)
    income = np.random.uniform(30000, 100000)
    main_data.append({'CustomerId': customer_id, 'Age': age, 'Income': income})

df_main = pd.DataFrame(main_data)

print("\n--- Sample Main Processed Dataset (Before Merge) ---")
print(df_main.head())
print(f"\nMain dataset shape: {df_main.shape}")

# Merge the 'is_high_risk' column back into the main processed dataset
# We only need CustomerId and is_high_risk from rfm_df
df_main = pd.merge(df_main, rfm_df[['CustomerId', 'is_high_risk']], on='CustomerId', how='left')

print("\n--- Main Processed Dataset (After Merge with 'is_high_risk') ---")
print(df_main.head())
print(f"\nMain dataset shape after merge: {df_main.shape}")

# Verify that all customers from the main dataset have an 'is_high_risk' label
print(f"\nMissing 'is_high_risk' values: {df_main['is_high_risk'].isnull().sum()}")

# Final check: distribution of the new target variable
print(f"\nDistribution of 'is_high_risk':\n{df_main['is_high_risk'].value_counts()}")


--- Sample Transaction Data ---
   CustomerId TransactionDate      Amount
0           1      2024-03-11  951.207163
1           1      2023-04-17  781.894090
2           1      2023-01-21  164.458454
3           1      2024-04-11  108.975167
4           1      2024-04-03  867.514384

Total transactions: 7537
Unique customers: 500

Snapshot Date for Recency calculation: 2024-12-31 00:00:00

--- Calculated RFM Metrics (Raw) ---
   CustomerId  Recency  Frequency      Monetary
0           1      264          7   3055.866058
1           2      345          2   1024.124932
2           3       84         21   9982.865502
3           4        4         26  13820.978961
4           5       32         14   7128.400071

RFM DataFrame shape: (500, 4)

--- Scaled RFM Features ---
    Recency  Frequency  Monetary
0  1.920143  -0.982761 -1.058853
1  2.724721  -1.591358 -1.533300
2  0.132189   0.721308  0.558724
3 -0.662457   1.329905  1.454991
4 -0.384330  -0.130726 -0.107844

--- RFM Data with Clust

In [19]:
df_main.to_csv('Final_processed_data.csv', index=False)

In [16]:
df_main

Unnamed: 0,CustomerId,Age,Income,is_high_risk
0,1,21,41488.010772,1
1,2,59,98417.226025,1
2,3,48,62316.142653,0
3,4,23,60005.348855,0
4,5,39,44060.951498,0
...,...,...,...,...
495,496,37,38601.290593,0
496,497,63,36839.287258,0
497,498,69,73301.787305,0
498,499,48,79133.763135,0


In [23]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
import pandas as pd
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import mlflow
import mlflow.sklearn

X = df_main.drop(columns=['is_high_risk'])
y = df_main['is_high_risk']

    # Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining data shape: {X_train.shape}, Testing data shape: {X_test.shape}")
print(f"Distribution of target in training set:\n{y_train.value_counts(normalize=True)}")
print(f"Distribution of target in testing set:\n{y_test.value_counts(normalize=True)}")


# Function to train, evaluate, and log model with MLflow
def train_evaluate_log_model(model, model_name, X_train, y_train, X_test, y_test, params=None):
    with mlflow.start_run(run_name=model_name) as run:
        # Log parameters
        mlflow.log_param("model_name", model_name)
        if params:
            mlflow.log_params(params)
        else:
            mlflow.log_params(model.get_params())

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1] # Probability for the positive class

        # Evaluate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        roc_auc = roc_auc_score(y_test, y_proba)

        # Log metrics
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("roc_auc", roc_auc)

        print(f"\n--- {model_name} Metrics ---")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"ROC AUC: {roc_auc:.4f}")

        # Log the model
        mlflow.sklearn.log_model(model, "model")
        print(f"Model '{model_name}' logged to MLflow.")
        return roc_auc, run.info.run_id # Return ROC_AUC and run_id for best model selection


# --- Choose and Train Models ---

best_roc_auc = -1
best_model_name = ""
best_run_id = ""

# Model 1: Logistic Regression (with Hyperparameter Tuning)
print("\n--- Training Logistic Regression with GridSearchCV ---")
log_reg = LogisticRegression(random_state=random_state, solver='liblinear')
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
grid_search_lr = GridSearchCV(log_reg, param_grid_lr, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search_lr.fit(X_train, y_train)

best_lr_model = grid_search_lr.best_estimator_
print(f"Best Logistic Regression parameters: {grid_search_lr.best_params_}")
current_roc_auc, current_run_id = train_evaluate_log_model(best_lr_model, "Logistic Regression (Tuned)", X_train, y_train, X_test, y_test, params=grid_search_lr.best_params_)

if current_roc_auc > best_roc_auc:
    best_roc_auc = current_roc_auc
    best_model_name = "Logistic Regression (Tuned)"
    best_run_id = current_run_id


# Model 2: Random Forest Classifier
print("\n--- Training Random Forest Classifier ---")
rf_model = RandomForestClassifier(random_state=random_state, n_estimators=100)
current_roc_auc, current_run_id = train_evaluate_log_model(rf_model, "Random Forest", X_train, y_train, X_test, y_test)

if current_roc_auc > best_roc_auc:
    best_roc_auc = current_roc_auc
    best_model_name = "Random Forest"
    best_run_id = current_run_id

print(f"\n--- Best Model Identified ---")
print(f"Best Model: {best_model_name}")
print(f"Best ROC AUC: {best_roc_auc:.4f}")
print(f"Best Run ID: {best_run_id}")

# --- Register the Best Model in MLflow Model Registry ---
if best_run_id:
    print(f"\nRegistering '{best_model_name}' (Run ID: {best_run_id}) to MLflow Model Registry...")
    model_uri = f"runs:/{best_run_id}/model"
    registered_model = mlflow.register_model(model_uri=model_uri, name="CreditRiskHighRiskModel")
    print(f"Model registered as: {registered_model.name} (Version: {registered_model.version})")
else:
    print("\nNo best model identified or run ID available for registration.")

print("\n--- Model Training, Tracking, and Registration Complete ---")
print("You can view the MLflow runs and registered models by starting the MLflow UI in your environment.")




Training data shape: (400, 3), Testing data shape: (100, 3)
Distribution of target in training set:
is_high_risk
0    0.9175
1    0.0825
Name: proportion, dtype: float64
Distribution of target in testing set:
is_high_risk
0    0.87
1    0.13
Name: proportion, dtype: float64

--- Training Logistic Regression with GridSearchCV ---
Best Logistic Regression parameters: {'C': 0.1, 'penalty': 'l1'}

--- Logistic Regression (Tuned) Metrics ---
Accuracy: 0.8700
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000
ROC AUC: 0.5986




Model 'Logistic Regression (Tuned)' logged to MLflow.

--- Training Random Forest Classifier ---

--- Random Forest Metrics ---
Accuracy: 0.8600
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000
ROC AUC: 0.5588


Registered model 'CreditRiskHighRiskModel' already exists. Creating a new version of this model...


Model 'Random Forest' logged to MLflow.

--- Best Model Identified ---
Best Model: Logistic Regression (Tuned)
Best ROC AUC: 0.5986
Best Run ID: 28a585f9befa4cca96ec267de74595f1

Registering 'Logistic Regression (Tuned)' (Run ID: 28a585f9befa4cca96ec267de74595f1) to MLflow Model Registry...
Model registered as: CreditRiskHighRiskModel (Version: 2)

--- Model Training, Tracking, and Registration Complete ---
You can view the MLflow runs and registered models by starting the MLflow UI in your environment.


Created version '2' of model 'CreditRiskHighRiskModel'.
