In [53]:
!pip install aequitas



In [94]:
import lightgbm as lgbm  # Tested ML method
import numpy as np       # Random number generation
import seaborn as sns    # Plotting library
import pandas as pd      # Read/write data
import yaml              # Read hyperparameter space configuration

from aequitas.group import Group                # Fairness metrics
from matplotlib import pyplot as plt            # Plotting method
from sklearn.preprocessing import LabelEncoder  # Categorical encoding for LGBM models
from sklearn import metrics                     # ROC metrics

from random_search import RandomValueTrial, suggest_callable_hyperparams  # Random search wrapper methods


In [95]:
import json
import requests

In [96]:
# Read the hyperparameter space for the LGBM models; the expected structure is shown below.
# The encoding is given explicitly so the file is decoded the same way on every platform.
# NOTE(review): `yaml.safe_load` would be sufficient if the file only holds plain
# mappings, lists and scalars — confirm before switching loaders.
with open("lightgbm_hyperparameter_space.yaml", "r", encoding="utf-8") as file:
    hyperparam_space = yaml.load(file, Loader=yaml.FullLoader)

In [97]:
# The expected structure is the following (the bare expression on the last line
# makes the notebook display the loaded dictionary as the cell output):
hyperparam_space

{'LightGBM': {'classpath': 'lightgbm.LGBMClassifier',
  'kwargs': {'n_estimators': {'type': 'int',
    'range': [20, 10000],
    'log': True},
   'max_depth': {'type': 'int', 'range': [3, 30]},
   'learning_rate': {'type': 'float', 'range': [0.02, 0.1], 'log': True},
   'num_leaves': {'type': 'int', 'range': [10, 100], 'log': True},
   'boosting_type': ['gbdt', 'goss'],
   'min_data_in_leaf': {'type': 'int', 'range': [5, 200], 'log': True},
   'max_bin': {'type': 'int', 'range': [100, 500]},
   'enable_bundle': [True, False]}}}

In [98]:
# Testing a random search suggestion: draw one hyperparameter configuration
# from `hyperparam_space` using a trial object created with a fixed seed.
# The suggestion is the cell's last expression, so it is displayed as output.
trial = RandomValueTrial(seed=1)
suggest_callable_hyperparams(trial, hyperparam_space)

{'classpath': 'lightgbm.LGBMClassifier',
 'n_estimators': 263,
 'max_depth': 23,
 'learning_rate': 0.020003681922217444,
 'num_leaves': 19,
 'boosting_type': 'gbdt',
 'min_data_in_leaf': 9,
 'max_bin': 238,
 'enable_bundle': False}

In [99]:
# Define path to datasets. Replace `base_path` with the appropriate value.
base_path = "/content/Base.csv.zip"

# Dataset name -> archive path. The entry is derived from `base_path`, so
# editing the value above is enough to point the notebook at another file.
datasets_paths = {
    "dataset1": base_path,
}

In [100]:
# Load every zipped CSV listed in `datasets_paths` into a DataFrame,
# keeping the same keys as the path mapping.
datasets = {}
for dataset_name, csv_path in datasets_paths.items():
    datasets[dataset_name] = pd.read_csv(csv_path, compression='zip')

In [101]:
# Define the label field and categorical columns.
# `label` is the name of the target column.
label = "fraud_bool"

# Columns that are integer-encoded with a LabelEncoder before training.
categorical_features = [
    "payment_type",
    "employment_status",
    "housing_status",
    "source",
    "device_os",
]

In [102]:
# Create the train and test sets. Shuffle data with `sample` method.
# The split is done by month: rows with month < 6 form the train set,
# rows with month >= 6 form the test set.
# A fixed `random_state` makes the shuffle (and therefore everything trained
# on it) reproducible across kernel restarts.
train_dfs = {
    key: df[df["month"] < 6].sample(frac=1, replace=False, random_state=42)
    for key, df in datasets.items()
}
test_dfs = {
    key: df[df["month"] >= 6].sample(frac=1, replace=False, random_state=42)
    for key, df in datasets.items()
}

In [103]:
# Encode the categorical variables in the datasets to integers.
# This is expected by LGBM (or columns with the `categorical` data type).

# Every fitted encoder is kept, indexed as encoders[dataset_name][feature].
# Previously each encoder was overwritten by the next feature's encoder, so
# only the last one survived the loop and the category -> integer mapping of
# the other features could not be reused on new data.
encoders = {}

for name in datasets.keys():  # For each dataset in the suite
    train = train_dfs[name]
    test = test_dfs[name]
    encoders[name] = {}

    for feat in categorical_features:
        encoder = LabelEncoder()
        encoder.fit(train[feat])  # Fit an encoder to the train set.
        train[feat] = encoder.transform(train[feat])  # Transform train set.
        test[feat] = encoder.transform(test[feat])    # Transform test set.
        encoders[name][feat] = encoder  # Keep the fitted mapping for later use.

# NOTE: the frames are encoded in place, so re-running this cell would fit the
# encoders on already-encoded integers instead of the original categories.

In [104]:
def getULR(timeout=30):
    """Invoke the listing Lambda function and return its decoded JSON reply.

    Sends an HTTP POST without a body to the hard-coded Lambda function URL.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up. New,
            defaulted argument, so existing ``getULR()`` calls keep working.

    Returns:
        The parsed JSON body when the endpoint answers with status 200,
        otherwise ``None`` (the reason is printed).
    """
    lambda_url = "https://zhn5wtr2x7dftdop4btjvyt3ke0hrwmq.lambda-url.us-east-2.on.aws/"

    # Without a timeout `requests` can wait forever on an unresponsive endpoint.
    try:
        response = requests.post(lambda_url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure: report it the same way as a bad status code.
        print(f"Failed to invoke Lambda function. Error: {exc}")
        return None

    # Check the response status
    if response.status_code != 200:
        print(f"Failed to invoke Lambda function. Status code: {response.status_code}")
        print("Error:", response.text)
        return None

    print("Lambda function invoked successfully.")
    return response.json()





In [105]:
def getData(key, timeout=30):
    """Fetch the record of one customer from the lookup Lambda function.

    Sends an HTTP POST whose JSON body is ``{"customer_id": key}``.

    Args:
        key: Customer id to look up.
        timeout: Seconds to wait for the HTTP request before giving up. New,
            defaulted argument, so existing ``getData(key)`` calls keep working.

    Returns:
        The parsed JSON body when the endpoint answers with status 200,
        otherwise ``None`` (the reason is printed).
    """
    lambda_url = "https://hextfv6ffcaf3biodpkgjkvgs40cqcgg.lambda-url.us-east-2.on.aws/"

    payload = {
        "customer_id": key
    }

    # Send the HTTP POST request with the customer_id in the body.
    # Without a timeout `requests` can wait forever on an unresponsive endpoint.
    try:
        response = requests.post(lambda_url, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure: report it the same way as a bad status code.
        print(f"Failed to invoke Lambda function. Error: {exc}")
        return None

    # Check the response status
    if response.status_code != 200:
        print(f"Failed to invoke Lambda function. Status code: {response.status_code}")
        print("Error:", response.text)
        return None

    print("Lambda function invoked successfully.")
    body = response.json()  # Decode once and reuse for both the log line and the result.
    print("Response body:", body)
    return body

In [112]:
# Request the records of customer ids 1 to 100 and print each result.
# The loop variable has a real name: `_` is IPython's "last output" shortcut,
# and rebinding it hides that feature for the rest of the session.
for customer_id in range(1, 101):
    print(getData(customer_id))

Lambda function invoked successfully.
Response body: {'customer_id': 1, 'income': 0.13609602371552887, 'name_email_similarity': 0.06521789892162266, 'prev_address_months_count': 251, 'current_address_months_count': 189, 'customer_age': 73, 'days_since_request': 70.46871325044738, 'intended_balcon_amount': 13.384349380039769, 'payment_type': 'AC', 'zip_count_4w': 2384, 'velocity_6h': 5997.551161588079, 'velocity_24h': 3652.283831838784, 'velocity_4w': 3549.502038425567, 'bank_branch_count_8w': 1917, 'date_of_birth_distinct_emails_4w': 14, 'employment_status': 'CF', 'credit_risk_score': -73, 'email_is_free': 0, 'housing_status': 'BE', 'phone_home_valid': 1, 'phone_mobile_valid': 0, 'bank_months_count': 3, 'has_other_cards': 1, 'proposed_credit_limit': 552.5903349917111, 'foreign_request': 0, 'source': 'TELEAPP', 'session_length_in_minutes': 26.57441849086488, 'device_os': 'other', 'keep_alive_session': 1, 'device_distinct_emails_8w': -1, 'device_fraud_count': 0, 'month': 4, 'fraud_bool':

In [106]:
# Ask the service for the customer ids to process.
ulr_json = getULR()
if ulr_json is None:
    # getULR() returns None when the call did not succeed; stop here with a
    # clear message instead of failing on the subscript below.
    raise RuntimeError("getULR() returned no data; cannot read 'customer_ids'.")
ulr_keys = ulr_json['customer_ids']

# Customer id -> record returned by getData() (None when a lookup failed).
ulr_list = {}
print(getData(10))  # Single-record check before fetching the whole list.
for key in ulr_keys:
    ulr_list[key] = getData(key)

Lambda function invoked successfully.
Lambda function invoked successfully.
Response body: {'customer_id': 10, 'income': 0.7071956167502245, 'name_email_similarity': 0.44949386791779095, 'prev_address_months_count': 334, 'current_address_months_count': 417, 'customer_age': 85, 'days_since_request': 58.933000813853994, 'intended_balcon_amount': 4.589770143408117, 'payment_type': 'AC', 'zip_count_4w': 3513, 'velocity_6h': 15249.946132255604, 'velocity_24h': 4350.335893195264, 'velocity_4w': 5916.139398584218, 'bank_branch_count_8w': 1119, 'date_of_birth_distinct_emails_4w': 30, 'employment_status': 'CE', 'credit_risk_score': 104, 'email_is_free': 1, 'housing_status': 'BA', 'phone_home_valid': 0, 'phone_mobile_valid': 0, 'bank_months_count': 20, 'has_other_cards': 0, 'proposed_credit_limit': 1020.9478977964523, 'foreign_request': 0, 'source': 'INTERNET', 'session_length_in_minutes': 29.29616860946916, 'device_os': 'macintosh', 'keep_alive_session': 1, 'device_distinct_emails_8w': 0, 'devi

In [107]:
from lightgbm import early_stopping
from sklearn.utils import resample

# NOTE(review): X_train, y_train, X_test, y_test and test_hyperparams are not
# created in any cell visible here; they must already exist in the kernel —
# confirm where they are defined so the notebook survives Restart & Run All.

# Identify low-variance and constant features.
# These lists are only computed; no column is dropped from X_train here.
low_variance_features = [col for col in X_train.columns if X_train[col].std() < 0.01]
constant_features = [col for col in X_train.columns if X_train[col].nunique() <= 1]


# Balance the dataset: rows with label == 1 are resampled with replacement
# until there are as many of them as rows with label == 0.
train_data = pd.concat([X_train, y_train], axis=1)
minority = train_data[train_data[label] == 1]
majority = train_data[train_data[label] == 0]
minority_oversampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
balanced_train = pd.concat([majority, minority_oversampled])
# X_train / y_train are overwritten with the balanced data, so re-running this
# cell resamples the already balanced frame.
X_train = balanced_train.drop(columns=[label])
y_train = balanced_train[label]

# Update hyperparameters
test_hyperparams.update({
    "num_leaves": 63,
    "min_data_in_leaf": 5,
})

# `n_estimators` and `max_depth` are passed explicitly to the classifier below,
# so they must not also arrive through **test_hyperparams (duplicate keyword).
test_hyperparams.pop("n_estimators", None)
test_hyperparams.pop("max_depth", None)

# Initialize the model
model = lgbm.LGBMClassifier(
    n_jobs=10,
    n_estimators=100,
    max_depth=5,
    **test_hyperparams
)

# Define early stopping callback
callbacks = [
    early_stopping(stopping_rounds=10, verbose=True)
]

# Fit the model
# NOTE(review): early stopping monitors (X_test, y_test), so the test data
# influences how many trees are kept — consider a separate validation split.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="logloss",
    callbacks=callbacks
)


[LightGBM] [Info] Number of positive: 786838, number of negative: 786838
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.246477 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3206
[LightGBM] [Info] Number of data points in the train set: 1573676, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[69]	valid_0's binary_logloss: 0.403748


In [108]:
# Sanity check: compare the training columns with the columns of the request
# frame, in both directions. Both names must already exist in the kernel.
only_in_train = set(X_train.columns).difference(transaction_df.columns)
only_in_transaction = set(transaction_df.columns).difference(X_train.columns)
print("Columns in X_train but not in transaction_df:", only_in_train)
print("Columns in transaction_df but not in X_train:", only_in_transaction)

Columns in X_train but not in transaction_df: set()
Columns in transaction_df but not in X_train: set()


In [109]:
# Build one category -> integer mapping per categorical feature.
# The previous version reused the single leftover `encoder` variable (the one
# fitted on the last categorical feature) for every column, so the categories
# of all other features were not found in it and were encoded as -1.
# The mappings are rebuilt from the untouched raw data, using the training
# rows (month < 6), which is what the training encoders were fitted on.
# NOTE(review): assumes `model` was trained on the first entry of `datasets`
# and that the train split is `month < 6` — confirm against the training cells.
reference_frame = next(iter(datasets.values()))
reference_train = reference_frame[reference_frame["month"] < 6]
category_codes = {}
for feat in categorical_features:
    fitted_classes = LabelEncoder().fit(reference_train[feat]).classes_
    category_codes[feat] = {category: code for code, category in enumerate(fitted_classes)}

# Customer id -> predicted label.
encoded_transactions = {}
for key, transaction in ulr_list.items():
    if transaction is None:
        continue  # The lookup for this customer failed; nothing to score.

    # Convert the transaction dictionary to a one-row DataFrame
    transaction_df = pd.DataFrame([transaction])

    # Ensure all columns in X_train exist in transaction_df
    for col in X_train.columns:
        if col not in transaction_df.columns:
            transaction_df[col] = 0  # Assign default value for missing columns

    # Drop extra columns not in X_train and use the training column order
    transaction_df = transaction_df[X_train.columns]

    # Encode each categorical feature with its own mapping; categories that
    # were not seen at training time become -1.
    for feat in categorical_features:
        if feat in transaction_df.columns:
            codes = category_codes[feat]
            transaction_df[feat] = transaction_df[feat].map(
                lambda value: codes.get(value, -1)
            )

    # Predict the label
    y_pred = model.predict(transaction_df)
    encoded_transactions[key] = y_pred[0]
    print(f"Predicted value: {y_pred[0]} for transaction: {transaction}")

# Display all predictions
print("All Predictions:", encoded_transactions)




Predicted value: 0 for transaction: {'customer_id': 10, 'income': 0.7071956167502245, 'name_email_similarity': 0.44949386791779095, 'prev_address_months_count': 334, 'current_address_months_count': 417, 'customer_age': 85, 'days_since_request': 58.933000813853994, 'intended_balcon_amount': 4.589770143408117, 'payment_type': 'AC', 'zip_count_4w': 3513, 'velocity_6h': 15249.946132255604, 'velocity_24h': 4350.335893195264, 'velocity_4w': 5916.139398584218, 'bank_branch_count_8w': 1119, 'date_of_birth_distinct_emails_4w': 30, 'employment_status': 'CE', 'credit_risk_score': 104, 'email_is_free': 1, 'housing_status': 'BA', 'phone_home_valid': 0, 'phone_mobile_valid': 0, 'bank_months_count': 20, 'has_other_cards': 0, 'proposed_credit_limit': 1020.9478977964523, 'foreign_request': 0, 'source': 'INTERNET', 'session_length_in_minutes': 29.29616860946916, 'device_os': 'macintosh', 'keep_alive_session': 1, 'device_distinct_emails_8w': 0, 'device_fraud_count': 0, 'month': 7, 'fraud_bool': None}
Pre

In [110]:
def updateData(key, fraud_bool, timeout=30):
    """Send a customer's ``fraud_bool`` value to the update Lambda function.

    Sends an HTTP POST whose JSON body is
    ``{"customer_id": key, "fraud_bool": fraud_bool}``.

    Args:
        key: Customer id to update.
        fraud_bool: Value to store for that customer; it must be JSON
            serialisable because it is sent in the request body.
        timeout: Seconds to wait for the HTTP request before giving up. New,
            defaulted argument, so existing two-argument calls keep working.

    Returns:
        The parsed JSON body when the endpoint answers with status 200,
        otherwise ``None`` (the reason is printed).
    """
    lambda_url = "https://ujxyrynieutpj6ydvnd5v2ghfy0blvsn.lambda-url.us-east-2.on.aws/"

    payload = {
        "customer_id": key,
        "fraud_bool": fraud_bool
    }

    # Send the HTTP POST request with the customer_id and fraud_bool in the body.
    # Without a timeout `requests` can wait forever on an unresponsive endpoint.
    try:
        response = requests.post(lambda_url, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure: report it the same way as a bad status code.
        print(f"Failed to invoke Lambda function. Error: {exc}")
        return None

    # Check the response status
    if response.status_code != 200:
        print(f"Failed to invoke Lambda function. Status code: {response.status_code}")
        print("Error:", response.text)
        return None

    print("Lambda function invoked successfully.")
    body = response.json()  # Decode once and reuse for both the log line and the result.
    print("Response body:", body)
    return body

In [111]:
# Send every prediction to the update endpoint.
# Iterating over items() gives the customer id and its predicted label
# together, instead of looping over keys and indexing the dict again.
for customer_id, predicted_label in encoded_transactions.items():
    # Convert NumPy int64 to Python int
    updateData(customer_id, int(predicted_label))

Lambda function invoked successfully.
Response body: {'success': 'Successfully updated fraud_bool for CustomerID 10.'}
Lambda function invoked successfully.
Response body: {'success': 'Successfully updated fraud_bool for CustomerID 11.'}
Lambda function invoked successfully.
Response body: {'success': 'Successfully updated fraud_bool for CustomerID 12.'}
Lambda function invoked successfully.
Response body: {'success': 'Successfully updated fraud_bool for CustomerID 13.'}
Lambda function invoked successfully.
Response body: {'success': 'Successfully updated fraud_bool for CustomerID 14.'}
Lambda function invoked successfully.
Response body: {'success': 'Successfully updated fraud_bool for CustomerID 15.'}
Lambda function invoked successfully.
Response body: {'success': 'Successfully updated fraud_bool for CustomerID 16.'}
Lambda function invoked successfully.
Response body: {'success': 'Successfully updated fraud_bool for CustomerID 17.'}
Lambda function invoked successfully.
Response b

In [None]:
def getData