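SCOPE:
* Create a binary target (1/0 classes) indicating whether the farmer fully repaid the loan by maturity.
* Split the dataset by time: loans from inception until mid-2022 are used for training, post-2022 loans for testing.
* Predict the probability of repayment using logistic regression.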

In [5]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score,f1_score,confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.utils import resample

pip install --index-url=https://pypi.org/simple/ --trusted-host=pypi.org catboost

In [2]:
# Folder that holds the credit-score CSV extracts. It defaults to the original
# OneDrive location; set the CREDIT_SCORE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
data_dir = os.environ.get(
    'CREDIT_SCORE_DATA_DIR',
    r'C:\Users\JenniferEbereChinabu\OneDrive - AFEX Commodities Exchange Limited\Documents\AFEX ML Credit Score\credit_score',
)

# Training-period extract (inception until the 2022 midpoint, per the file name).
dataset = os.path.join(data_dir, 'inception_till_2022midpoint_dataset.csv')
training_dataset = pd.read_csv(dataset)

In [3]:
# Folder that holds the credit-score CSV extracts. It defaults to the original
# OneDrive location; set the CREDIT_SCORE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
data_dir = os.environ.get(
    'CREDIT_SCORE_DATA_DIR',
    r'C:\Users\JenniferEbereChinabu\OneDrive - AFEX Commodities Exchange Limited\Documents\AFEX ML Credit Score\credit_score',
)

# Out-of-time extract (post-2022 loans, per the file name) used as the test set.
data = os.path.join(data_dir, 'post_2022_dataset.csv')
test_dataset = pd.read_csv(data)

In [5]:
# Compare LightGBM with and without degree-2 polynomial features.
# Each pipeline is fitted on 80% of `training_dataset` and scored on
#   (a) the remaining 20% hold-out split (validation) and
#   (b) `test_dataset`, which is loaded from a separate file.
# NOTE(review): 'is_repaid', 'amount_repaid', 'to_balance' and 'percentage_unrepaid_loans'
# look like repayment outcomes — confirm they do not leak the target.
numerical_features = [
    'is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
    'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
    'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
    'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
    'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
    'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
    'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
    'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month',
]
target_variable = 'fully_repaid_within_maturity'

# Features / target for the training-period data
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

# Features / target for the separate test dataset
test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training-period rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Both pipelines mean-impute missing values and standardise before LightGBM;
# the second one additionally expands the features to degree-2 polynomial terms.
model_no_poly = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), LGBMClassifier(random_state=42))
model_with_poly = make_pipeline(SimpleImputer(strategy='mean'), PolynomialFeatures(degree=2), StandardScaler(), LGBMClassifier(random_state=42))

models = {'Without Polynomial Features': model_no_poly, 'With Polynomial Features': model_with_poly}

for model_name, model in models.items():
    # Fit on the 80% training split only
    model.fit(X_train, y_train)

    # Predictions on the 20% hold-out split. The variable keeps its historical name,
    # but these are validation predictions, NOT predictions on the rows used for fitting.
    y_pred_train = model.predict(X_test)

    # Predictions on the separate test dataset
    y_pred_test = model.predict(test_x)

    # Same report for both evaluation sets; previously the hold-out report was
    # mislabelled as "Training Data".
    for split_name, y_true, y_pred in (
        ('Validation Data', y_test, y_pred_train),
        ('Test Data', test_y, y_pred_test),
    ):
        print("="*20, f"Model: {model_name} ({split_name})", "="*20)
        print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
        print('Classification Report:\n', classification_report(y_true, y_pred))
        print(f'F1 Score ({split_name}): {f1_score(y_true, y_pred)}\n')

[LightGBM] [Info] Number of positive: 9007, number of negative: 183339
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052001 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3352
[LightGBM] [Info] Number of data points in the train set: 192346, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.046827 -> initscore=-3.013335
[LightGBM] [Info] Start training from score -3.013335
Accuracy: 0.9991889699918897
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45877
           1       0.99      0.99      0.99      2210

    accuracy                           1.00     48087
   macro avg       0.99      1.00      1.00     48087
weighted avg       1.00      1.00      1.00     48087

F1 Score (Training Data): 0.991206313416009

Accuracy: 0.995088957506249
Classification Report:
               precision    r

In [6]:
# Plain LightGBM classifier on the raw features (no imputation, scaling or polynomial terms).
# Fitted on 80% of `training_dataset`; scored on the 20% hold-out split and on `test_dataset`.
# NOTE(review): 'is_repaid', 'amount_repaid' and 'to_balance' look like repayment
# outcomes — confirm they do not leak the target.
numerical_features = [
    'is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
    'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
    'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
    'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
    'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
    'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
    'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
    'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month',
]
target_variable = 'fully_repaid_within_maturity'

# Features / target for the training-period data
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

# Features / target for the separate test dataset
test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training-period rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM model
lgb_model = LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)

# Predictions on the 20% hold-out split. The variable keeps its historical name,
# but these are validation predictions, NOT predictions on the rows used for fitting.
y_pred_train_lgb = lgb_model.predict(X_test)

# Predictions on the separate test dataset
y_pred_test_lgb = lgb_model.predict(test_x)

print("="*20, "LightGBM", "="*20)
# Same report for both evaluation sets; previously the hold-out report was
# mislabelled as "Training Data".
for split_name, y_true, y_pred in (
    ('Validation Data', y_test, y_pred_train_lgb),
    ('Test Data', test_y, y_pred_test_lgb),
):
    print(f"{split_name}:")
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print('Classification Report:\n', classification_report(y_true, y_pred))
    print(f'F1 Score ({split_name}): {f1_score(y_true, y_pred)}\n')

[LightGBM] [Info] Number of positive: 9007, number of negative: 183339
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3135
[LightGBM] [Info] Number of data points in the train set: 192346, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.046827 -> initscore=-3.013335
[LightGBM] [Info] Start training from score -3.013335
Training Data:
Accuracy: 0.9993345394805249
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45877
           1       0.99      1.00      0.99      2210

    accuracy                           1.00     48087
   macro avg       0.99      1.00      1.00     48087
weighted avg       1.00      1.00      1.00     48087

F1 Score (Training Data): 0.9927992799279928



In [4]:
# Benchmark several ensemble classifiers on the raw features.
# Each model is fitted on 80% of `training_dataset` and scored on the 20% hold-out
# split (validation) and on `test_dataset`.
# NOTE(review): 'is_repaid', 'amount_repaid' and 'to_balance' look like repayment
# outcomes — confirm they do not leak the target.
numerical_features = [
    'is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
    'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
    'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
    'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
    'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
    'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
    'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
    'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month',
]
target_variable = 'fully_repaid_within_maturity'

# Features / target for the training-period data
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

# Features / target for the separate test dataset
test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training-period rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate models, all with default hyper-parameters and a fixed seed
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Bagging': BaggingClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_state=42, verbose=0)  # CatBoost is categorical-feature-friendly
}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions on the 20% hold-out split. The variable keeps its historical name,
    # but these are validation predictions, NOT predictions on the rows used for fitting.
    y_pred_train = model.predict(X_test)

    # Predictions on the separate test dataset
    y_pred_test = model.predict(test_x)

    print(f"{'='*20} {model_name} {'='*20}")
    # Same report for both evaluation sets; previously the hold-out report was
    # mislabelled as "Training Data".
    for split_name, y_true, y_pred in (
        ('Validation Data', y_test, y_pred_train),
        ('Test Data', test_y, y_pred_test),
    ):
        print(f"{split_name}:")
        print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
        print('Classification Report:\n', classification_report(y_true, y_pred))
        print(f'F1 Score ({split_name}): {f1_score(y_true, y_pred)}\n')

Training Data:
Accuracy: 0.9994385176866929
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45877
           1       0.99      0.99      0.99      2210

    accuracy                           1.00     48087
   macro avg       1.00      1.00      1.00     48087
weighted avg       1.00      1.00      1.00     48087

F1 Score (Training Data): 0.9938955460094957

Test Data:
Accuracy: 0.9960005881488017
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     33737
           1       1.00      0.49      0.66       268

    accuracy                           1.00     34005
   macro avg       1.00      0.75      0.83     34005
weighted avg       1.00      1.00      1.00     34005

F1 Score (Test Data): 0.66

Training Data:
Accuracy: 0.9994177220454593
Classification Report:
               precision    recall  f1-score   support

           0     

In [None]:
# Benchmark ensemble classifiers on degree-2 polynomial features.
# Each model is fitted on 80% of `training_dataset` and scored on the 20% hold-out
# split (validation) and on `test_dataset`.
# NOTE(review): 'is_repaid', 'amount_repaid' and 'to_balance' look like repayment
# outcomes — confirm they do not leak the target.
numerical_features = [
    'is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
    'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
    'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
    'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
    'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
    'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
    'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
    'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month',
]
target_variable = 'fully_repaid_within_maturity'

# Features / target for the training-period data
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

# Features / target for the separate test dataset
test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training-period rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (scaler is fitted on the training split only).
# NOTE(review): the scaled matrices are not used further in this cell — the polynomial
# expansion below is built from the unscaled X_train / X_test. Kept in case other cells
# rely on these names.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Degree-2 polynomial expansion, fitted on the training split and applied to all three sets
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
test_x_poly = poly.transform(test_x)

# List of classifiers. The LightGBM base model is passed to BaggingClassifier
# positionally: the keyword was renamed from `base_estimator` to `estimator` in
# newer scikit-learn releases, while the first positional slot works in both.
classifiers = [
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('GradientBoosting', GradientBoostingClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('Bagging', BaggingClassifier(LGBMClassifier(random_state=42), random_state=42))
]

# Iterate through classifiers
for clf_name, clf in classifiers:
    # Train the model with polynomial features
    clf.fit(X_train_poly, y_train)

    # Predictions on the 20% hold-out split. The variable keeps its historical name,
    # but these are validation predictions, NOT predictions on the rows used for fitting.
    y_pred_train = clf.predict(X_test_poly)

    # Predictions on the separate test dataset
    y_pred_test = clf.predict(test_x_poly)

    print("="*20, f"{clf_name} with Polynomial Features", "="*20)
    # Same report for both evaluation sets; previously the hold-out report was
    # mislabelled as "Training Data".
    for split_name, y_true, y_pred in (
        ('Validation Data', y_test, y_pred_train),
        ('Test Data', test_y, y_pred_test),
    ):
        print(f"{split_name}:")
        print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
        print('Classification Report:\n', classification_report(y_true, y_pred))
        print(f'F1 Score ({split_name}): {f1_score(y_true, y_pred)}\n')

In [10]:
# Random forest on degree-2 polynomial features.
# Fitted on 80% of `training_dataset`; scored on the 20% hold-out split (validation)
# and on `test_dataset`.
# NOTE(review): 'is_repaid', 'amount_repaid' and 'to_balance' look like repayment
# outcomes — confirm they do not leak the target.
numerical_features = [
    'is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
    'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
    'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
    'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
    'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
    'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
    'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
    'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month',
]
target_variable = 'fully_repaid_within_maturity'

# Features / target for the training-period data
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

# Features / target for the separate test dataset
test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training-period rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (scaler is fitted on the training split only).
# NOTE(review): the scaled matrices are not used further in this cell — the polynomial
# expansion below is built from the unscaled X_train / X_test. Kept in case other cells
# rely on these names.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Degree-2 polynomial expansion, fitted on the training split and applied to all three sets
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
test_x_poly = poly.transform(test_x)

# Define and train the model
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_poly, y_train)

# Predictions on the 20% hold-out split. The variable keeps its historical name,
# but these are validation predictions, NOT predictions on the rows used for fitting.
y_pred_train = model_rf.predict(X_test_poly)

# Predictions on the separate test dataset
y_pred_test = model_rf.predict(test_x_poly)

# Same report for both evaluation sets; previously the hold-out report was
# mislabelled as "Training Data".
for split_name, y_true, y_pred in (
    ('Validation Data', y_test, y_pred_train),
    ('Test Data', test_y, y_pred_test),
):
    print("="*20, f"RandomForestClassifier ({split_name})", "="*20)
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print('Classification Report:\n', classification_report(y_true, y_pred))
    print(f'F1 Score ({split_name}): {f1_score(y_true, y_pred)}\n')

Accuracy: 0.9995840871753281
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45877
           1       0.99      1.00      1.00      2210

    accuracy                           1.00     48087
   macro avg       1.00      1.00      1.00     48087
weighted avg       1.00      1.00      1.00     48087

F1 Score (Training Data): 0.9954853273137697

Accuracy: 0.9965299220702838
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     33737
           1       1.00      0.56      0.72       268

    accuracy                           1.00     34005
   macro avg       1.00      0.78      0.86     34005
weighted avg       1.00      1.00      1.00     34005

F1 Score (Test Data): 0.7177033492822966



In [None]:
# Gradient boosting on degree-2 polynomial features.
# Fitted on 80% of `training_dataset`; scored on the 20% hold-out split (validation)
# and on `test_dataset`.
# NOTE(review): 'is_repaid', 'amount_repaid' and 'to_balance' look like repayment
# outcomes — confirm they do not leak the target.
numerical_features = [
    'is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
    'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
    'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
    'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
    'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
    'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
    'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
    'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month',
]
target_variable = 'fully_repaid_within_maturity'

# Features / target for the training-period data
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

# Features / target for the separate test dataset
test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training-period rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (scaler is fitted on the training split only).
# NOTE(review): the scaled matrices are not used further in this cell — the polynomial
# expansion below is built from the unscaled X_train / X_test. Kept in case other cells
# rely on these names.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Degree-2 polynomial expansion, fitted on the training split and applied to all three sets
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
test_x_poly = poly.transform(test_x)

# Define and train the model
model_gbc = GradientBoostingClassifier(random_state=42)
model_gbc.fit(X_train_poly, y_train)

# Predictions on the 20% hold-out split. The variable keeps its historical name,
# but these are validation predictions, NOT predictions on the rows used for fitting.
y_pred_train = model_gbc.predict(X_test_poly)

# Predictions on the separate test dataset
y_pred_test = model_gbc.predict(test_x_poly)

# Same report for both evaluation sets; previously the hold-out report was
# mislabelled as "Training Data".
for split_name, y_true, y_pred in (
    ('Validation Data', y_test, y_pred_train),
    ('Test Data', test_y, y_pred_test),
):
    print("="*20, f"GradientBoostingClassifier ({split_name})", "="*20)
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print('Classification Report:\n', classification_report(y_true, y_pred))
    print(f'F1 Score ({split_name}): {f1_score(y_true, y_pred)}\n')

In [17]:
# AdaBoost on degree-2 polynomial features.
# Fitted on 80% of `training_dataset`; scored on the 20% hold-out split (validation)
# and on `test_dataset`.
# NOTE(review): 'is_repaid', 'amount_repaid' and 'to_balance' look like repayment
# outcomes — confirm they do not leak the target.
numerical_features = [
    'is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
    'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
    'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
    'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
    'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
    'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
    'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
    'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month',
]
target_variable = 'fully_repaid_within_maturity'

# Features / target for the training-period data
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

# Features / target for the separate test dataset
test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training-period rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (scaler is fitted on the training split only).
# NOTE(review): the scaled matrices are not used further in this cell — the polynomial
# expansion below is built from the unscaled X_train / X_test. Kept in case other cells
# rely on these names.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Degree-2 polynomial expansion, fitted on the training split and applied to all three sets
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
test_x_poly = poly.transform(test_x)

# Define and train the model
model_abc = AdaBoostClassifier(random_state=42)
model_abc.fit(X_train_poly, y_train)

# Predictions on the 20% hold-out split. The variable keeps its historical name,
# but these are validation predictions, NOT predictions on the rows used for fitting.
y_pred_train = model_abc.predict(X_test_poly)

# Predictions on the separate test dataset
y_pred_test = model_abc.predict(test_x_poly)

# Same report for both evaluation sets; previously the hold-out report was
# mislabelled as "Training Data".
for split_name, y_true, y_pred in (
    ('Validation Data', y_test, y_pred_train),
    ('Test Data', test_y, y_pred_test),
):
    print("="*20, f"AdaBoostClassifier ({split_name})", "="*20)
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print('Classification Report:\n', classification_report(y_true, y_pred))
    print(f'F1 Score ({split_name}): {f1_score(y_true, y_pred)}\n')

Accuracy: 0.9981491879302098
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45877
           1       0.98      0.98      0.98      2210

    accuracy                           1.00     48087
   macro avg       0.99      0.99      0.99     48087
weighted avg       1.00      1.00      1.00     48087

F1 Score (Training Data): 0.9798140167838512

Accuracy: 0.993353918541391
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     33737
           1       0.85      0.19      0.31       268

    accuracy                           0.99     34005
   macro avg       0.92      0.60      0.65     34005
weighted avg       0.99      0.99      0.99     34005

F1 Score (Test Data): 0.31097560975609756



In [15]:
# Bagging (default base estimator) on degree-2 polynomial features.
# Fitted on 80% of `training_dataset`; scored on the 20% hold-out split (validation)
# and on `test_dataset`.
# NOTE(review): 'is_repaid', 'amount_repaid' and 'to_balance' look like repayment
# outcomes — confirm they do not leak the target.
numerical_features = [
    'is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
    'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
    'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
    'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
    'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
    'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
    'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
    'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month',
]
target_variable = 'fully_repaid_within_maturity'

# Features / target for the training-period data
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

# Features / target for the separate test dataset
test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training-period rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (scaler is fitted on the training split only).
# NOTE(review): the scaled matrices are not used further in this cell — the polynomial
# expansion below is built from the unscaled X_train / X_test. Kept in case other cells
# rely on these names.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Degree-2 polynomial expansion, fitted on the training split and applied to all three sets
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
test_x_poly = poly.transform(test_x)

# Define and train the model
model_bc = BaggingClassifier(random_state=42)
model_bc.fit(X_train_poly, y_train)

# Predictions on the 20% hold-out split. The variable keeps its historical name,
# but these are validation predictions, NOT predictions on the rows used for fitting.
y_pred_train = model_bc.predict(X_test_poly)

# Predictions on the separate test dataset
y_pred_test = model_bc.predict(test_x_poly)

# Same report for both evaluation sets; previously the hold-out report was
# mislabelled as "Training Data".
for split_name, y_true, y_pred in (
    ('Validation Data', y_test, y_pred_train),
    ('Test Data', test_y, y_pred_test),
):
    print("="*20, f"BaggingClassifier ({split_name})", "="*20)
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print('Classification Report:\n', classification_report(y_true, y_pred))
    print(f'F1 Score ({split_name}): {f1_score(y_true, y_pred)}\n')

Accuracy: 0.9994385176866929
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45877
           1       0.99      1.00      0.99      2210

    accuracy                           1.00     48087
   macro avg       1.00      1.00      1.00     48087
weighted avg       1.00      1.00      1.00     48087

F1 Score (Training Data): 0.9938983050847457

Accuracy: 0.9967357741508601
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     33737
           1       0.96      0.61      0.75       268

    accuracy                           1.00     34005
   macro avg       0.98      0.80      0.87     34005
weighted avg       1.00      1.00      1.00     34005

F1 Score (Test Data): 0.7459954233409613



In [13]:
# Bagging of LightGBM classifiers on degree-2 polynomial features.
# Fitted on 80% of `training_dataset`; scored on the 20% hold-out split (validation)
# and on `test_dataset`.
# NOTE(review): 'is_repaid', 'amount_repaid' and 'to_balance' look like repayment
# outcomes — confirm they do not leak the target.
numerical_features = [
    'is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
    'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
    'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
    'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
    'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
    'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
    'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
    'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month',
]
target_variable = 'fully_repaid_within_maturity'

# Features / target for the training-period data
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

# Features / target for the separate test dataset
test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training-period rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (scaler is fitted on the training split only).
# NOTE(review): the scaled matrices are not used further in this cell — the polynomial
# expansion below is built from the unscaled X_train / X_test. Kept in case other cells
# rely on these names.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Degree-2 polynomial expansion, fitted on the training split and applied to all three sets
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
test_x_poly = poly.transform(test_x)

# Define the model. The LightGBM base model is passed positionally: the keyword was
# renamed from `base_estimator` to `estimator` in newer scikit-learn releases, while
# the first positional slot works in both.
model_blgb = BaggingClassifier(LGBMClassifier(random_state=42), random_state=42)

# Train the model
model_blgb.fit(X_train_poly, y_train)

# Predictions on the 20% hold-out split. The variable keeps its historical name,
# but these are validation predictions, NOT predictions on the rows used for fitting.
y_pred_train = model_blgb.predict(X_test_poly)

# Predictions on the separate test dataset
y_pred_test = model_blgb.predict(test_x_poly)

# Same report for both evaluation sets. The header now names the LightGBM base model so
# this output can be told apart from the plain BaggingClassifier run, and the hold-out
# report is no longer mislabelled as "Training Data".
for split_name, y_true, y_pred in (
    ('Validation Data', y_test, y_pred_train),
    ('Test Data', test_y, y_pred_test),
):
    print("="*20, f"BaggingClassifier + LightGBM ({split_name})", "="*20)
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print('Classification Report:\n', classification_report(y_true, y_pred))
    print(f'F1 Score ({split_name}): {f1_score(y_true, y_pred)}\n')



[LightGBM] [Info] Number of positive: 9007, number of negative: 183339
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.462434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84526
[LightGBM] [Info] Number of data points in the train set: 192346, number of used features: 791
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047305 -> initscore=-3.002670
[LightGBM] [Info] Start training from score -3.002670
[LightGBM] [Info] Number of positive: 9007, number of negative: 183339
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.501788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84526
[LightGBM] [Info] Number of data points in the train set: 192346, number of used features: 791
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047597 -> initscore=-2.996229
[LightGBM] [Info] Start training from score -2.996229
[Light

In [33]:
# Compare a bagging classifier with and without degree-2 polynomial features.
# Models are fitted on 80% of `training_dataset`, then scored on the remaining
# 20% (validation hold-out) and on `test_dataset`.
#
# NOTE(review): some inputs (e.g. 'is_repaid', 'amount_repaid', 'to_balance')
# look like repayment outcomes and may leak the target — confirm they are
# available at scoring time before trusting the near-perfect metrics.
numerical_features = ['is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month']
target_variable = 'fully_repaid_within_maturity'

# Select features and target variable
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training data as a validation split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Both pipelines mean-impute missing values and standardize before bagging;
# the second one inserts a degree-2 polynomial expansion after imputation.
model_no_poly = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), BaggingClassifier(random_state=42))
model_with_poly = make_pipeline(SimpleImputer(strategy='mean'), PolynomialFeatures(degree=2), StandardScaler(), BaggingClassifier(random_state=42))

# Train and evaluate models without and with polynomial features
models = {'Without Polynomial Features': model_no_poly, 'With Polynomial Features': model_with_poly}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predictions on the validation hold-out (rows NOT used for fitting)
    y_pred_val = model.predict(X_test)

    # Predictions on the separate test dataset
    y_pred_test = model.predict(test_x)

    # Evaluation on the validation hold-out. This section used to be labelled
    # "Training Data" although it is computed on X_test / y_test.
    print("="*20, f"Model: {model_name} (Validation Data)", "="*20)
    print(f'Accuracy: {accuracy_score(y_test, y_pred_val)}')
    print('Classification Report:\n', classification_report(y_test, y_pred_val))
    print(f'F1 Score (Validation Data): {f1_score(y_test, y_pred_val)}\n')

    # Evaluation on the test dataset
    print("="*20, f"Model: {model_name} (Test Data)", "="*20)
    print(f'Accuracy: {accuracy_score(test_y, y_pred_test)}')
    print('Classification Report:\n', classification_report(test_y, y_pred_test))
    print(f'F1 Score (Test Data): {f1_score(test_y, y_pred_test)}\n')

Accuracy: 0.9994593133279265
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45877
           1       0.99      1.00      0.99      2210

    accuracy                           1.00     48087
   macro avg       1.00      1.00      1.00     48087
weighted avg       1.00      1.00      1.00     48087

F1 Score (Training Data): 0.9941282746160794

Accuracy: 0.9971180708719306
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     33737
           1       0.97      0.66      0.78       268

    accuracy                           1.00     34005
   macro avg       0.98      0.83      0.89     34005
weighted avg       1.00      1.00      1.00     34005

F1 Score (Test Data): 0.7822222222222223

Accuracy: 0.9994593133279265
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00 

In [4]:
# Same with/without-polynomial comparison as before, restricted to a smaller
# set of 24 numeric columns. Models are fitted on 80% of `training_dataset`,
# then scored on the remaining 20% (validation hold-out) and on `test_dataset`.
#
# NOTE(review): 'amount_repaid' and 'to_balance' look like repayment outcomes
# and may leak the target — confirm they are available at scoring time.
numerical_features = ['farm_size', 'bags', 'gross_weight', 'net_weight', 'moisture', 'total_commodity_price', 'price_per_tonne', 'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg',
'interest', 'admin_fee', 'equity', 'to_balance', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans', 'avg_loan_repayment_rate', 'time_since_last_loan', 'time_since_last_loan_month', 'percentage_unrepaid_loans']
target_variable = 'fully_repaid_within_maturity'

# Select features and target variable
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training data as a validation split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Both pipelines mean-impute missing values and standardize before bagging;
# the second one inserts a degree-2 polynomial expansion after imputation.
model_no_poly = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), BaggingClassifier(random_state=42))
model_with_poly = make_pipeline(SimpleImputer(strategy='mean'), PolynomialFeatures(degree=2), StandardScaler(), BaggingClassifier(random_state=42))

# Train and evaluate models without and with polynomial features
models = {'Without Polynomial Features': model_no_poly, 'With Polynomial Features': model_with_poly}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predictions on the validation hold-out (rows NOT used for fitting)
    y_pred_val = model.predict(X_test)

    # Predictions on the separate test dataset
    y_pred_test = model.predict(test_x)

    # Evaluation on the validation hold-out. This section used to be labelled
    # "Training Data" although it is computed on X_test / y_test.
    print("="*20, f"Model: {model_name} (Validation Data)", "="*20)
    print(f'Accuracy: {accuracy_score(y_test, y_pred_val)}')
    print('Classification Report:\n', classification_report(y_test, y_pred_val))
    print(f'F1 Score (Validation Data): {f1_score(y_test, y_pred_val)}\n')

    # Evaluation on the test dataset
    print("="*20, f"Model: {model_name} (Test Data)", "="*20)
    print(f'Accuracy: {accuracy_score(test_y, y_pred_test)}')
    print('Classification Report:\n', classification_report(test_y, y_pred_test))
    print(f'F1 Score (Test Data): {f1_score(test_y, y_pred_test)}\n')

Accuracy: 0.9926175473620729
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     45877
           1       0.95      0.88      0.92      2210

    accuracy                           0.99     48087
   macro avg       0.97      0.94      0.96     48087
weighted avg       0.99      0.99      0.99     48087

F1 Score (Training Data): 0.9166079398637538

Accuracy: 0.998794294956624
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     33737
           1       1.00      0.85      0.92       268

    accuracy                           1.00     34005
   macro avg       1.00      0.92      0.96     34005
weighted avg       1.00      1.00      1.00     34005

F1 Score (Test Data): 0.9171717171717172

Accuracy: 0.9926591386445401
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00  

In [None]:
# Fit the two bagging pipelines, report metrics on the rows they were fitted on
# and on `test_dataset`, then attach each model's class prediction and a
# 100-800 score (linear in the predicted probability of class 1) to
# `test_dataset`.
target_variable = 'fully_repaid_within_maturity'
numerical_features = [
    'farm_size', 'bags', 'gross_weight', 'net_weight', 'moisture', 'total_commodity_price', 'price_per_tonne',
    'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg',
    'interest', 'admin_fee', 'equity', 'to_balance', 'debt_to_farm_size_ratio', 'total_loan_value_total',
    'total_loans', 'avg_loan_repayment_rate', 'time_since_last_loan', 'time_since_last_loan_month',
    'percentage_unrepaid_loans',
]

# Feature matrix / label vector for fitting, and for the separate test dataset
X, y = training_dataset[numerical_features], training_dataset[target_variable]
test_x, test_y = test_dataset[numerical_features], test_dataset[target_variable]

# 80/20 split of the training data (only the 80% part is used in this cell)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipelines: mean imputation -> [degree-2 polynomial expansion] -> scaling -> bagging
model_no_poly = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    BaggingClassifier(random_state=42),
)
model_with_poly = make_pipeline(
    SimpleImputer(strategy='mean'),
    PolynomialFeatures(degree=2),
    StandardScaler(),
    BaggingClassifier(random_state=42),
)
models = {'Without Polynomial Features': model_no_poly, 'With Polynomial Features': model_with_poly}

# Bounds of the score scale: probability 0 maps to 100, probability 1 to 800
min_score = 100
max_score = 800

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Class predictions on the fitting rows and on the test dataset
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(test_x)

    # Identical report layout for both evaluation sets
    for split_label, y_true, y_hat in (
        ("Training Data", y_train, y_pred_train),
        ("Test Data", test_y, y_pred_test),
    ):
        print("="*20, f"Model: {model_name} ({split_label})", "="*20)
        print(f'Accuracy: {accuracy_score(y_true, y_hat)}')
        print('Classification Report:\n', classification_report(y_true, y_hat))
        print(f'F1 Score ({split_label}): {f1_score(y_true, y_hat)}\n')

    # Store the binary prediction next to the test rows
    test_dataset[f'{model_name}_prediction'] = y_pred_test

    # Probability of class 1 for every test row, rescaled linearly to [100, 800]
    y_prob_test = model.predict_proba(test_x)[:, 1]
    scaled_scores = min_score + (max_score - min_score) * y_prob_test
    test_dataset[f'{model_name}_score'] = scaled_scores

# Display the resulting test dataset with predictions and scores
print(test_dataset)


In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report, f1_score
import numpy as np

# Random-forest experiment: oversample the minority class, tune a forest with
# a grid search, evaluate it on the 20% hold-out, then fit an untuned forest on
# degree-2 polynomial features and evaluate that one on `test_dataset`.

numerical_features = ['farm_size', 'bags', 'gross_weight', 'net_weight', 'moisture', 'total_commodity_price', 'price_per_tonne', 'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg',
'interest', 'admin_fee', 'equity', 'to_balance', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans', 'avg_loan_repayment_rate', 'time_since_last_loan', 'time_since_last_loan_month', 'percentage_unrepaid_loans']
target_variable = 'fully_repaid_within_maturity'

# Select features and target variable
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# 1. Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Balance the classes by oversampling the minority class (assumed to be 1).
#    The minority rows are drawn with replacement until they match the
#    majority count, and are combined with the *majority rows only*.
#    Previously the resampled rows were appended to the full X_train, which
#    still contained the original minority rows, so class 1 ended up larger
#    than class 0 instead of balanced.
is_minority = y_train == 1
n_majority = int((~is_minority).sum())
X_resampled, y_resampled = resample(
    X_train[is_minority], y_train[is_minority],
    replace=True, n_samples=n_majority, random_state=42,
)
# pd.concat (rather than np.concatenate) keeps the column names, so the
# imputer is fitted with the same feature names it later sees in transform().
X_train_balanced = pd.concat([X_train[~is_minority], X_resampled])
y_train_balanced = pd.concat([y_train[~is_minority], y_resampled])

# 3. Handle missing values using SimpleImputer (fitted on the balanced
#    training rows only, then applied to the hold-out and test sets)
imputer = SimpleImputer(strategy='mean')
X_train_balanced_imputed = imputer.fit_transform(X_train_balanced)
X_test_imputed = imputer.transform(X_test)
test_x_imputed = imputer.transform(test_x)

# 4. Train a Random Forest Classifier with hyperparameter tuning
# NOTE(review): oversampling happens before the CV folds are cut, so copies of
# the same minority row can land in both the fitting and the scoring part of a
# fold. The cross-validated F1 values (here and in step 7) are therefore
# optimistic; the hold-out metrics in step 5 are not affected by this.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_balanced_imputed, y_train_balanced)

# 5. Evaluate the tuned model on the 20% hold-out split (X_test / y_test)
best_rf_model = grid_search.best_estimator_
y_pred_test = best_rf_model.predict(X_test_imputed)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy on Test Data:", accuracy_score(y_test, y_pred_test))
print("Classification Report on Test Data:\n", classification_report(y_test, y_pred_test))
print("F1 Score on Test Data:", f1_score(y_test, y_pred_test))

# 6. Check feature importances (one value per entry of numerical_features)
feature_importances = best_rf_model.feature_importances_
# Plot or print feature importances as needed

# 7. Cross-validation for a more robust estimate of performance
#    (see the leakage note in step 4)
cv_scores = cross_val_score(best_rf_model, X_train_balanced_imputed, y_train_balanced, cv=5, scoring='f1')
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", np.mean(cv_scores))

# 8. Optional: Polynomial Features
# NOTE(review): this model is scored on `test_dataset` (test_x / test_y),
# whereas step 5 scores on the hold-out split, so the two "Test Data" reports
# are not computed on the same rows and are not directly comparable.
poly_model = make_pipeline(PolynomialFeatures(degree=2), RandomForestClassifier(random_state=42))
poly_model.fit(X_train_balanced_imputed, y_train_balanced)
y_pred_poly_test = poly_model.predict(test_x_imputed)

print("Accuracy on Test Data (with Polynomial Features):", accuracy_score(test_y, y_pred_poly_test))
print("Classification Report on Test Data (with Polynomial Features):\n", classification_report(test_y, y_pred_poly_test))
print("F1 Score on Test Data (with Polynomial Features):", f1_score(test_y, y_pred_poly_test))



In [8]:
# Compare an untuned bagging pipeline (no polynomial features) with a
# randomized-search-tuned bagging pipeline on degree-2 polynomial features.
# Models are fitted on 80% of `training_dataset`, then scored on the remaining
# 20% (validation hold-out) and on `test_dataset`.
#
# NOTE(review): some inputs (e.g. 'is_repaid', 'amount_repaid', 'to_balance')
# look like repayment outcomes and may leak the target — confirm they are
# available at scoring time before trusting the near-perfect metrics.
numerical_features = ['is_deleted', 'gender', 'farm_size', 'is_blacklist', 'phone_invalid',
'phone_number_status', 'coordinate_status', 'id_status', 'project_id',
'hectare', 'total_loan_value', 'repayment_value', 'amount_repaid', 'insurance', 'crg', 'interest', 'admin_fee', 'equity',
'to_balance', 'is_repaid', 'loan_approved', 'loan_approval_completed', 'loan_rejected', 'loan_reverted',
'marital_status_Divorced', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widow',
'marital_status_Widower', 'transaction_type_Broker Payment', 'transaction_type_Com For Equity', 'transaction_type_Com To Input',
'transaction_type_Loan Repayment', 'transaction_type_Storage', 'transaction_type_Storage To Trade', 'transaction_type_Trade',
'payment_option_Cash Advance', 'payment_option_Trade Execution', 'debt_to_farm_size_ratio', 'total_loan_value_total', 'total_loans',
'avg_loan_repayment_rate', 'time_since_last_loan', 'percentage_unrepaid_loans', 'time_since_last_loan_month']
target_variable = 'fully_repaid_within_maturity'

# Select features and target variable
X = training_dataset[numerical_features]
y = training_dataset[target_variable]

test_x = test_dataset[numerical_features]
test_y = test_dataset[target_variable]

# Hold out 20% of the training data as a validation split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
model_no_poly = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), BaggingClassifier(random_state=42))
model_with_poly = make_pipeline(SimpleImputer(strategy='mean'), PolynomialFeatures(degree=2), StandardScaler(), BaggingClassifier(random_state=42))

# Hyperparameter candidates for the bagging step of the pipeline
# (the `baggingclassifier__` prefix is the step name make_pipeline assigns)
param_dist = {
    'baggingclassifier__n_estimators': [50, 100, 200],
    'baggingclassifier__max_samples': [0.5, 0.7, 1.0]
}

# Number of distinct combinations in param_dist (3 x 3 = 9 here)
n_candidates = 1
for candidate_values in param_dist.values():
    n_candidates *= len(candidate_values)

# Randomized search over the polynomial pipeline, selecting by 5-fold F1.
# n_iter is capped at the number of available combinations: asking for more
# iterations than the list-only grid contains makes scikit-learn emit a
# warning and run only the available ones. random_state makes the sampling
# order reproducible.
random_search = RandomizedSearchCV(
    model_with_poly, param_dist,
    n_iter=min(10, n_candidates), cv=5, scoring='f1', n_jobs=-1, random_state=42,
)

# Train and evaluate: untuned pipeline without polynomial features versus the
# tuned search (which predicts with its refitted best pipeline)
models = {'Without Polynomial Features': model_no_poly, 'With Polynomial Features': random_search}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predictions on the validation hold-out (rows NOT used for fitting)
    y_pred_val = model.predict(X_test)

    # Predictions on the separate test dataset
    y_pred_test = model.predict(test_x)

    # Evaluation on the validation hold-out. This section used to be labelled
    # "Training Data" although it is computed on X_test / y_test.
    print("="*20, f"Model: {model_name} (Validation Data)", "="*20)
    print(f'Accuracy: {accuracy_score(y_test, y_pred_val)}')
    print('Classification Report:\n', classification_report(y_test, y_pred_val))
    print(f'F1 Score (Validation Data): {f1_score(y_test, y_pred_val)}\n')

    # Evaluation on the test dataset
    print("="*20, f"Model: {model_name} (Test Data)", "="*20)
    print(f'Accuracy: {accuracy_score(test_y, y_pred_test)}')
    print('Classification Report:\n', classification_report(test_y, y_pred_test))
    print(f'F1 Score (Test Data): {f1_score(test_y, y_pred_test)}\n')

    # Confusion matrix for the test dataset (rows: true class, columns: predicted)
    cm = confusion_matrix(test_y, y_pred_test)
    print('Confusion Matrix (Test Data):')
    print(cm)
    print("\n")

Accuracy: 0.9994593133279265
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45877
           1       0.99      1.00      0.99      2210

    accuracy                           1.00     48087
   macro avg       1.00      1.00      1.00     48087
weighted avg       1.00      1.00      1.00     48087

F1 Score (Training Data): 0.9941282746160794

Accuracy: 0.9971180708719306
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     33737
           1       0.97      0.66      0.78       268

    accuracy                           1.00     34005
   macro avg       0.98      0.83      0.89     34005
weighted avg       1.00      1.00      1.00     34005

F1 Score (Test Data): 0.7822222222222223

Confusion Matrix (Test Data):
[[33731     6]
 [   92   176]]




