In [1]:
import pandas as pd
import sys
import os

# Add the scripts directory to the Python path
sys.path.append(os.path.abspath('../scripts'))

In [2]:

# Load the data_filledset
from credit_scoring_model_development import load_dataset


# Load the dataset
file_path = r'C:\Users\Naim\credit-scoring-model-development\data\data.csv'
data = load_dataset(file_path)

In [3]:
from EDA import fill_outliers
# Columns to fill outliers for
columns_to_fill = ['Amount', 'Value', 'PricingStrategy', 'FraudResult']

# Fill outliers in the specified columns
data_filled = fill_outliers(data, columns=columns_to_fill)

# Check the changes in the data_filledset
print("data_filledset after filling outliers:")
print(data_filled[columns_to_fill].describe())

Column 'Amount': Outliers filled using capping method with lower bound = -4325.00 and upper bound = 7075.00.
Column 'Value': Outliers filled using capping method with lower bound = -6812.50 and upper bound = 12087.50.
Column 'PricingStrategy': Outliers filled using capping method with lower bound = 2.00 and upper bound = 2.00.
Column 'FraudResult': Outliers filled using capping method with lower bound = 0.00 and upper bound = 0.00.
data_filledset after filling outliers:
             Amount         Value  PricingStrategy  FraudResult
count  95662.000000  95662.000000          95662.0      95662.0
mean    1449.051201   3403.667282              2.0          0.0
std     3210.808287   4168.243292              0.0          0.0
min    -4325.000000      2.000000              2.0          0.0
25%      -50.000000    275.000000              2.0          0.0
50%     1000.000000   1000.000000              2.0          0.0
75%     2800.000000   5000.000000              2.0          0.0
max     7075.

# Create Aggregate Features

In [4]:
from credit_scoring_model_development import create_advanced_aggregate_features
customer_advanced_features = create_advanced_aggregate_features(data_filled)

# Display the first few rows of the aggregated feature set
print("Customer Advanced Aggregate Features:")
print(customer_advanced_features.head())

Customer Advanced Aggregate Features:
        CustomerId  Total_Transaction_Amount  Average_Transaction_Amount  \
0     CustomerId_1                   -4325.0                -4325.000000   
1    CustomerId_10                   -4325.0                -4325.000000   
2  CustomerId_1001                   14825.0                 2965.000000   
3  CustomerId_1002                    4225.0                  384.090909   
4  CustomerId_1003                   14825.0                 2470.833333   

   Transaction_Count  Std_Dev_Transaction_Amount  Max_Transaction_Amount  \
0                  1                         NaN                 -4325.0   
1                  1                         NaN                 -4325.0   
2                  5                 5048.285600                  7075.0   
3                 11                  560.498966                  1500.0   
4                  6                 4597.537837                  7075.0   

   Min_Transaction_Amount  Median_Transaction_Am

# Extract Features

In [5]:
# Import the function from the script file
from credit_scoring_model_development import extract_datetime_features

# Assuming the data set has already been loaded and is stored in `data_filled`
data_filled = extract_datetime_features(data_filled, timestamp_column='TransactionStartTime')

# Display the first few rows of the data_filledset with new features
print("data_filledset with extracted datetime features:")
print(data_filled[['TransactionStartTime', 'Transaction_Hour', 'Transaction_Day', 'Transaction_Month', 'Transaction_Year', 'Transaction_DayOfWeek', 'Transaction_WeekOfYear']].head())


Datetime features extracted successfully.
data_filledset with extracted datetime features:
       TransactionStartTime  Transaction_Hour  Transaction_Day  \
0 2018-11-15 02:18:49+00:00                 2               15   
1 2018-11-15 02:19:08+00:00                 2               15   
2 2018-11-15 02:44:21+00:00                 2               15   
3 2018-11-15 03:32:55+00:00                 3               15   
4 2018-11-15 03:34:21+00:00                 3               15   

   Transaction_Month  Transaction_Year  Transaction_DayOfWeek  \
0                 11              2018                      3   
1                 11              2018                      3   
2                 11              2018                      3   
3                 11              2018                      3   
4                 11              2018                      3   

   Transaction_WeekOfYear  
0                      46  
1                      46  
2                      46  
3        

In [6]:
print(data_filled.dtypes)

TransactionId                          object
BatchId                                object
AccountId                              object
SubscriptionId                         object
CustomerId                             object
CurrencyCode                           object
CountryCode                             int64
ProviderId                             object
ProductId                              object
ProductCategory                        object
ChannelId                              object
Amount                                float64
Value                                 float64
TransactionStartTime      datetime64[ns, UTC]
PricingStrategy                       float64
FraudResult                           float64
Transaction_Hour                        int32
Transaction_Day                         int32
Transaction_Month                       int32
Transaction_Year                        int32
Transaction_DayOfWeek                   int32
Transaction_WeekOfYear            

In [7]:
# Import the functions from the script file
from credit_scoring_model_development import encode_all_categorical

print("data_filledset before encoding:")
print(data_filled.head())

# Encode all categorical columns
data_filled_encoded = encode_all_categorical(data_filled, max_unique_values=10)

# Display the first few rows of the encoded data_filledset
print("data_filledset after encoding all categorical columns:")
print(data_filled_encoded.head())



data_filledset before encoding:
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory  ...    Value 

In [8]:
# Check the data_filled types of all columns
print("data_filled types of columns:")
print(data_filled_encoded.dtypes)

# Identify columns that are still of object type (potentially not encoded)
remaining_categorical_columns = data_filled_encoded.select_dtypes(include='object').columns
print(f"Remaining categorical columns (not encoded): {list(remaining_categorical_columns)}")


data_filled types of columns:
TransactionId                                       int32
BatchId                                             int32
AccountId                                           int32
SubscriptionId                                      int32
CustomerId                                          int32
CountryCode                                         int64
ProductId                                           int32
Amount                                            float64
Value                                             float64
TransactionStartTime                  datetime64[ns, UTC]
PricingStrategy                                   float64
FraudResult                                       float64
Transaction_Hour                                    int32
Transaction_Day                                     int32
Transaction_Month                                   int32
Transaction_Year                                    int32
Transaction_DayOfWeek                     

In [9]:
from credit_scoring_model_development import missing_values_summary
# Check the missing values in the data_filledset
missing_summary = missing_values_summary(data_filled_encoded)
print(missing_summary)

Columns with Missing Values:
Empty DataFrame
Columns: [Missing Values, % of Total Values]
Index: []


In [10]:
# Import the scaling function from the script
from credit_scoring_model_development import scale_numerical_features

# Normalize all numerical columns
data_filled_normalized = scale_numerical_features(data_filled_encoded.copy(), method='normalization')

# Display the first few rows of the normalized data_filledset
print("Normalized data_filledset:")
print(data_filled_normalized.head())


Scaling method: normalization
Columns to scale: Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CountryCode', 'ProductId', 'Amount', 'Value', 'PricingStrategy',
       'FraudResult', 'Transaction_Hour', 'Transaction_Day',
       'Transaction_Month', 'Transaction_Year', 'Transaction_DayOfWeek',
       'Transaction_WeekOfYear'],
      dtype='object')
Numerical columns scaled using normalization.
Normalized data_filledset:
   TransactionId   BatchId  AccountId  SubscriptionId  CustomerId  \
0       0.816947  0.495528   0.685573        0.974903    0.690724   
1       0.792601  0.334940   0.886289        0.652510    0.690724   
2       0.416972  0.635727   0.746971        0.274683    0.750067   
3       0.509487  0.018954   0.922632        0.268616    0.997862   
4       0.432402  0.516212   0.886289        0.652510    0.997862   

   CountryCode  ProductId    Amount     Value      TransactionStartTime  ...  \
0          0.0   0.045455  0.467105  0.08

In [11]:

# Standardize all numerical columns
data_filled_standardized = scale_numerical_features(data_filled_encoded.copy(), method='standardization')

# Display the first few rows of the standardized data_filledset
print("Standardized data_filledset:")
print(data_filled_standardized.head())


Scaling method: standardization
Columns to scale: Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CountryCode', 'ProductId', 'Amount', 'Value', 'PricingStrategy',
       'FraudResult', 'Transaction_Hour', 'Transaction_Day',
       'Transaction_Month', 'Transaction_Year', 'Transaction_DayOfWeek',
       'Transaction_WeekOfYear'],
      dtype='object')
Numerical columns scaled using standardization.
Standardized data_filledset:
   TransactionId   BatchId  AccountId  SubscriptionId  CustomerId  \
0       1.097926 -0.015563   0.153713        1.668646    0.595070   
1       1.013589 -0.571733   0.867190        0.346449    0.595070   
2      -0.287613  0.469994   0.371965       -1.203089    0.807169   
3       0.032862 -1.666100   0.996380       -1.227972    1.692827   
4      -0.234164  0.056072   0.867190        0.346449    1.692827   

   CountryCode  ProductId    Amount     Value      TransactionStartTime  ...  \
0          0.0  -1.578054 -0.139857

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize the distribution of a selected feature before and after scaling
feature = 'Amount'
plt.figure(figsize=(12, 6))

# Original Distribution
plt.subplot(1, 2, 1)
sns.histplot(data_filled_encoded[feature], bins=30, kde=True, color='blue')
plt.title(f'Original Distribution of {feature}')

# Normalized Distribution
plt.subplot(1, 2, 2)
sns.histplot(data_filled_normalized[feature], bins=30, kde=True, color='green')
plt.title(f'Normalized Distribution of {feature}')

plt.tight_layout()
plt.show()


  plt.show()


In [13]:
# Find columns with only two unique values
binary_columns = [col for col in data_filled_encoded.columns if data_filled_encoded[col].nunique() == 2]
print("Potential Binary Target Variables:", binary_columns)



Potential Binary Target Variables: ['Transaction_Year', 'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6', 'ProductCategory_data_bundles', 'ProductCategory_financial_services', 'ProductCategory_movies', 'ProductCategory_other', 'ProductCategory_ticket', 'ProductCategory_transport', 'ProductCategory_tv', 'ProductCategory_utility_bill', 'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3', 'ChannelId_ChannelId_5']


In [14]:
# Loop through each potential binary column and print the distribution
for col in binary_columns:
    print(f"Distribution of {col}:")
    print(data_filled_encoded[col].value_counts(), "\n")


Distribution of Transaction_Year:
Transaction_Year
2018    48107
2019    47555
Name: count, dtype: int64 

Distribution of ProviderId_ProviderId_2:
ProviderId_ProviderId_2
False    95644
True        18
Name: count, dtype: int64 

Distribution of ProviderId_ProviderId_3:
ProviderId_ProviderId_3
False    92578
True      3084
Name: count, dtype: int64 

Distribution of ProviderId_ProviderId_4:
ProviderId_ProviderId_4
False    57473
True     38189
Name: count, dtype: int64 

Distribution of ProviderId_ProviderId_5:
ProviderId_ProviderId_5
False    81120
True     14542
Name: count, dtype: int64 

Distribution of ProviderId_ProviderId_6:
ProviderId_ProviderId_6
False    61476
True     34186
Name: count, dtype: int64 

Distribution of ProductCategory_data_bundles:
ProductCategory_data_bundles
False    94049
True      1613
Name: count, dtype: int64 

Distribution of ProductCategory_financial_services:
ProductCategory_financial_services
False    50257
True     45405
Name: count, dtype: int64 



In [15]:

from credit_scoring_model_development import woe_binning, calculate_rfms, normalize_features, standardize_features


# Step 1: Perform WoE Binning
# Define the new target variable
new_target_variable = 'ProductCategory_financial_services'

# Perform WoE binning using the new target variable
data_woe, iv_df = woe_binning(data_filled_encoded, target=new_target_variable)

# Display the transformed dataset and Information Value
print(f"WoE Transformed Data (Target: {new_target_variable}):\n", data_woe.head())
print(f"IV Values for Each Feature (Target: {new_target_variable}):\n", iv_df)

# Step 2: Calculate RFMS Scores
rfms_df = calculate_rfms(data_filled_encoded, group_by_column='CustomerId')
print("RFMS Scores:\n", rfms_df.head())

# Step 3: Normalize or Standardize Columns
# Choose columns to scale
columns_to_scale = ['Amount', 'Value', 'Transaction_Hour']
data_normalized = normalize_features(data_woe, columns_to_scale)
data_standardized = standardize_features(data_woe, columns_to_scale)

# Display results
print("Normalized Data:\n", data_normalized.head())
print("Standardized Data:\n", data_standardized.head())
# Convert CustomerId in data_woe to integer



WoE Transformed Data (Target: ProductCategory_financial_services):
    TransactionId  BatchId  AccountId  SubscriptionId  CustomerId  CountryCode  \
0            0.0      0.0  -0.255152       -0.255152    0.217355    -0.000002   
1            0.0      0.0  10.439841       10.494541    0.217355    -0.000002   
2            0.0      0.0  -0.997089       -0.997089   -0.997089    -0.000002   
3            0.0      0.0  -1.690236       -1.690236   -0.409302    -0.000002   
4            0.0      0.0  10.439841       10.494541   -0.409302    -0.000002   

   ProductId    Amount     Value  TransactionStartTime  ...  \
0  -9.539625 -2.532405 -1.558792                   0.0  ...   
1  10.494695  7.968246  7.970925                   0.0  ...   
2  -7.866103 -2.219860 -1.554708                   0.0  ...   
3  -7.220326 -0.229813 -0.587969                   0.0  ...   
4  10.494695  5.692510  5.692510                   0.0  ...   

   ProductCategory_data_bundles  ProductCategory_movies  \
0      

In [16]:
from sklearn.model_selection import train_test_split

# Define the feature columns and the target column
# Use rfms_df directly as your feature set
X = rfms_df[['Frequency_Score', 'Monetary_Score', 'Recency_Score', 'RFMS_Score']] 
y = rfms_df['Default_Indicator'].map({'Good': 0, 'Bad': 1})  # Convert to binary values for modeling

# Split the data into train and test sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirm the split
print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)




Training data shape: (2993, 4) (2993,)
Testing data shape: (749, 4) (749,)


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Instantiate the models
log_reg = LogisticRegression(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Train the models
log_reg.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

print("Model Training Completed!")


Model Training Completed!


In [18]:
from sklearn.metrics import classification_report, accuracy_score

# Predictions
y_pred_log_reg = log_reg.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Print evaluation metrics
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_log_reg))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred_log_reg)}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf)}")


Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       374
           1       0.99      0.97      0.98       375

    accuracy                           0.98       749
   macro avg       0.98      0.98      0.98       749
weighted avg       0.98      0.98      0.98       749

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       374
           1       1.00      0.99      1.00       375

    accuracy                           1.00       749
   macro avg       1.00      1.00      1.00       749
weighted avg       1.00      1.00      1.00       749

Logistic Regression Accuracy: 0.9799732977303071
Random Forest Accuracy: 0.9973297730307076


In [19]:
print(log_reg.predict(X_test))

[0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 1 1 1 0
 1 0 0 1 1 1 1 1 1 0 0 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0
 0 1 1 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 0 0 0 1 1 0
 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 1 0 1 1 1 1 0 1 0 1 0 0 0 1 0
 0 0 1 1 1 1 0 1 1 0 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0 0 1
 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0
 0 0 0 1 1 0 0 1 0 1 0 1 0 1 1 0 0 1 1 0 1 1 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1
 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 1 0 0 1 1 0 1 1 1 0 1 0 1 1 0 1
 1 1 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 0 0 1 1 0 1 0 0 1 1 1 1 0 0 0
 1 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 1 1 1 0 1 0 1 0 0 0 1 1 1 1 0
 1 0 0 1 0 1 0 1 1 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0
 0 1 0 0 1 0 0 1 1 1 0 0 

In [20]:
y_test

3450    0
1114    0
351     0
1983    0
2321    0
       ..
3416    0
3389    0
612     0
1551    0
1084    1
Name: Default_Indicator, Length: 749, dtype: int64

In [21]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

# Hyperparameters for Logistic Regression
log_reg_params = {
    'penalty': ['l1', 'l2', 'elasticnet', None],  # Regularization type
    'C': np.logspace(-4, 4, 20),  # Regularization strength
    'solver': ['liblinear', 'lbfgs', 'saga'],  # Solvers to use
    'max_iter': [100, 200, 300]  # Maximum number of iterations
}

# Hyperparameters for Random Forest
rf_params = {
    'n_estimators': [50, 100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    'bootstrap': [True, False],  # Whether to bootstrap samples when building trees
    'criterion': ['gini', 'entropy'],  # Criterion to measure split quality
    'max_features': ['auto', 'sqrt', 'log2']  # Number of features to consider
}


In [22]:
# Logistic Regression Randomized Search
random_search_log_reg = RandomizedSearchCV(log_reg, log_reg_params, n_iter=50, cv=5, verbose=1, random_state=42, n_jobs=-1)
random_search_log_reg.fit(X_train, y_train)

# Random Forest Randomized Search
random_search_rf = RandomizedSearchCV(rf_model, rf_params, n_iter=50, cv=5, verbose=1, random_state=42, n_jobs=-1)
random_search_rf.fit(X_train, y_train)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


100 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solve

Fitting 5 folds for each of 50 candidates, totalling 250 fits


75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packag

In [23]:
# Logistic Regression Grid Search
grid_search_log_reg = GridSearchCV(log_reg, log_reg_params, cv=5, verbose=1, n_jobs=-1)
grid_search_log_reg.fit(X_train, y_train)

# Random Forest Grid Search
grid_search_rf = GridSearchCV(rf_model, rf_params, cv=5, verbose=1, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)


Fitting 5 folds for each of 720 candidates, totalling 3600 fits


1500 fits failed out of a total of 3600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.so

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


2880 fits failed out of a total of 8640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1432 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Naim\credit-scoring-model-development\venv\Lib\site-p

In [24]:
# Best Parameters
print("Best Parameters for Logistic Regression:", random_search_log_reg.best_params_)
print("Best Parameters for Random Forest:", random_search_rf.best_params_)

# Best Models
best_log_reg = random_search_log_reg.best_estimator_
best_rf = random_search_rf.best_estimator_

# Predictions
y_pred_log_reg = best_log_reg.predict(X_test)
y_pred_rf = best_rf.predict(X_test)

# Classification Reports and Accuracy
from sklearn.metrics import classification_report, accuracy_score

print("Logistic Regression Classification Report after Hyperparameter Tuning:")
print(classification_report(y_test, y_pred_log_reg))
print("Logistic Regression Accuracy after Hyperparameter Tuning:", accuracy_score(y_test, y_pred_log_reg))

print("\nRandom Forest Classification Report after Hyperparameter Tuning:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy after Hyperparameter Tuning:", accuracy_score(y_test, y_pred_rf))


Best Parameters for Logistic Regression: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 300, 'C': 1438.44988828766}
Best Parameters for Random Forest: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'entropy', 'bootstrap': True}
Logistic Regression Classification Report after Hyperparameter Tuning:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       374
           1       1.00      1.00      1.00       375

    accuracy                           1.00       749
   macro avg       1.00      1.00      1.00       749
weighted avg       1.00      1.00      1.00       749

Logistic Regression Accuracy after Hyperparameter Tuning: 0.9986648865153538

Random Forest Classification Report after Hyperparameter Tuning:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       374
           1       1.00      0.99      

In [25]:
rfms_df.columns

Index(['CustomerId', 'TransactionId', 'TransactionStartTime', 'Amount',
       'Frequency_Score', 'Monetary_Score', 'Recency_Score', 'RFMS_Score',
       'Default_Indicator'],
      dtype='object')

In [26]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Predictions
y_pred_log_reg_proba = log_reg.predict_proba(X_test)[:, 1]  # Probability estimates for Logistic Regression
y_pred_rf_proba = rf_model.predict_proba(X_test)[:, 1]  # Probability estimates for Random Forest

# Compute ROC-AUC scores
roc_auc_log_reg = roc_auc_score(y_test, y_pred_log_reg_proba)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf_proba)

print(f"Logistic Regression ROC-AUC: {roc_auc_log_reg}")
print(f"Random Forest ROC-AUC: {roc_auc_rf}")

# Plot ROC Curve
fpr_log_reg, tpr_log_reg, _ = roc_curve(y_test, y_pred_log_reg_proba)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf_proba)

plt.figure(figsize=(10, 6))
plt.plot(fpr_log_reg, tpr_log_reg, label=f'Logistic Regression (AUC = {roc_auc_log_reg:.2f})')
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {roc_auc_rf:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Random chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()


Logistic Regression ROC-AUC: 0.9996149732620321
Random Forest ROC-AUC: 0.9999857397504456


  plt.show()


In [27]:
import joblib

# Save logistic regression model
joblib.dump(log_reg, "logistic_regression_model.pkl")

# Save random forest model
joblib.dump(rf_model, "random_forest_model.pkl")


['random_forest_model.pkl']

In [28]:
import joblib
# Save the feature names used for training
joblib.dump(list(X_train.columns), "feature_columns.pkl")


['feature_columns.pkl']