In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Dev_data_to_be_shared.csv')

df.head()

Unnamed: 0,account_number,bad_flag,onus_attribute_1,transaction_attribute_1,transaction_attribute_2,transaction_attribute_3,transaction_attribute_4,transaction_attribute_5,transaction_attribute_6,transaction_attribute_7,...,bureau_enquiry_47,bureau_enquiry_48,bureau_enquiry_49,bureau_enquiry_50,onus_attribute_43,onus_attribute_44,onus_attribute_45,onus_attribute_46,onus_attribute_47,onus_attribute_48
0,1,0,,,,,,,,,...,0.0,0.0,0.0,1.0,,,,,,
1,2,0,221000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,25000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,,,,,,
3,4,0,86000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,30.0,,,,,,
4,5,0,215000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,,,,,,


In [2]:
# Data Overview
# Check the shape of the dataset and data types
df_shape = df.shape
df_dtypes = df.dtypes

# Display the shape and data types
df_shape, df_dtypes.head()

((96806, 1216),
 account_number               int64
 bad_flag                     int64
 onus_attribute_1           float64
 transaction_attribute_1    float64
 transaction_attribute_2    float64
 dtype: object)

The dataset contains **96,806 rows** and **1,216 columns**. The first few columns have the following data types:

- `account_number`: int64
- `bad_flag`: int64
- `onus_attribute_1`: float64
- `transaction_attribute_1`: float64
- `transaction_attribute_2`: float64

Next, let's identify columns with missing values and their proportions.

In [3]:
# Calculate the percentage of missing values in each column
missing_values = df.isnull().mean() * 100

# Filter columns with missing values
missing_values = missing_values[missing_values > 0]

# Sort by percentage of missing values
df_missing_values = missing_values.sort_values(ascending=False)

df_missing_values.head(10)

bureau_447           100.000000
bureau_436           100.000000
bureau_449            94.124331
bureau_148            93.552053
bureau_448            90.028511
onus_attribute_45     88.006942
onus_attribute_44     88.006942
onus_attribute_43     88.006942
onus_attribute_48     88.006942
onus_attribute_47     88.006942
dtype: float64

In [4]:
# Descriptive statistics for numerical columns
descriptive_stats = df.describe()
descriptive_stats

Unnamed: 0,account_number,bad_flag,onus_attribute_1,transaction_attribute_1,transaction_attribute_2,transaction_attribute_3,transaction_attribute_4,transaction_attribute_5,transaction_attribute_6,transaction_attribute_7,...,bureau_enquiry_47,bureau_enquiry_48,bureau_enquiry_49,bureau_enquiry_50,onus_attribute_43,onus_attribute_44,onus_attribute_45,onus_attribute_46,onus_attribute_47,onus_attribute_48
count,96806.0,96806.0,71575.0,71575.0,71575.0,71575.0,71575.0,71575.0,71575.0,71575.0,...,94212.0,94212.0,94212.0,94212.0,11610.0,11610.0,11610.0,11610.0,11610.0,11610.0
mean,48403.5,0.014173,154239.1,9.570769,0.002207,4.092854,77.306435,0.006315,67.04355,0.081034,...,0.0,0.189657,0.044283,7.800673,0.588114,1.497158,0.54565,1.430491,0.121447,0.119208
std,27945.629417,0.118203,172992.5,1513.967595,0.11924,301.580599,3164.987013,0.129545,2516.330899,14.203615,...,0.0,0.597298,0.369451,8.655149,1.059613,1.702795,1.055865,1.72895,0.601256,0.599697
min,1.0,0.0,25000.0,0.0,0.0,0.0,-109800.4766,0.0,-3498.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,24202.25,0.0,59000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0
50%,48403.5,0.0,100000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,1.0,0.0,1.0,0.0,0.0
75%,72604.75,0.0,181000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,11.0,1.0,2.0,1.0,2.0,0.0,0.0
max,96806.0,1.0,2800000.0,398936.791,25.0,55000.0,358986.0,12.0,358986.0,3150.0,...,0.0,18.0,14.0,102.0,19.0,38.0,19.0,38.0,15.0,15.0


In [5]:
# Calculate correlation matrix
correlation_matrix = df.corr()

# Extract correlation of `bad_flag` with other features
bad_flag_correlation = correlation_matrix['bad_flag'].sort_values(ascending=False)

# Display the top 10 features most correlated with `bad_flag`
bad_flag_correlation.head(10)

bad_flag             1.000000
onus_attribute_2     0.108491
onus_attribute_17    0.103156
onus_attribute_23    0.100748
onus_attribute_20    0.100672
onus_attribute_26    0.098549
onus_attribute_32    0.096320
onus_attribute_29    0.093453
onus_attribute_33    0.092944
onus_attribute_35    0.090654
Name: bad_flag, dtype: float64

In [6]:
# Feature Engineering
# Example: Create a new feature that is the sum of some of the onus attributes
# This is just an example; in practice, you would create features based on domain knowledge

df['onus_sum'] = df[['onus_attribute_1', 'onus_attribute_2', 'onus_attribute_3']].sum(axis=1)

# Check the new feature
new_feature_stats = df['onus_sum'].describe()
new_feature_stats

count    9.680600e+04
mean     1.140393e+05
std      1.634346e+05
min     -8.487496e-01
25%      9.211882e-01
50%      6.900000e+04
75%      1.450000e+05
max      2.800001e+06
Name: onus_sum, dtype: float64

In [4]:
import pandas as pd

# Reload the dataset
df = pd.read_csv('Dev_data_to_be_shared.csv')

df.head()

Unnamed: 0,account_number,bad_flag,onus_attribute_1,transaction_attribute_1,transaction_attribute_2,transaction_attribute_3,transaction_attribute_4,transaction_attribute_5,transaction_attribute_6,transaction_attribute_7,...,bureau_enquiry_47,bureau_enquiry_48,bureau_enquiry_49,bureau_enquiry_50,onus_attribute_43,onus_attribute_44,onus_attribute_45,onus_attribute_46,onus_attribute_47,onus_attribute_48
0,1,0,,,,,,,,,...,0.0,0.0,0.0,1.0,,,,,,
1,2,0,221000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,25000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,,,,,,
3,4,0,86000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,30.0,,,,,,
4,5,0,215000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,,,,,,


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# Prepare the data
X = df.drop(columns=['bad_flag'])
y = df['bad_flag']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Fit the model
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

accuracy, classification_report_str

Parameters: { "use_label_encoder" } are not used.



(0.9859002169197397,
 '              precision    recall  f1-score   support\n\n           0       0.99      1.00      0.99     19092\n           1       0.29      0.01      0.01       270\n\n    accuracy                           0.99     19362\n   macro avg       0.64      0.50      0.50     19362\nweighted avg       0.98      0.99      0.98     19362\n')

In [6]:
from catboost import CatBoostClassifier

# Define the CatBoost model
catboost_model = CatBoostClassifier(verbose=0, random_seed=42)

# Fit the model
catboost_model.fit(X_train, y_train)

# Predict on the test set
y_pred_catboost = catboost_model.predict(X_test)

# Calculate accuracy and classification report
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
classification_report_catboost = classification_report(y_test, y_pred_catboost)

accuracy_catboost, classification_report_catboost

ModuleNotFoundError: No module named 'catboost'

In [7]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable


Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)


Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)




Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.7 MB[0m [31m?[0m eta [36m-:--:--[0m

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/98.7 MB[0m [31m18.5 MB/s[0m eta [36m0:00:06[0m[2K   [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/98.7 MB[0m [31m100.8 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/98.7 MB[0m [31m232.4 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/98.7 MB[0m [31m168.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/98.7 MB[0m [31m132.3 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.4/98.7 MB[0m [31m169.4 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.4/98.7 MB[0m [31m169.4 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.4/98.7 MB[0m [31m169.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.0/98.7 MB[0m [31m62.6 MB/s[0m eta [36m0:00:02[0m

[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.4/98.7 MB[0m [31m62.5 MB/s[0m eta [36m0:00:02[0m

[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/98.7 MB[0m [31m56.7 MB/s[0m eta [36m0:00:02[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/98.7 MB[0m [31m212.7 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m55.5/98.7 MB[0m [31m225.1 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m63.5/98.7 MB[0m [31m229.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m70.9/98.7 MB[0m [31m218.5 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m76.5/98.7 MB[0m [31m180.0 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m83.3/98.7 MB[0m [31m185.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m91.2/98.7 MB[0m [31m222.6 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m98.7/98.7 MB[0m [31m226.5 MB/s[0m eta [36m0:00:01[0m

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

Installing collected packages: graphviz, catboost


Successfully installed catboost-1.2.7 graphviz-0.20.3



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
from catboost import CatBoostClassifier

# Define the CatBoost model
catboost_model = CatBoostClassifier(verbose=0)

# Fit the model
catboost_model.fit(X_train, y_train)

# Predict on the test set
y_pred = catboost_model.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

accuracy, classification_report_str

(0.9860035120338808,
 '              precision    recall  f1-score   support\n\n           0       0.99      1.00      0.99     19092\n           1       0.00      0.00      0.00       270\n\n    accuracy                           0.99     19362\n   macro avg       0.49      0.50      0.50     19362\nweighted avg       0.97      0.99      0.98     19362\n')