# ING Lion's Den 2024

## Team: RiskBusters

Michał Bryzik, Michał Niegierewicz, Kacper Gruca, Jan Ślusarek

In this file we clean the testing dataframe for continuous variables.

Due to the fact that there might be some anomalies in testing dataset we decided to have additional jupyter file to notice, analyze and report them.

In [85]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from datetime import date

In [86]:
data=pd.read_csv('../data/input/testing_sample.csv') 
data.head() 

Unnamed: 0,ID,customer_id,application_date,target,Application_status,Var1,Var2,Var3,Var4,Var5,...,Var22,Var23,Var24,Var25,Var26,Var27,Var28,Var29,Var30,_r_
0,36034977,32653719,03Feb2010 0:00:00,0.0,Approved,1,3.0,1.0,4800,15,...,8,8,0,4977.87,14880.28,0,0,70,3899,0.485674
1,36034978,32832365,04Feb2010 0:00:00,0.0,Approved,2,2.0,1.0,6800,18,...,1,1,29400,7922.9,6534.84,0,0,0,3899,0.267045
2,36034979,32544742,07Feb2010 0:00:00,0.0,Approved,1,3.0,1.0,4600,18,...,0,0,0,3174.15,16974.69,0,0,0,3899,0.793579
3,36034980,32395830,09Feb2010 0:00:00,,Rejected,2,2.0,2.0,7900,48,...,2,2,0,4972.82,3484.36,0,0,10,3899,0.53304
4,36034981,32592943,13Feb2010 0:00:00,0.0,Approved,2,1.0,1.0,9900,63,...,3,3,0,2278.22,9630.53,0,0,10,3899,0.781269


In [87]:
data.shape 

(5000, 36)

In [88]:
data.isnull().sum() 

ID                       0
customer_id              0
application_date         0
target                1353
Application_status       0
Var1                     0
Var2                   139
Var3                   139
Var4                     0
Var5                     0
Var6                     0
Var7                     0
Var8                  2905
Var9                     0
Var10                 3719
Var11                    0
Var12                 3719
Var13                    0
Var14                    0
Var15                    0
Var16                    0
Var17                    0
Var18                 3721
Var19                 2905
Var20                    0
Var21                    0
Var22                    0
Var23                    0
Var24                    0
Var25                 1023
Var26                 1961
Var27                    0
Var28                    0
Var29                    0
Var30                    0
_r_                      0
dtype: int64

In [89]:
data.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   customer_id         5000 non-null   int64  
 2   application_date    5000 non-null   object 
 3   target              3647 non-null   float64
 4   Application_status  5000 non-null   object 
 5   Var1                5000 non-null   int64  
 6   Var2                4861 non-null   float64
 7   Var3                4861 non-null   float64
 8   Var4                5000 non-null   int64  
 9   Var5                5000 non-null   int64  
 10  Var6                5000 non-null   int64  
 11  Var7                5000 non-null   float64
 12  Var8                2095 non-null   float64
 13  Var9                5000 non-null   int64  
 14  Var10               1281 non-null   float64
 15  Var11               5000 non-null   int64  
 16  Var12 

## Clean dataset

### Define columns by datetypes and sort df

We define the target type of the column basing on the description file and exploring data

In [90]:
# Define columns
date_columns = ['application_date', 'Var13'] 
numeric_columns = ['ID', 'customer_id', '_r_', 'Var9', 'Var10']  
categorical_columns = []
binary_columns = ['target', 'Application_status'] 

In [91]:
# Checking the data type for the rest of columns 
for column in data.columns:
    if column in date_columns + numeric_columns + binary_columns:
        continue  
    elif column in ['Var1', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'Var15', 'Var16', 'Var17', 'Var20', 'Var21', 'Var22', 'Var23', 'Var24', 'Var25', 'Var26', 'Var29', 'Var30']:
        numeric_columns.append(column)
    elif column in ['Var2', 'Var3', 'Var11', 'Var12', 'Var14']:
        categorical_columns.append(column)
    elif column in ['Var18', 'Var19', 'Var27', 'Var28']:
        binary_columns.append(column)

In [92]:
print(date_columns)
print(numeric_columns)
print(categorical_columns)
print(binary_columns)

['application_date', 'Var13']
['ID', 'customer_id', '_r_', 'Var9', 'Var10', 'Var1', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'Var15', 'Var16', 'Var17', 'Var20', 'Var21', 'Var22', 'Var23', 'Var24', 'Var25', 'Var26', 'Var29', 'Var30']
['Var2', 'Var3', 'Var11', 'Var12', 'Var14']
['target', 'Application_status', 'Var18', 'Var19', 'Var27', 'Var28']


In [93]:
# Make a copy of main data, from this moment we will work on this dataset
df = data.copy()

In [94]:
# Assumed sequence for columns sorting 
sorted_columns = date_columns + numeric_columns + categorical_columns + binary_columns

# Choose of the sorted value by defined categories
df = data[sorted_columns]

In [95]:
df.head()
# Some of the records for Var14 are Nulls, it requires analysis

Unnamed: 0,application_date,Var13,ID,customer_id,_r_,Var9,Var10,Var1,Var4,Var5,...,Var3,Var11,Var12,Var14,target,Application_status,Var18,Var19,Var27,Var28
0,03Feb2010 0:00:00,28Nov2004,36034977,32653719,0.485674,4200,,1,4800,15,...,1.0,2,,0,0.0,Approved,,,0,0
1,04Feb2010 0:00:00,19Sep1996,36034978,32832365,0.267045,5880,3700.0,2,6800,18,...,1.0,5,6.0,3,0.0,Approved,1.0,,0,0
2,07Feb2010 0:00:00,23Nov1998,36034979,32544742,0.793579,4200,,1,4600,18,...,1.0,5,,0,0.0,Approved,,,0,0
3,09Feb2010 0:00:00,13Nov2005,36034980,32395830,0.53304,4200,5400.0,2,7900,48,...,2.0,5,5.0,2,,Rejected,0.0,,0,0
4,13Feb2010 0:00:00,06Aug1995,36034981,32592943,0.781269,3120,3300.0,2,9900,63,...,1.0,4,5.0,1,0.0,Approved,,0.0,0,0


### Handle date variables

In [96]:
# Print date columns
date_columns

['application_date', 'Var13']

#### Change the type of columns

In [97]:
# Change date columns to datetime type
df['application_date'] = pd.to_datetime(df['application_date'], errors='coerce')
df['Var13'] = pd.to_datetime(df['Var13'], format='%d%b%Y', errors='coerce')         # Var13 - irregular date format

In [98]:
# Check naN values after transformation
print(df['Var13'].isnull().sum(0))
print(df['application_date'].isnull().sum(0))

49
0


After transformation some records for Var13 are empty, we need to analyze it

In [99]:
df.head()

Unnamed: 0,application_date,Var13,ID,customer_id,_r_,Var9,Var10,Var1,Var4,Var5,...,Var3,Var11,Var12,Var14,target,Application_status,Var18,Var19,Var27,Var28
0,2010-02-03,2004-11-28,36034977,32653719,0.485674,4200,,1,4800,15,...,1.0,2,,0,0.0,Approved,,,0,0
1,2010-02-04,1996-09-19,36034978,32832365,0.267045,5880,3700.0,2,6800,18,...,1.0,5,6.0,3,0.0,Approved,1.0,,0,0
2,2010-02-07,1998-11-23,36034979,32544742,0.793579,4200,,1,4600,18,...,1.0,5,,0,0.0,Approved,,,0,0
3,2010-02-09,2005-11-13,36034980,32395830,0.53304,4200,5400.0,2,7900,48,...,2.0,5,5.0,2,,Rejected,0.0,,0,0
4,2010-02-13,1995-08-06,36034981,32592943,0.781269,3120,3300.0,2,9900,63,...,1.0,4,5.0,1,0.0,Approved,,0.0,0,0


#### Analyze Var13

In [100]:
# Check indexes where conversion caused 'NaT' values
nan_indices = df['Var13'].isna()

# Compare original values of 'Var13' which after conversion became 'NaT'
original_values_with_nan = data.loc[nan_indices, 'Var13']

original_values_with_nan

19      31Dec9999
78      31Dec9999
154     31Dec9999
284     31Dec9999
350     31Dec9999
375     31Dec9999
380     31Dec9999
548     31Dec9999
820     31Dec9999
833     31Dec9999
999     31Dec9999
1143    31Dec9999
1166    31Dec9999
1208    31Dec9999
1346    31Dec9999
1447    31Dec9999
1487    31Dec9999
1514    31Dec9999
1687    31Dec9999
2142    31Dec9999
2338    31Dec9999
2361    31Dec9999
2374    31Dec9999
2415    31Dec9999
2527    31Dec9999
2555    31Dec9999
2754    31Dec9999
2854    31Dec9999
3080    31Dec9999
3212    31Dec9999
3289    31Dec9999
3343    31Dec9999
3355    31Dec9999
3569    31Dec9999
3575    31Dec9999
3608    31Dec9999
3706    31Dec9999
3965    31Dec9999
4056    31Dec9999
4096    31Dec9999
4225    31Dec9999
4267    31Dec9999
4348    31Dec9999
4485    31Dec9999
4584    31Dec9999
4641    31Dec9999
4677    31Dec9999
4753    31Dec9999
4988    31Dec9999
Name: Var13, dtype: object

In [101]:
# Check unique values of 'Var13' which after conversion became 'NaT'
original_values_with_nan.unique()

array(['31Dec9999'], dtype=object)

Summarizing we have 784 cases in Var13 with value '31Dec9999'

We treat them as data error and for this moment leave with NAs

#### Create working_months and NMOB column

Var13 = employment date

working_moths = Var13 - application_date


We would like to calculate the number of working months before applying for credit, which will indicate the employee's duration at the given position from employment to loan application.

NAs from Var13 are replaced with 0

We also want to have this column first

In [102]:
df.isna().sum()

application_date         0
Var13                   49
ID                       0
customer_id              0
_r_                      0
Var9                     0
Var10                 3719
Var1                     0
Var4                     0
Var5                     0
Var6                     0
Var7                     0
Var8                  2905
Var15                    0
Var16                    0
Var17                    0
Var20                    0
Var21                    0
Var22                    0
Var23                    0
Var24                    0
Var25                 1023
Var26                 1961
Var29                    0
Var30                    0
Var2                   139
Var3                   139
Var11                    0
Var12                 3719
Var14                    0
target                1353
Application_status       0
Var18                 3721
Var19                 2905
Var27                    0
Var28                    0
dtype: int64

In [103]:
# Create a new colum working_months
df['working_months'] = ((df['application_date'] - df['Var13']) / np.timedelta64(1, 'M')).fillna(0).astype(int)

In [104]:
# Write current date to the variable
today = pd.Timestamp(date.today())

# Create MOB column
df['MOB'] = ((today - df['application_date']) / np.timedelta64(1, 'M')).astype(int)

In [105]:
# Delete columns 'Var13' and 'application_date'
df.drop(['Var13'], axis=1, inplace=True)
df.drop(['application_date'], axis=1, inplace=True)

In [106]:
df.head()

Unnamed: 0,ID,customer_id,_r_,Var9,Var10,Var1,Var4,Var5,Var6,Var7,...,Var12,Var14,target,Application_status,Var18,Var19,Var27,Var28,working_months,MOB
0,36034977,32653719,0.485674,4200,,1,4800,15,1,463.31,...,,0,0.0,Approved,,,0,0,62,169
1,36034978,32832365,0.267045,5880,3700.0,2,6800,18,6,3170.4,...,6.0,3,0.0,Approved,1.0,,0,0,160,169
2,36034979,32544742,0.793579,4200,,1,4600,18,1,355.36,...,,0,0.0,Approved,,,0,0,134,169
3,36034980,32395830,0.53304,4200,5400.0,2,7900,48,1,231.34,...,5.0,2,,Rejected,0.0,,0,0,50,169
4,36034981,32592943,0.781269,3120,3300.0,2,9900,63,3,639.45,...,5.0,1,0.0,Approved,,0.0,0,0,174,168


Move 'working_months' and MOB to the first column

In [107]:
# Create a list of columns without 'working_months'
df_sorted_columns = df.columns.tolist()

# Remove 'working_months' and 'MOB' from the list
columns_to_remove = ['working_months', 'MOB']
df_sorted_columns = [col for col in df_sorted_columns if col not in columns_to_remove]

# Add 'working_months' to the first position
df_sorted_columns.insert(0, 'working_months')
df_sorted_columns.insert(1, 'MOB')  # Insert 'MOB' after 'working_months'

# Reindex columns
df = df[df_sorted_columns]

In [108]:
# Update the list of date and numeric columns
date_columns = []
numeric_columns = numeric_columns + ['working_months', 'MOB']

In [109]:
df.head()

Unnamed: 0,working_months,MOB,ID,customer_id,_r_,Var9,Var10,Var1,Var4,Var5,...,Var3,Var11,Var12,Var14,target,Application_status,Var18,Var19,Var27,Var28
0,62,169,36034977,32653719,0.485674,4200,,1,4800,15,...,1.0,2,,0,0.0,Approved,,,0,0
1,160,169,36034978,32832365,0.267045,5880,3700.0,2,6800,18,...,1.0,5,6.0,3,0.0,Approved,1.0,,0,0
2,134,169,36034979,32544742,0.793579,4200,,1,4600,18,...,1.0,5,,0,0.0,Approved,,,0,0
3,50,169,36034980,32395830,0.53304,4200,5400.0,2,7900,48,...,2.0,5,5.0,2,,Rejected,0.0,,0,0
4,174,168,36034981,32592943,0.781269,3120,3300.0,2,9900,63,...,1.0,4,5.0,1,0.0,Approved,,0.0,0,0


### Handle binary variables

Convert the 'Application_status' column to binary variables to facilitate analysis. Applications marked as 'Approved' are changed to 1, and those marked as 'Rejected' are changed to 0. This conversion simplifies the analysis process, allowing for straightforward interpretation of application outcomes

In [110]:
# Check unique values for binary columns
for column in binary_columns:
    print(f"Unique values for column '{column}': {df[column].unique()}")

Unique values for column 'target': [ 0. nan  1.]
Unique values for column 'Application_status': ['Approved' 'Rejected']
Unique values for column 'Var18': [nan  1.  0.]
Unique values for column 'Var19': [nan  0.  1.]
Unique values for column 'Var27': [0 1]
Unique values for column 'Var28': [0 1]


As we can observe column "Application_status" needs to be modified

In [111]:
# Change value "Approved" to 1 and "Rejected" to 0 in column 'Application_status'
df['Application_status'] = df['Application_status'].replace({'Approved': 1, 'Rejected': 0})

In [112]:
# Check and count unique values for 'Application_status' to ensure that transformation was successful
df['Application_status'].value_counts(dropna=False)

1    3647
0    1353
Name: Application_status, dtype: int64

Check the rest of binary columns

In [113]:
df[binary_columns].isna().sum()

target                1353
Application_status       0
Var18                 3721
Var19                 2905
Var27                    0
Var28                    0
dtype: int64

In [114]:
df[binary_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   target              3647 non-null   float64
 1   Application_status  5000 non-null   int64  
 2   Var18               1279 non-null   float64
 3   Var19               2095 non-null   float64
 4   Var27               5000 non-null   int64  
 5   Var28               5000 non-null   int64  
dtypes: float64(3), int64(3)
memory usage: 234.5 KB


We can observe that we have some nulls in binary variables, but we will handle them later.

### Handle categorical variables

We have 5 categorical columns:

* Var2 - Loan purpose
* Var3 - Distribution channel
* Var11 - profession of main applicant
* Var12 - profession of second applicant
* Var14 - marital status of main applicant

All of them are treated as simple categorical variables(non-ordered)

In [115]:
# Check unique values for binary columns
for column in categorical_columns:
    print(f"Unique values for column '{column}': {df[column].unique()}")

Unique values for column 'Var2': [ 3.  2.  1. nan]
Unique values for column 'Var3': [ 1.  2. nan  3.]
Unique values for column 'Var11': [2 5 4 6 3 7 1]
Unique values for column 'Var12': [nan  6.  5.  3.  4.  2.  1.  7.]
Unique values for column 'Var14': [0 3 2 1 4]


According to the variables_description.xlsx file the variable Var2 should have the following values:

* 1 Direct
* 2 Broker
* 3 Online

In the dataset this variable has 5 unique values printed above. First of all we should transform 'Direct' to 1 and 'Online' to 3

In [116]:
# Check the number of observations in Var3 for each category
df['Var3'].value_counts(dropna=False)

1.0    2454
2.0    1645
3.0     762
NaN     139
Name: Var3, dtype: int64

In [117]:
# Change the value "Direct" to 1 and "Online" to 3 in column "Var3"
df['Var3'] = df['Var3'].replace({'Direct': 1, 'Online': 3})

# Change the data type of column "Var3" to float (we still have NaN values)
df['Var3'] = df['Var3'].astype(float)

Check the unique values after transformation

In [118]:
# Check again the number of observations in Var3 for each category
df['Var3'].value_counts(dropna=False)

1.0    2454
2.0    1645
3.0     762
NaN     139
Name: Var3, dtype: int64

It looks that the transformation was successfull

In [119]:
# Count NA values for categorical columns
df[categorical_columns].isna().sum()

Var2      139
Var3      139
Var11       0
Var12    3719
Var14       0
dtype: int64

All categorical variables are ready for the further processing (filling NA values and one hot encoding).

### Handle NaN in target variable

As we can observe below we have 13282 NaN values in our target variable.

We found out that all of them are for "Application status" == rejected (1).

We analyze the approach to fill target variable of all rejected application to 1 (default). However this way would bring in a high conlusion that all of rejected people would default in the future and it would impact negatively on our model. Additionally we assume that we want to model the approved applications only.

After a long discussion we decided to delete all of rows with Application_status == 0.

In [120]:
# Check the number of NAa for target variable
df['target'].isna().sum()

1353

In [121]:
df.loc[df["Application_status"] == 0, "target"].isna().sum()

1353

In [122]:
# Drop rows where 'Application_status' == 0
df = df[df['Application_status'] != 0]

Now as we know that there are only rows with application_status == 0 we can delete this column from df

In [123]:
# Drop column 'Application_status'
df.drop(['Application_status'], axis=1, inplace=True)

In [124]:
# Delete "application_stauts" from binary_columns
binary_columns.remove('Application_status')

### Handle numeric data

In [125]:
df[numeric_columns].isna().sum()

ID                   0
customer_id          0
_r_                  0
Var9                 0
Var10             2757
Var1                 0
Var4                 0
Var5                 0
Var6                 0
Var7                 0
Var8              2030
Var15                0
Var16                0
Var17                0
Var20                0
Var21                0
Var22                0
Var23                0
Var24                0
Var25              722
Var26             1415
Var29                0
Var30                0
working_months       0
MOB                  0
dtype: int64

#### Dependence of numeric variables with categorical variables

Var10 and Var8 are strictly related to Var1 and Var2 respectively, so we wanted to check the NAs in dependent variables.
However Var1 related to Var10 does not have any missings we moved to Var8 and Var2.

We wanted to check if there are any NA observations of Var2 (Loan purpose) having Var8 (Value of goods(car)) set to some value. If yes we assume that the value is entered correctly and we are missing Var2 for some unknown reason.

We decided to fill these rows with value 1 (meaning Car Loan).

As we can observe below we have 1018 NAs before filling the data.

Var18 and Var 19 are also strictly related to Var2 and we apply similiar approach.

In [126]:
# Count values inside Var8 group by value
df['Var2'].value_counts(dropna=False)

1.0    1575
3.0    1011
2.0     960
NaN     101
Name: Var2, dtype: int64

In [127]:
# Put value 1 to the column Var2 if Var8 is not NaN
df.loc[~df['Var8'].isna(), 'Var2'] = 1

# Put value 1 to the column Var2 if Var19 is not NaN
df.loc[~df['Var19'].isna(), 'Var2'] = 1

In [128]:
df['Var2'].value_counts(dropna=False)

1.0    1617
3.0    1011
2.0     960
NaN      59
Name: Var2, dtype: int64

In [129]:
# Put value 2 to the column Var2 if Var18 is not NaN
df.loc[~df['Var18'].isna(), 'Var2'] = 2

After transformation the number of NaN in Var8 dropped by almost half

In [130]:
# count values inside Var8 group by value
df['Var2'].value_counts(dropna=False)

1.0    1617
3.0    1011
2.0     991
NaN      28
Name: Var2, dtype: int64

#### Analyze Var9 and Var10

We have 28043 null values in Var10 (income of second applicant). We would like to check how many of these NAs is because of the fact that there is just one applicant.

As we can see below all of NAs are inside this group. We decided to sum these columns into one and name it "income".

In [131]:
# Count the number of nulls where Var1 = 1
df.loc[df["Var1"] == 1, "Var10"].isna().sum()

2757

In [132]:
# Create a new colum income
df["income"] = df["Var9"] + df["Var10"].fillna(0).astype(int) # Var9 was as integer, Var10 was as float, but we do not have NAs

numeric_columns.append("income")

In [133]:
# Drop columns Var10
df.drop(['Var10'], axis=1, inplace=True)

# Update the list of numeric columns
numeric_columns.remove('Var10')

Check again the number of null values

In [134]:
df[numeric_columns].isna().sum()

ID                   0
customer_id          0
_r_                  0
Var9                 0
Var1                 0
Var4                 0
Var5                 0
Var6                 0
Var7                 0
Var8              2030
Var15                0
Var16                0
Var17                0
Var20                0
Var21                0
Var22                0
Var23                0
Var24                0
Var25              722
Var26             1415
Var29                0
Var30                0
working_months       0
MOB                  0
income               0
dtype: int64

### From here the code for continuous variables will differ

# Zmiana - kategoryczne ciągłe

Var8, Var25, Var26 - variables which are changed to categorical but have NAs

We still have a few variables with nulls:

* Var8 - Value of the goods (car) 
* Var17 - Spendings estimation - the number of NAs is low, there is a sense in filling this, we decided to use KNN to fill these variables.
* Var25 - Amount on current account 
* Var26 - Amount on savings account

When it comes to Var25 and Var26 we decidec to fill NAa with 0. NAs means probably that the client does not have account in our bank and we cannot reach that value or does not have any money on either current or saving account. In both cases there is an element of uncertainty which we decided to minimize.

Because of this our model should be more conservative and penalize clients about whom we are not sure.

In [135]:
# Check the percentage number of NAs in Var8 related to whole df
round(df['Var8'].isna().sum() / df.shape[0] * 100, 2)

55.66

In [136]:
df[numeric_columns].isna().sum()

ID                   0
customer_id          0
_r_                  0
Var9                 0
Var1                 0
Var4                 0
Var5                 0
Var6                 0
Var7                 0
Var8              2030
Var15                0
Var16                0
Var17                0
Var20                0
Var21                0
Var22                0
Var23                0
Var24                0
Var25              722
Var26             1415
Var29                0
Var30                0
working_months       0
MOB                  0
income               0
dtype: int64

### Handle remaining NULL values

In [137]:
df.isnull().sum()

working_months       0
MOB                  0
ID                   0
customer_id          0
_r_                  0
Var9                 0
Var1                 0
Var4                 0
Var5                 0
Var6                 0
Var7                 0
Var8              2030
Var15                0
Var16                0
Var17                0
Var20                0
Var21                0
Var22                0
Var23                0
Var24                0
Var25              722
Var26             1415
Var29                0
Var30                0
Var2                28
Var3               101
Var11                0
Var12             2757
Var14                0
target               0
Var18             2656
Var19             2030
Var27                0
Var28                0
income               0
dtype: int64

* Var17 - spedning estimation - as we wrote before, we will fill with KNN
* Var2 - loan purpose [categorical] - drop NA rows
* Var3 - Distribution channel [categorical] - drop NA rows
* Var12 - profession of second applicant [categorical] - add new category
* Var18 - Property ownership for property renovation
* Var19 - Clasification of the vehicle (Car, Motorbike)

When it comes to the Var18 and Var19 we analyze combining them with Var2 (loan purpose) and have 1 categorical variable with values:

1. Car loan motorbike
2. Car loan car
3. House renovation your property
4. House renovation not your property
5. Short cash

We finally decided not to use this approach, however we provide you with the short analysis why.

Var18, Var19 - dodać kolejną kategorię

#### Var12

Check if all NAs in Var12 (profession of second applicant)are if Var1 == 1, so if there is just one applicant.

The number of NAs in this category is the same as the number of NAs when Var1 == 1, so we fill it with a new category.

In [138]:
df.loc[df["Var1"] == 1, "Var12"].isna().sum()

2757

Fill NA in Var12 (profession of second applicant) with new category: 8 - just one applicant

In [139]:
df["Var12"] = df["Var12"].fillna(8)

#### Var18 and Var19 - check

Check if we can combine Var2 with Var18 and Var19 and having one categorical variable instead of one vategorical and 2 binary with huge amount of nulls. The new variable would consists of 5 values:

1. Car loan motorbike
2. Car loan car
3. House renovation your property
4. House renovation not your property
5. Short cash

First of all we need to check the values - if combining them would reduce the nulls to 0 and nulls are related only to the variable specificity or we really have some missing data.

Var2 == 1 is related to Var 19 and Var2 == 2 is related to Var 18.

In [140]:
df.shape[0]

3647

In [141]:
# Check if the number of "conditional" values is the same as the number of rows in df

# Count rows where Var2 == 1 and Var19 is not NaN
var2_19 = df[(df['Var2'] == 1) & (~df['Var19'].isna())].shape[0]

# Count rows where Var2 == 2 and Var 18 is not NaN
var2_18 = df[(df['Var2'] == 2) & (~df['Var18'].isna())].shape[0]

# Count rows where Var2 == 3
var2_3 = df.loc[df["Var2"] == 3].shape[0]

# Calculate of sum
sum = var2_19 + var2_18 + var2_3

print(var2_18)
print(var2_19)
print(var2_3)
print(sum)

991
1617
1011
3619


The number of rows in df is 36718 while the "conditional" sum is 36136, meaning that there would be a new category or we would need to add original values from Var2 and the number of categories would increase.

Lets try to create this categorical column and see the distribution.

In [142]:
# Add new column loan_desc
df['loan_desc'] = 0

# Check conditions
df.loc[(df['Var2'] == 1) & (df['Var19'] == 1), 'loan_desc'] = 1
df.loc[(df['Var2'] == 1) & (df['Var19'] == 0), 'loan_desc'] = 2
df.loc[(df['Var2'] == 2) & (df['Var18'] == 1), 'loan_desc'] = 3
df.loc[(df['Var2'] == 2) & (df['Var18'] == 0), 'loan_desc'] = 4
df.loc[df['Var2'] == 3, 'loan_desc'] = 5

# Add 'loan_desc' to the list of categorical columns
categorical_columns.append('loan_desc')

In [143]:
df['loan_desc'].value_counts(dropna=False)

1    1183
5    1011
3     826
2     434
4     165
0      28
Name: loan_desc, dtype: int64

As we can observe above introducing new variables diveded the df into smaller, not very numerous groups. Vast majority of data are concentrated around 1, 5 and 3 group so probably (there was no precise description of variable Var18 and Var19) for:

 * Car loan for car
 * House renovation for client's property
 * Short cash

These new two groups (2, 4) and additional 0 constitute a small percentage of all df.

Summarizing we decide to go with variable loan_desc

In [144]:
df.shape

(3647, 36)

#### Drop Var18 i Var19 and Var2

In [145]:
# Drop Var18 and Var19 columns
df.drop('Var2', axis=1, inplace=True)
df.drop('Var18', axis=1, inplace=True)
df.drop('Var19', axis=1, inplace=True)

In [146]:
# Update the list of binary columns
categorical_columns.remove('Var2')
binary_columns.remove('Var18')
binary_columns.remove('Var19')

In [147]:
# Drop NA rows fof Var3
df = df[~df['Var3'].isna()]

In [148]:
# sum number of observations for each category in Var3
df['Var3'].value_counts(dropna=False)

1.0    1817
2.0    1240
3.0     489
Name: Var3, dtype: int64

In [149]:
df.isna().sum()

working_months       0
MOB                  0
ID                   0
customer_id          0
_r_                  0
Var9                 0
Var1                 0
Var4                 0
Var5                 0
Var6                 0
Var7                 0
Var8              1971
Var15                0
Var16                0
Var17                0
Var20                0
Var21                0
Var22                0
Var23                0
Var24                0
Var25              708
Var26             1382
Var29                0
Var30                0
Var3                 0
Var11                0
Var12                0
Var14                0
target               0
Var27                0
Var28                0
income               0
loan_desc            0
dtype: int64

Summarizing we still have 31 NAs in Var17 which will be filled using KNN

### Handling numeric/categorical NAs

Var8, Var25, Var26 - ciągła którą zamieniamy na kategoryczną + osobna kategoria na NA

In [150]:
# Create a copy of df to further analysis
df_copy = df.copy()

#### Var8

In [151]:
def quantile_cut_and_merge(df, column_name):
    # Filter out NaN values from the specified column
    df_filtered = df.dropna(subset=[column_name])
    
    # Perform quantile cut on filtered DataFrame
    df_filtered[column_name + '_q'] = pd.qcut(df_filtered[column_name], 10, labels=False, duplicates='drop')
    
    # Merge the results back into the original DataFrame
    df = df.merge(df_filtered[['ID', column_name + '_q']], how='left', on='ID')
    
    # Fill NaN values in the new quantile column with 10
    df[column_name + '_q'] = df[column_name + '_q'].fillna(10)
    
    # Drop the original column
    df.drop(column_name, axis=1, inplace=True)
    
    return df

In [152]:
for col in ['Var8', 'Var25', 'Var26']:
    df = quantile_cut_and_merge(df, col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[column_name + '_q'] = pd.qcut(df_filtered[column_name], 10, labels=False, duplicates='drop')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[column_name + '_q'] = pd.qcut(df_filtered[column_name], 10, labels=False, duplicates='drop')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

#### Handle categorical variables without NAs

In [153]:
# Continuous columns to be changed to categorical
columns = ['Var4', 'Var5', 'Var7', 'income', 'Var24', 'Var29', 'Var30']

for col in columns:
    # Perform quantile cut on filtered DataFrame
    df[col] = pd.qcut(df[col], 10, labels=False, duplicates='drop')

#### _r_ - drop

In [154]:
# Drop _r_ column
df.drop('_r_', axis=1, inplace=True)

In [155]:
df

Unnamed: 0,working_months,MOB,ID,customer_id,Var9,Var1,Var4,Var5,Var6,Var7,...,Var12,Var14,target,Var27,Var28,income,loan_desc,Var8_q,Var25_q,Var26_q
0,62,169,36034977,32653719,4200,1,1,1,1,1,...,8.0,0,0.0,0,0,0,5,10.0,2.0,3.0
1,160,169,36034978,32832365,5880,2,1,2,6,8,...,6.0,3,0.0,0,0,4,3,10.0,4.0,1.0
2,134,169,36034979,32544742,4200,1,0,2,1,0,...,8.0,0,0.0,0,0,0,5,10.0,1.0,3.0
3,174,168,36034981,32592943,3120,2,2,8,3,2,...,5.0,1,0.0,0,0,2,2,0.0,0.0,2.0
4,196,168,36034982,32601182,4440,2,4,6,1,2,...,5.0,0,0.0,0,0,2,2,0.0,1.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3541,184,38,36039969,32399390,19800,1,8,1,1,8,...,8.0,3,0.0,0,0,8,1,7.0,2.0,9.0
3542,207,38,36039970,32880524,18360,1,7,3,1,7,...,8.0,3,0.0,0,0,8,3,10.0,10.0,10.0
3543,4,38,36039972,32807051,9240,1,8,5,1,6,...,8.0,0,0.0,0,0,4,3,10.0,9.0,8.0
3544,86,38,36039973,32826431,18000,2,9,6,1,7,...,5.0,0,0.0,0,0,9,1,7.0,4.0,8.0


#### Zmienne problematyczne:

* Var20
* Var21
* Var22
* Var30

#### Var20

In [156]:
df['Var20'].value_counts(dropna=False)

0     2298
1      841
2      241
3       91
4       39
5       15
6       12
7        3
9        2
8        2
20       1
10       1
Name: Var20, dtype: int64

In [157]:
# create a new column Var20_1 filled basing on Var20
def process_var(var, df):
    df[var + '_1'] = 0
    df.loc[df[var].isin([1, 2]), var + '_1'] = 1
    df.loc[df[var].isin([3, 4, 5]), var + '_1'] = 2
    df.loc[df[var] > 5, var + '_1'] = 3
    return df

In [158]:
df = process_var('Var20', df)

In [159]:
df['Var20_1'].value_counts(dropna=False)

0    2298
1    1082
2     145
3      21
Name: Var20_1, dtype: int64

In [160]:
df.drop('Var20', axis=1, inplace=True)

#### Var 21

In [161]:
df['Var21'].value_counts(dropna=False)

0     1690
1      989
2      459
3      202
4      117
5       41
6       28
8        8
7        5
10       3
9        2
22       1
12       1
Name: Var21, dtype: int64

In [162]:
df = process_var('Var21', df)

In [163]:
df['Var21_1'].value_counts(dropna=False)

0    1690
1    1448
2     360
3      48
Name: Var21_1, dtype: int64

In [164]:
df.drop('Var21', axis=1, inplace=True)

#### Var22

In [165]:
df['Var22'].value_counts(dropna=False)

0     1303
1      987
2      615
3      296
4      169
5       81
6       46
7       21
8       13
10       6
9        4
22       1
13       1
11       1
12       1
14       1
Name: Var22, dtype: int64

In [166]:
df = process_var('Var22', df)

In [167]:
df['Var22_1'].value_counts(dropna=False)

1    1602
0    1303
2     546
3      95
Name: Var22_1, dtype: int64

In [168]:
df.drop('Var22', axis=1, inplace=True)

#### Var23

In [169]:
df['Var23'].value_counts(dropna=False)

0     1007
1      958
2      658
3      372
4      249
5      135
6       75
7       40
8       18
10      12
9       10
12       4
13       2
11       2
14       2
20       1
22       1
Name: Var23, dtype: int64

In [170]:
df = process_var('Var23', df)

In [171]:
df['Var23_1'].value_counts(dropna=False)

1    1616
0    1007
2     756
3     167
Name: Var23_1, dtype: int64

In [172]:
df.drop('Var23', axis=1, inplace=True)

#### Var30

Leave as is

In [173]:
df['Var30'].value_counts(dropna=False)

6    785
1    593
7    430
0    389
4    370
3    331
5    329
2    319
Name: Var30, dtype: int64

In [174]:
df.isna().sum()

working_months    0
MOB               0
ID                0
customer_id       0
Var9              0
Var1              0
Var4              0
Var5              0
Var6              0
Var7              0
Var15             0
Var16             0
Var17             0
Var24             0
Var29             0
Var30             0
Var3              0
Var11             0
Var12             0
Var14             0
target            0
Var27             0
Var28             0
income            0
loan_desc         0
Var8_q            0
Var25_q           0
Var26_q           0
Var20_1           0
Var21_1           0
Var22_1           0
Var23_1           0
dtype: int64

### Fill Var17

Fill values Var17 with values from first df (development_sample_cleaned_continuous)

In [175]:
df_cont = pd.read_csv('../data/output/countinuous_dfs/testing_sample_cleaned_continuous.csv')

In [176]:
# fill NAs in Var17 from df_nont by left join
df = df.merge(df_cont[['ID', 'Var17']], how='left', on='ID')

In [177]:
# Drop Var 17 from df
df.drop('Var17_x', axis=1, inplace=True)

In [178]:
# Rename Var17_y to Var17
df.rename(columns={'Var17_y': 'Var17'}, inplace=True)

### Change Var 17 to categorical

In [179]:
df

Unnamed: 0,working_months,MOB,ID,customer_id,Var9,Var1,Var4,Var5,Var6,Var7,...,income,loan_desc,Var8_q,Var25_q,Var26_q,Var20_1,Var21_1,Var22_1,Var23_1,Var17
0,62,169,36034977,32653719,4200,1,1,1,1,1,...,0,5,10.0,2.0,3.0,1,3,3,3,2436.83
1,160,169,36034978,32832365,5880,2,1,2,6,8,...,4,3,10.0,4.0,1.0,1,1,1,1,4430.26
2,134,169,36034979,32544742,4200,1,0,2,1,0,...,0,5,10.0,1.0,3.0,0,0,0,0,1695.92
3,174,168,36034981,32592943,3120,2,2,8,3,2,...,2,2,0.0,0.0,2.0,2,2,2,2,3785.22
4,196,168,36034982,32601182,4440,2,4,6,1,2,...,2,2,0.0,1.0,10.0,0,0,0,0,2720.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3541,184,38,36039969,32399390,19800,1,8,1,1,8,...,8,1,7.0,2.0,9.0,0,0,0,0,10295.46
3542,207,38,36039970,32880524,18360,1,7,3,1,7,...,8,3,10.0,10.0,10.0,0,2,2,2,8253.75
3543,4,38,36039972,32807051,9240,1,8,5,1,6,...,4,3,10.0,9.0,8.0,1,1,1,1,4263.92
3544,86,38,36039973,32826431,18000,2,9,6,1,7,...,9,1,7.0,4.0,8.0,0,0,0,0,12414.93


In [180]:
col = 'Var17'
df['Var17_1'] = pd.qcut(df[col], 10, labels=False, duplicates='drop')

In [181]:
df['Var17_1'].value_counts(dropna=False)

1    355
0    355
3    355
5    355
7    355
9    355
4    354
2    354
6    354
8    354
Name: Var17_1, dtype: int64

In [182]:
df.drop('Var17', axis=1, inplace=True)

In [183]:
col = 'working_months'
df['working_months_1'] = pd.qcut(df[col], 10, labels=False, duplicates='drop')

In [184]:
df['working_months_1'].value_counts(dropna=False)

0    361
5    360
4    359
8    357
1    357
7    355
3    355
6    349
2    347
9    346
Name: working_months_1, dtype: int64

In [185]:
df.drop('working_months', axis=1, inplace=True)

### Add columns ii_ratio and idi_ratio from previous df

In [186]:
# Add columns by left join
df = df.merge(df_cont[['ID', 'ii_ratio', 'idi_ratio']], how='left', on='ID')

### Change columns ii_ratio and idi_ratio to categorical

There are no NAs

In [187]:
# Continuous columns to be changed to categorical
columns = ['ii_ratio', 'idi_ratio', 'MOB']

for col in columns:
    # Perform quantile cut on filtered DataFrame
    df[col] = pd.qcut(df[col], 10, labels=False, duplicates='drop')

  diff_b_a = subtract(b, a)


### Final check of df

In [188]:
df.head()

Unnamed: 0,MOB,ID,customer_id,Var9,Var1,Var4,Var5,Var6,Var7,Var15,...,Var25_q,Var26_q,Var20_1,Var21_1,Var22_1,Var23_1,Var17_1,working_months_1,ii_ratio,idi_ratio
0,9,36034977,32653719,4200,1,1,1,1,1,1,...,2.0,3.0,1,3,3,3,1,2,5,6
1,9,36034978,32832365,5880,2,1,2,6,8,0,...,4.0,1.0,1,1,1,1,4,5,9,9
2,9,36034979,32544742,4200,1,0,2,1,0,0,...,1.0,3.0,0,0,0,0,0,4,4,2
3,9,36034981,32592943,3120,2,2,8,3,2,0,...,0.0,2.0,2,2,2,2,3,6,4,0
4,9,36034982,32601182,4440,2,4,6,1,2,0,...,1.0,10.0,0,0,0,0,2,7,3,7


In [189]:
df.isna().sum()

MOB                 0
ID                  0
customer_id         0
Var9                0
Var1                0
Var4                0
Var5                0
Var6                0
Var7                0
Var15               0
Var16               0
Var24               0
Var29               0
Var30               0
Var3                0
Var11               0
Var12               0
Var14               0
target              0
Var27               0
Var28               0
income              0
loan_desc           0
Var8_q              0
Var25_q             0
Var26_q             0
Var20_1             0
Var21_1             0
Var22_1             0
Var23_1             0
Var17_1             0
working_months_1    0
ii_ratio            0
idi_ratio           0
dtype: int64

In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3546 entries, 0 to 3545
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MOB               3546 non-null   int64  
 1   ID                3546 non-null   int64  
 2   customer_id       3546 non-null   int64  
 3   Var9              3546 non-null   int64  
 4   Var1              3546 non-null   int64  
 5   Var4              3546 non-null   int64  
 6   Var5              3546 non-null   int64  
 7   Var6              3546 non-null   int64  
 8   Var7              3546 non-null   int64  
 9   Var15             3546 non-null   int64  
 10  Var16             3546 non-null   int64  
 11  Var24             3546 non-null   int64  
 12  Var29             3546 non-null   int64  
 13  Var30             3546 non-null   int64  
 14  Var3              3546 non-null   float64
 15  Var11             3546 non-null   int64  
 16  Var12             3546 non-null   float64


In [191]:
# drop var9
df.drop('Var9', axis=1, inplace=True)

#### Write df as csv

In [192]:
df.to_csv('../data/output/categorical_dfs/testing_sample_cleaned_categorical.csv', index = False)  # Save the cleaned data to a new CSV file