# 1. Import dataset

This dataset is imported from https://www.kaggle.com/datasets/caesarmario/application-data. It is derived from another Kaggle dataset which has very dirty data, with a lot of duplicated and missing values. I tried to clean and classify that data myself but failed; fortunately, I found this other dataset, which is clean and ready to use.

In [2]:
import pandas as pd

# Read the applications CSV into a DataFrame and print its schema summary
# (column names, non-null counts and dtypes).
raw_csv_path = 'raw/applications.csv'
data = pd.read_csv(raw_csv_path)
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 25128 entries, 0 to 25127
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Applicant_ID          25128 non-null  int64
 1   Applicant_Gender      25128 non-null  str  
 2   Owned_Car             25128 non-null  int64
 3   Owned_Realty          25128 non-null  int64
 4   Total_Children        25128 non-null  int64
 5   Total_Income          25128 non-null  int64
 6   Income_Type           25128 non-null  str  
 7   Education_Type        25128 non-null  str  
 8   Family_Status         25128 non-null  str  
 9   Housing_Type          25128 non-null  str  
 10  Owned_Mobile_Phone    25128 non-null  int64
 11  Owned_Work_Phone      25128 non-null  int64
 12  Owned_Phone           25128 non-null  int64
 13  Owned_Email           25128 non-null  int64
 14  Job_Title             25128 non-null  str  
 15  Total_Family_Members  25128 non-null  int64
 16  Applicant_Age  

# 2. Rename dataset columns to minimalist names

In [3]:
# Mapping from the original column names (keyword names) to the short names
# (values) used throughout the rest of the notebook.
renamed_columns = dict(
    Applicant_ID='id',
    Applicant_Gender='gender',
    Owned_Car='car',
    Owned_Realty='realty',
    Total_Children='cnt_children',
    Total_Income='income',
    Income_Type='income_type',
    Education_Type='education_type',
    Family_Status='fam_status',
    Housing_Type='housing_type',
    Owned_Mobile_Phone='mobile_phone',
    Owned_Work_Phone='work_phone',
    Owned_Phone='phone',
    Owned_Email='email',
    Job_Title='job_title',
    Total_Family_Members='cnt_fam_members',
    Applicant_Age='age',
    Years_of_Working='work_experience',
    Total_Bad_Debt='bad_debt',
    Total_Good_Debt='good_debt',
    Status='status',
)

# Apply the mapping to the existing frame, then show the resulting schema.
data.rename(columns=renamed_columns, inplace=True)
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 25128 entries, 0 to 25127
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   id               25128 non-null  int64
 1   gender           25128 non-null  str  
 2   car              25128 non-null  int64
 3   realty           25128 non-null  int64
 4   cnt_children     25128 non-null  int64
 5   income           25128 non-null  int64
 6   income_type      25128 non-null  str  
 7   education_type   25128 non-null  str  
 8   fam_status       25128 non-null  str  
 9   housing_type     25128 non-null  str  
 10  mobile_phone     25128 non-null  int64
 11  work_phone       25128 non-null  int64
 12  phone            25128 non-null  int64
 13  email            25128 non-null  int64
 14  job_title        25128 non-null  str  
 15  cnt_fam_members  25128 non-null  int64
 16  age              25128 non-null  int64
 17  work_experience  25128 non-null  int64
 18  bad_debt         

# 3. Check for duplicated values

In [4]:
# Count rows that repeat an earlier row across every column.
duplicate_row_mask = data.duplicated()
duplicate_row_mask.sum()

np.int64(0)

In [5]:
# Count rows that are identical in every column except the identifier.
# `list.remove` mutates the list in place and returns None, so passing its
# result as `subset` made pandas fall back to comparing *all* columns
# (including the unique id), which can never reveal id-only-different copies.
# Build the subset explicitly instead.
# NOTE(review): re-run this cell; the count may differ from the previously
# recorded output.
non_id_columns = [column for column in renamed_columns.values() if column != 'id']
data.duplicated(subset=non_id_columns).sum()

np.int64(0)

# 4. Treat the different column types

## 4.1. Get the column types

In [6]:
# Identifier and target columns; neither is treated as a feature type below.
id_col = 'id'
outcome_col = 'status'

# Columns listed here are handled as binary flags (left untouched by the
# encoding and scaling steps).
binary_columns = [
    'car',
    'realty',
    'mobile_phone',
    'work_phone',
    'phone',
    'email',
]

# Text-valued columns that are normalised and one-hot encoded later on.
categorical_columns = [
    'gender',
    'income_type',
    'education_type',
    'fam_status',
    'housing_type',
    'job_title',
]

# Every remaining column is considered numerical. A list comprehension over
# `data.columns` is used instead of a set difference so the result keeps the
# frame's column order and is identical on every kernel run (set iteration
# order of strings is arbitrary and can change between interpreter sessions).
# NOTE(review): this is derived from the current state of `data`; it must run
# before the one-hot encoding step, otherwise the dummy columns would be
# picked up as numerical.
_non_numerical_columns = {id_col, outcome_col, *binary_columns, *categorical_columns}
numerical_columns = [
    column for column in data.columns if column not in _non_numerical_columns
]

## 4.2. Treat categorical columns

In [7]:
import re

def delete_slash(text: str) -> str:
    """Return a normalised label built from the part of ``text`` before the first slash.

    The text preceding the first ``/`` (or the whole string when there is no
    slash) is stripped of surrounding whitespace, its spaces are replaced by
    underscores, and the result is lower-cased.
    """
    leading_part, _slash, _remainder = text.partition('/')
    trimmed = leading_part.strip()
    return trimmed.replace(' ', '_').lower()

def _clean_category(value):
    """Normalise string labels with ``delete_slash``; return any other value unchanged."""
    if isinstance(value, str):
        return delete_slash(value)
    return value


# Normalise every categorical column and print its distinct labels so the
# resulting categories can be inspected.
for column_name in categorical_columns:
    data[column_name] = data[column_name].apply(_clean_category)
    print(data[column_name].unique().tolist())

['m', 'f']
['working', 'commercial_associate', 'state_servant', 'student', 'pensioner']
['secondary', 'higher_education', 'incomplete_higher', 'lower_secondary', 'academic_degree']
['married', 'single', 'civil_marriage', 'separated', 'widow']
['house', 'rented_apartment', 'municipal_apartment', 'with_parents', 'co-op_apartment', 'office_apartment']
['security_staff', 'sales_staff', 'accountants', 'laborers', 'managers', 'drivers', 'core_staff', 'high_skill_tech_staff', 'cleaning_staff', 'private_service_staff', 'cooking_staff', 'low-skill_laborers', 'medicine_staff', 'secretaries', 'waiters', 'hr_staff', 'realty_agents', 'it_staff']


In [8]:
# One-hot encode the categorical columns as integer indicator columns,
# dropping the first level of each column, then show the widened schema.
encoding_options = dict(
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)
data = pd.get_dummies(data, **encoding_options)
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 25128 entries, 0 to 25127
Data columns (total 50 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   id                                25128 non-null  int64
 1   car                               25128 non-null  int64
 2   realty                            25128 non-null  int64
 3   cnt_children                      25128 non-null  int64
 4   income                            25128 non-null  int64
 5   mobile_phone                      25128 non-null  int64
 6   work_phone                        25128 non-null  int64
 7   phone                             25128 non-null  int64
 8   email                             25128 non-null  int64
 9   cnt_fam_members                   25128 non-null  int64
 10  age                               25128 non-null  int64
 11  work_experience                   25128 non-null  int64
 12  bad_debt                          25128 non

## 4.3. Standardize numerical columns

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardise the numerical columns: fit the scaler on them, then overwrite
# the same columns with the transformed values and preview the first rows.
# NOTE(review): the scaler is fitted on the full table; if a train/test split
# happens downstream, confirm this is intended.
scaler = StandardScaler()
scaler.fit(data[numerical_columns])
data[numerical_columns] = scaler.transform(data[numerical_columns])
data.head(10)

Unnamed: 0,id,car,realty,cnt_children,income,mobile_phone,work_phone,phone,email,cnt_fam_members,...,job_title_laborers,job_title_low-skill_laborers,job_title_managers,job_title_medicine_staff,job_title_private_service_staff,job_title_realty_agents,job_title_sales_staff,job_title_secretaries,job_title_security_staff,job_title_waiters
0,5008806,1,1,-0.66779,-0.787766,1,0,0,0,-0.313622,...,0,0,0,0,0,0,0,0,1,0
1,5008808,0,1,-0.66779,0.719137,1,0,1,1,-1.390219,...,0,0,0,0,0,0,1,0,0,0
2,5008809,0,1,-0.66779,0.719137,1,0,1,1,-1.390219,...,0,0,0,0,0,0,1,0,0,0
3,5008810,0,1,-0.66779,0.719137,1,0,1,1,-1.390219,...,0,0,0,0,0,0,1,0,0,0
4,5008811,0,1,-0.66779,0.719137,1,0,1,1,-1.390219,...,0,0,0,0,0,0,1,0,0,0
5,5008815,1,1,-0.66779,0.719137,1,1,1,1,-0.313622,...,0,0,0,0,0,0,0,0,0,0
6,5008819,1,1,-0.66779,-0.572494,1,0,0,0,-0.313622,...,1,0,0,0,0,0,0,0,0,0
7,5008820,1,1,-0.66779,-0.572494,1,0,0,0,-0.313622,...,1,0,0,0,0,0,0,0,0,0
8,5008821,1,1,-0.66779,-0.572494,1,0,0,0,-0.313622,...,1,0,0,0,0,0,0,0,0,0
9,5008822,1,1,-0.66779,-0.572494,1,0,0,0,-0.313622,...,1,0,0,0,0,0,0,0,0,0


# 5. Save the prepared dataset

In [11]:
# Move the outcome column to the last position, then write the prepared
# table to CSV without the index.
leading_columns = [name for name in data.columns if name != outcome_col]
data = data[leading_columns + [outcome_col]]
data.to_csv('clean/applications.csv', index=False)