# <center> **Home Credit Default Risk Assessment**
# <center> **Pre-Processing**

# **Libraries**

In [3]:
import pandas as pd
import numpy as np

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Data**

## **Load Data**

In [3]:
train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv",
    index_col=False
)


test = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_test.csv",
    index_col=False
)

# **Pre-Processing**

## **Removing Empty Features**

In [6]:
list_columns = functions.check_columns_with_one_uniquevalue(test)

In [7]:
train = train.drop(list_columns, axis = 1)

In [8]:
train.shape

(307511, 111)

## **Removing Inaccurate Entries**

In [9]:
train = train[train['CODE_GENDER'] != 'XNA']
train['DAYS_EMPLOYED'][train['DAYS_EMPLOYED'] == 365243] = np.nan

## **Remove Infinity Values**

In [10]:
train.replace([np.inf, -np.inf], np.nan, inplace=True)

## **'DAY_BIRTH' to 'AGE' in Years**

In [11]:
train["DAYS_BIRTH"] = abs(train["DAYS_BIRTH"])
train["DAYS_BIRTH"] = train["DAYS_BIRTH"]/365
train.rename(columns={'DAYS_BIRTH': 'AGE'}, inplace=True)
train["AGE"].describe()  

count    307507.000000
mean         43.937061
std          11.956116
min          20.517808
25%          34.008219
50%          43.150685
75%          53.923288
max          69.120548
Name: AGE, dtype: float64

## **'DAYS_EMPLOYED' to 'YEARS_EMPLOYED'**

In [12]:
train["DAYS_EMPLOYED"] = abs(train["DAYS_EMPLOYED"])
train["DAYS_EMPLOYED"] = train["DAYS_EMPLOYED"]/365
train.rename(columns={"DAYS_EMPLOYED": 'YEARS_EMPLOYED'}, inplace=True)
train["YEARS_EMPLOYED"].describe()  

count    252133.000000
mean          6.531897
std           6.406377
min           0.000000
25%           2.101370
50%           4.515068
75%           8.698630
max          49.073973
Name: YEARS_EMPLOYED, dtype: float64

## **'DAYS_ID_PUBLISH' to 'YEARS_ID_PUBLISH'**

In [13]:
train["DAYS_ID_PUBLISH"] = abs(train["DAYS_ID_PUBLISH"])
train["DAYS_ID_PUBLISH"] = train["DAYS_ID_PUBLISH"]/365
train.rename(columns={"DAYS_ID_PUBLISH": 'YEARS_ID_PUBLISH'}, inplace=True)
train["YEARS_ID_PUBLISH"].describe()  

count    307507.000000
mean          8.203292
std           4.135492
min           0.000000
25%           4.712329
50%           8.915068
75%          11.778082
max          19.717808
Name: YEARS_ID_PUBLISH, dtype: float64

## **'DAYS_REGISTRATION' to 'YEARS_REGISTRATION'**

In [14]:
train["DAYS_REGISTRATION"] = abs(train["DAYS_REGISTRATION"])
train["DAYS_REGISTRATION"] = train["DAYS_REGISTRATION"]/365
train.rename(columns={"DAYS_REGISTRATION": 'YEARS_REGISTRATION'}, inplace=True)
train["YEARS_REGISTRATION"].describe()  

count    307507.000000
mean               NaN
std           0.000000
min           0.000000
25%           5.507812
50%          12.335938
75%          20.500000
max          67.625000
Name: YEARS_REGISTRATION, dtype: float64

## **'DAYS_LAST_PHONE_CHANGE' to 'YEARS_LAST_PHONE_CHANGE'**

In [15]:
train["DAYS_LAST_PHONE_CHANGE"] = abs(train["DAYS_LAST_PHONE_CHANGE"])
train["DAYS_LAST_PHONE_CHANGE"] = train["DAYS_LAST_PHONE_CHANGE"]/365
train.rename(columns={"DAYS_LAST_PHONE_CHANGE": 'YEARS_LAST_PHONE_CHANGE'}, inplace=True)
train["YEARS_LAST_PHONE_CHANGE"].describe()  

count    307506.000000
mean               NaN
std           0.000000
min           0.000000
25%           0.750488
50%           2.074219
75%           4.300781
max          11.757812
Name: YEARS_LAST_PHONE_CHANGE, dtype: float64

# **Missing Values**

In [None]:
functions.MissingValues(train)

# **Save Dataframe as CSV File**

In [17]:
train.shape

(307507, 111)

In [18]:
train.to_csv(r"C:\Users\Dell\Documents\AI\Risk\Data\Data\train 20.csv", index=False)