# Importing and getting a glimpse of data 

In [3]:
import pandas as pd
import numpy as np
from matplotlib.pyplot import plot as plt
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [4]:
# Load the training set from application_train.csv (path is relative to the working directory).
train_data = pd.read_csv('application_train.csv')
print(train_data.shape)  # (number of rows, number of columns)
train_data.head()  # preview of the first five rows

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Load the test set from application_test.csv (path is relative to the working directory).
test_data = pd.read_csv('application_test.csv')
print(test_data.shape)  # (number of rows, number of columns)
test_data.head()  # preview of the first five rows

(48744, 121)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [58]:
train_data['TARGET'].value_counts()  # Number of applications per TARGET value: 0 = loan repaid, 1 = loan not repaid

0    282686
1     24825
Name: TARGET, dtype: int64

In [59]:
# Gender distribution of the training data, printed as percentages.
Gender=train_data['CODE_GENDER'].value_counts()  # number of rows per CODE_GENDER value
Gender_perc = Gender/len(train_data)  # divide by the total row count to get fractions
print(Gender_perc*100)


F      65.834393
M      34.164306
XNA     0.001301
Name: CODE_GENDER, dtype: float64


# Missing data

In [61]:
def missing_data(data):
    """Count the missing values in every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Number of null entries per column, indexed by column name and
        sorted from the most to the fewest missing values.
    """
    null_counts = data.isnull().sum()
    return null_counts.sort_values(ascending=False)

In [62]:
# The five columns with the most missing values in the training data.
missing_data(train_data).head()

COMMONAREA_MEDI             214865
COMMONAREA_AVG              214865
COMMONAREA_MODE             214865
NONLIVINGAPARTMENTS_MODE    213514
NONLIVINGAPARTMENTS_MEDI    213514
dtype: int64

In [63]:
# The five columns with the most missing values in the test data.
missing_data(test_data).head()

COMMONAREA_MEDI             33495
COMMONAREA_AVG              33495
COMMONAREA_MODE             33495
NONLIVINGAPARTMENTS_MODE    33347
NONLIVINGAPARTMENTS_MEDI    33347
dtype: int64

# Encoding of Categorical variables

In [68]:
def one_hot_encoder(data):
    """One-hot encode every object-dtype column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns are treated as categorical.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each object-dtype column is replaced by one
        indicator column per observed value (named ``<column>_<value>``);
        all other columns are kept as they are. The input frame is not
        modified.

    Notes
    -----
    Indicator columns are derived from the values present in ``data`` only,
    so frames encoded separately are not guaranteed to end up with the same
    set of columns.
    """
    categorical_columns = [col for col in data.columns if data[col].dtype == 'object']
    return pd.get_dummies(data, columns=categorical_columns)


In [71]:
 # Preview of the one-hot encoded training data (first five rows).
 A = one_hot_encoder(train_data).head()
 A  # bare last expression so the preview is rendered; an assignment alone displays nothing

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100002,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,...,0,0,0,0,0,0,1,0,1,0
1,100003,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,...,0,1,0,0,0,0,0,0,1,0
2,100004,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,...,0,0,0,0,0,0,0,0,0,0
3,100006,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Preview of the one-hot encoded test data (first five rows).
one_hot_encoder(test_data).head()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100001,0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241,-2329,-5170.0,...,0,0,0,0,0,0,1,0,1,0
1,100005,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,-4469,-9118.0,...,0,0,0,0,0,0,0,0,0,0
2,100013,0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038,-4458,-2175.0,...,0,0,0,0,0,0,0,0,0,0
3,100028,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976,-1866,-2000.0,...,0,0,0,0,0,1,0,0,1,0
4,100038,1,180000.0,625500.0,32067.0,625500.0,0.010032,-13040,-2191,-4000.0,...,0,0,0,0,0,0,0,0,0,0


# Male vs Female(Loan repayment)

In [43]:
# Applicant counts per gender; the 'XNA' code is dropped before plotting.
B = train_data['CODE_GENDER'].value_counts()
C = B.drop('XNA')
labels, values = C.index, C.values

# One bar per remaining gender code.
tr = go.Bar(
    x=labels,
    y=values,
    width=0.6,
)
layout = go.Layout(
    title='Male vs Female Applicants',
    height=300,
)
fig = go.Figure(data=[tr], layout=layout)
iplot(fig)

In [31]:
# Split the applicants by gender code; rows with any other code end up in neither group.
FEMALE = train_data.loc[train_data['CODE_GENDER'] == 'F']
MALE = train_data.loc[train_data['CODE_GENDER'] == 'M']

# Within each group: share with TARGET == 0 (*_TRUE) and with TARGET == 1 (*_FALSE).
FEMALE_TRUE = len(FEMALE.loc[FEMALE['TARGET'] == 0]) / len(FEMALE)
FEMALE_FALSE = len(FEMALE.loc[FEMALE['TARGET'] == 1]) / len(FEMALE)

MALE_TRUE = len(MALE.loc[MALE['TARGET'] == 0]) / len(MALE)
MALE_FALSE = len(MALE.loc[MALE['TARGET'] == 1]) / len(MALE)

print("Males who repayed their loan on time: "+"{:.2%}".format(MALE_TRUE))
print("Females who repayed their loan on time: "+"{:.2%}".format(FEMALE_TRUE))

Males who repayed their loan on time: 89.86%
Females who repayed their loan on time: 93.00%


Female applicants repaid their loans on time slightly more often than male applicants (93.00% vs. 89.86%), so males fall behind females by about 3 percentage points when it comes to repaying home loans.

# Effect of number of children 

In [33]:
# Applicants with at least two children, and the share of them with TARGET == 1.
CH_loan = train_data.loc[train_data['CNT_CHILDREN'] >= 2]
CH1 = len(CH_loan.loc[CH_loan['TARGET'] == 1]) / len(CH_loan)
print("Applicants who had two or more than two children and didn't payed their loan on time : "+"{:.2%}".format(CH1))

Applicants who had two or more than two children and didn't payed their loan on time : 8.90%


In [34]:
# Applicants with at least three children, and the share of them with TARGET == 1.
CH_Loan = train_data.loc[train_data['CNT_CHILDREN'] >= 3]
CH2 = len(CH_Loan.loc[CH_Loan['TARGET'] == 1]) / len(CH_Loan)
print("Applicants who had three or more than three children and didn't payed their loan on time : "+"{:.2%}".format(CH2))


Applicants who had three or more than three children and didn't payed their loan on time : 10.04%


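This shows that the number of children doesn't have much effect on the ability of applicants to repay their home loan: the share who did not repay on time differs by only about 1.1 percentage points between the two groups (8.90% vs. 10.04%). Note that the first group (two or more children) also contains the second (three or more).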

# Type of Loan

**Cash loan** : It is a loan which is received by the borrower in cash. It is a short term loan.

**Revolving loan** : It offers borrowers the option to draw funds up to a credit limit, repay and redraw them as they see fit. 


In [19]:
train_data['NAME_CONTRACT_TYPE'].value_counts()  # number of applications per contract type (cash loans vs. revolving loans)

Cash loans         278232
Revolving loans     29279
Name: NAME_CONTRACT_TYPE, dtype: int64

In [10]:
# Number of applications per contract type, drawn as a pie chart.
A = train_data['NAME_CONTRACT_TYPE'].value_counts()

labels, values = A.index, A.values
colors = ['rgba(55, 12, 93, .7)', 'rgba(125, 42, 123, .1)']

# White outline between the slices.
Plot = go.Pie(
    labels=labels,
    values=values,
    marker={'colors': colors, 'line': {'color': '#fff', 'width': 3}},
)
layout = go.Layout(title='Types of Loans', height=380)
fig = go.Figure(data=[Plot], layout=layout)
iplot(fig)

In [None]:
# Split the applications by contract type; REV_loan is used by the next cell.
CASH_loan = train_data[(train_data['NAME_CONTRACT_TYPE'] == 'Cash loans')]
REV_loan = train_data[(train_data['NAME_CONTRACT_TYPE'] == 'Revolving loans')]

# Share of cash-loan applicants with TARGET == 0.
Cash_loan_repayed = len(CASH_loan[CASH_loan['TARGET'] == 0])/len(CASH_loan)
# Space after the colon so the percentage is not glued to the label
# (same format as the revolving-loan message).
print("Applicants having Cash loans which were repayed on time: "+"{:.2%}".format(Cash_loan_repayed))
                        

In [38]:
# Share of revolving-loan applicants with TARGET == 0 (REV_loan is built in the previous cell).
Rev_loan_repayed = len(REV_loan.loc[REV_loan['TARGET'] == 0]) / len(REV_loan)
print("Applicants having Revolving loans which were repayed on time: "+"{:.2%}".format(Rev_loan_repayed))

Applicants having Revolving loans which were repayed on time: 94.52%


Revolving loans are considered a more dangerous way to borrow than cash loans, which is consistent with far fewer applicants receiving revolving loans than cash loans. However, the percentage of applicants repaying on time is higher for revolving loans than for cash loans, which suggests that mostly people who are able to handle revolving credit are taking revolving loans.

# Correlation of features with the target variable

**Correlation** : A measure used to represent how strongly two random variables are related to each other.
- It is the scaled form of covariance.
- Its value ranges from -1 to +1. 
- It is the special case of covariance which can be obtained when the data is standardized.

In [21]:
# Correlation of every numeric/boolean column with TARGET, sorted ascending
# (most negative first; TARGET itself comes last with 1.0).
# Non-numeric columns are excluded explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops them silently and raises on string columns instead.
correlation = (
    train_data.select_dtypes(include=['number', 'bool'])
    .corr()['TARGET']
    .sort_values()
)

In [23]:
correlation.head()  # the five most negative correlations of the features with the target variable (the series is sorted ascending)

EXT_SOURCE_3    -0.178919
EXT_SOURCE_2    -0.160472
EXT_SOURCE_1    -0.155317
DAYS_EMPLOYED   -0.044932
FLOORSMAX_AVG   -0.044003
Name: TARGET, dtype: float64

In [24]:
correlation.tail()  # the five most positive correlations with the target variable; the last entry is TARGET with itself (1.0)

DAYS_LAST_PHONE_CHANGE         0.055218
REGION_RATING_CLIENT           0.058899
REGION_RATING_CLIENT_W_CITY    0.060893
DAYS_BIRTH                     0.078239
TARGET                         1.000000
Name: TARGET, dtype: float64

# Annual total income

In [20]:
# Mean AMT_INCOME_TOTAL over the rows with TARGET == 0.
inc = train_data.loc[train_data['TARGET'] == 0, 'AMT_INCOME_TOTAL'].mean()
print("The average annual total income of the loan repayers:", inc)

The average annual total income of the loan repayers: 169077.7222658179


In [19]:
# Mean AMT_INCOME_TOTAL over the rows with TARGET == 1.
inc_d = train_data.loc[train_data['TARGET'] == 1, 'AMT_INCOME_TOTAL'].mean()
print("The average annual total income of the loan defaulters:", inc_d)

The average annual total income of the loan defaulters: 165611.76090634443


In [23]:
# Largest AMT_INCOME_TOTAL value in the training data.
max_inc= train_data['AMT_INCOME_TOTAL'].max()
print("The maximum annual total income of an applicant in the data:",max_inc)

The maximum annual total income of an applicant in the data: 117000000.0


In [24]:
# Smallest AMT_INCOME_TOTAL value in the training data.
min_inc = train_data['AMT_INCOME_TOTAL'].min()
print("The minimum annual total income of an applicant in the data:",min_inc)

The minimum annual total income of an applicant in the data: 25650.0


In [29]:
# '\033[1m' is the ANSI escape sequence that switches the following text to bold.
print('\033[1m'+'To be continued...')

[1mTo be continued...
