In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('Loan_Data.csv')

In [4]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Data Cleaning

##### Dependents

In [5]:
df.isnull().sum() #vamos ver se ha nulls

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
df.dtypes # vamos ver o datatpye para cada column

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [6]:
df.columns=df.columns.str.lower() # vamos standirdizar o nome das colunas

In [7]:
df.head()

Unnamed: 0,loan_id,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area,loan_status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [46]:
#aplicationincome e loanamount estão com nomes estranhos, vamos adicionar _
df.rename(columns={'applicantincome': 'applicant_income','coapplicantincome':'coapplicant_income','loanamount':'loan_amount'}, inplace=True)

In [9]:
df.head() #better

Unnamed: 0,loan_id,gender,married,dependents,education,self_employed,applicant_income,coapplicant_income,loanamount,loan_amount_term,credit_history,property_area,loan_status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [10]:
count = df['dependents'].value_counts()['3+'] #podemos dizer que 8.31% da info em "dependants" é 3+, 
print("Number of '3+':", count)               #mais vale mudar para 4 e deixamos a observação de que 4= +3

Number of '3+': 51


In [11]:
value_counts = df['dependents'].value_counts()  #agora sabemos que tudo  o que for 4= +3
print(value_counts)

0     345
1     102
2     101
3+     51
Name: dependents, dtype: int64


In [12]:
df.loc[df['dependents'] == '3+', 'dependents'] = '3'

In [13]:
value_counts = df['dependents'].value_counts()  #agora sabemos que tudo  o que for 3= +3
print(value_counts)          

0    345
1    102
2    101
3     51
Name: dependents, dtype: int64


In [17]:
df.dtypes

loan_id                object
gender                 object
married                object
dependents             object
education              object
self_employed          object
applicant_income        int64
coapplicant_income    float64
loanamount            float64
loan_amount_term      float64
credit_history        float64
property_area          object
loan_status            object
dtype: object

In [14]:
df["dependents"].astype(int)

ValueError: cannot convert float NaN to integer

In [19]:
 df['dependents'].isnull().sum() #temos 15 NaN em dependants

15

In [15]:
df['dependents'].fillna('4', inplace=True) # Vamoos substituir os NaNs por Uknowns \\ tudo o for 4 é Uknown
print('NaNs in Dependents:',df['dependents'].isnull().sum())

NaNs in Dependents: 0


In [16]:
value_counts = df['dependents'].value_counts()  
print(value_counts) 

0    345
1    102
2    101
3     51
4     15
Name: dependents, dtype: int64


In [17]:
df["dependents"].astype(int)

0      0
1      1
2      0
3      0
4      0
      ..
609    0
610    3
611    1
612    2
613    0
Name: dependents, Length: 614, dtype: int32

###### Gender

In [18]:
df.isnull().sum()

loan_id                0
gender                13
married                3
dependents             0
education              0
self_employed         32
applicant_income       0
coapplicant_income     0
loanamount            22
loan_amount_term      14
credit_history        50
property_area          0
loan_status            0
dtype: int64

In [19]:
df.dtypes

loan_id                object
gender                 object
married                object
dependents             object
education              object
self_employed          object
applicant_income        int64
coapplicant_income    float64
loanamount            float64
loan_amount_term      float64
credit_history        float64
property_area          object
loan_status            object
dtype: object

In [20]:
value_counts = df['gender'].value_counts()  
print(value_counts) 

Male      489
Female    112
Name: gender, dtype: int64


In [23]:
 df['gender'].isnull().sum() #temos 13 NaN em dependants 

13

In [26]:
df['gender'].fillna('UNKN', inplace=True)

In [27]:
value_counts = df['gender'].value_counts()  
print(value_counts) 

Male      489
Female    112
UNKN       13
Name: gender, dtype: int64


In [28]:
 df['gender'].isnull().sum() #temos 13 NaN em Gender 

0

##### Married

In [29]:
value_counts = df['married'].value_counts()  
print(value_counts)

Yes    398
No     213
Name: married, dtype: int64


In [32]:
df['married'].isnull().sum() #temos 3 NaN em Married 

3

In [35]:
df['married'].fillna('Other', inplace=True)

In [36]:
value_counts = df['married'].value_counts()  
print(value_counts)

Yes      398
No       213
Other      3
Name: married, dtype: int64


In [37]:
df['married'].isnull().sum() #temos 3 NaN em Married 

0

In [38]:
df.isnull().sum()

loan_id                0
gender                 0
married                0
dependents             0
education              0
self_employed         32
applicant_income       0
coapplicant_income     0
loanamount            22
loan_amount_term      14
credit_history        50
property_area          0
loan_status            0
dtype: int64

##### Self Employed

In [39]:
value_counts = df['self_employed'].value_counts()  
print(value_counts)

No     500
Yes     82
Name: self_employed, dtype: int64


In [41]:
df['self_employed'].isnull().sum() #temos 32 NaN em self_employed 

32

In [42]:
df['self_employed'].fillna('Other', inplace=True)

In [43]:
df['married'].isnull().sum() #temos 3 NaN em Married 

0

In [44]:
value_counts = df['self_employed'].value_counts()  
print(value_counts)

No       500
Yes       82
Other     32
Name: self_employed, dtype: int64


In [47]:
df.isnull().sum()

loan_id                0
gender                 0
married                0
dependents             0
education              0
self_employed          0
applicant_income       0
coapplicant_income     0
loan_amount           22
loan_amount_term      14
credit_history        50
property_area          0
loan_status            0
dtype: int64

##### Loan Amount


In [50]:
df['loan_amount'].isnull().sum() #temos 32 NaN em self_employed 

22

In [53]:
df['loan_amount'].fillna('UNKN', inplace=True)

In [54]:
df['loan_amount'].isnull().sum() #temos 32 NaN em self_employed 

0

In [55]:
df.isnull().sum()

loan_id                0
gender                 0
married                0
dependents             0
education              0
self_employed          0
applicant_income       0
coapplicant_income     0
loan_amount            0
loan_amount_term      14
credit_history        50
property_area          0
loan_status            0
dtype: int64

##### Loan amount term

In [59]:
df['loan_amount_term'].isnull().sum() #temos 14 NaN em loan_amount_term 

14

In [61]:
value_counts = df['loan_amount_term'].value_counts()  
print(value_counts)

360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: loan_amount_term, dtype: int64


In [None]:
df['loan_amount_term'].fillna('UNKN', inplace=True)

In [62]:
value_counts = df['loan_status'].value_counts()  
print(value_counts)

Y    422
N    192
Name: loan_status, dtype: int64


In [None]:
## usar median possibilidade \\ 