In [2]:
#Importing libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import metrics as mt
import statsmodels.formula.api as smf
import statsmodels.api as sm


In [3]:
#Importing database
df = pd.read_csv("database/train.csv")
df.head()

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp,limite_adicional
0,1767,21,278.172008,2577.05,24196.89636,104.306544,31.038763,6,5,7,21,14,9,15,Não,Não,Negar
1,11920,40,268.874152,2465.39,19227.37796,69.858778,36.917093,5,8,5,40,23,10,18,Não,Não,Negar
2,8910,36,446.643127,1055.29,42822.28223,134.201478,34.561714,0,3,6,26,13,3,15,Sim,Não,Negar
3,4964,58,321.141267,703.05,51786.826,297.350067,31.493561,0,3,7,12,7,2,1,Sim,Não,Negar
4,10100,35,428.716114,891.29,44626.85346,134.201478,28.028887,2,8,7,24,10,8,20,Sim,Não,Negar


# Data Preparation

In [4]:
features = ['idade', 'divida_atual', 'renda_anual',
       'valor_em_investimentos', 'taxa_utilizacao_credito', 'num_emprestimos',
       'num_contas_bancarias', 'num_cartoes_credito', 'dias_atraso_dt_venc',
       'num_pgtos_atrasados', 'num_consultas_credito', 'taxa_juros']
label = 'saldo_atual'

In [5]:
x_train = df.loc[:, features]
y_train = df.loc[:, label]

# Model Training

In [6]:
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

In [7]:
y_pred = lr_model.predict(x_train)
y_pred

array([346.66954862, 367.84027655, 431.46897895, ..., 412.25174802,
       400.68529863, 455.26482252])

In [8]:
df1 = df.copy()
df1 = df1.loc[:, ['id_cliente', 'saldo_atual']]
df1['pred'] = y_pred
df1.head()

Unnamed: 0,id_cliente,saldo_atual,pred
0,1767,278.172008,346.669549
1,11920,268.874152,367.840277
2,8910,446.643127,431.468979
3,4964,321.141267,445.506463
4,10100,428.716114,378.271169


# Performance

In [9]:
mt.r2_score(y_train, y_pred)

0.16917364489050013

# Model training statsmodels

In [10]:
lr_model = reg2 = smf.ols(formula='saldo_atual ~ idade + divida_atual + num_emprestimos', data = df)
lr_model = lr_model.fit()
anova_results2 = sm.stats.anova_lm(lr_model, typ=1)
anova_results2

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
idade,1.0,408.5925,408.5925,0.009575,0.9220529
divida_atual,1.0,44940050.0,44940050.0,1053.104993,3.537135e-219
num_emprestimos,1.0,3976.109,3976.109,0.093174,0.7601864
Residual,9496.0,405230900.0,42673.85,,


In [11]:
#mean square error - mede o erro quadrado, principal problema é que da alta importancia aos outliers
mse = np.round(mt.mean_squared_error(y_train, y_pred), 2)
print(mse)

39370.27


In [12]:
#root mean square error - a raiz quadrada do erro quadratico, volta para o valor original (a mesma escala da variavel alvo), para meio de comparações

rmse = np.round(mt.root_mean_squared_error(y_train, y_pred), 2)
print(rmse)

198.42


In [13]:
df.describe()

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros
count,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0
mean,6210.532947,126.398421,404.532032,1437.08642,195037.3,611.999323,32.270823,10.863684,17.552316,21.433158,20.990316,29.832421,24.937158,66.618632
std,3603.243914,685.036005,217.696725,1156.263217,1454274.0,2002.44388,5.106053,62.555417,120.980186,122.67739,14.800612,222.637763,183.733346,440.118524
min,1.0,18.0,0.088628,0.23,10473.87,0.0,20.992914,0.0,0.0,1.0,-5.0,0.0,0.0,1.0
25%,3091.75,28.0,272.36306,573.62,26207.9,76.083981,28.078872,2.0,3.0,4.0,10.0,9.0,2.0,8.0
50%,6172.5,38.0,337.169588,1175.73,45991.11,134.201478,32.262649,3.0,6.0,6.0,18.0,14.0,4.0,14.0
75%,9320.25,47.0,471.264657,1963.3325,80449.83,249.182915,36.464575,6.0,7.0,7.0,28.0,18.0,8.0,20.0
max,12500.0,8598.0,1602.040519,4998.07,23279660.0,10000.0,49.564519,1496.0,1779.0,1479.0,67.0,4388.0,2589.0,5788.0


In [25]:
#Criando um outlier em apenas 1 resultado na coluna saldo atual e refazendo o rmse

indice_aleatorio = df['saldo_atual'].sample(475).index[0]

# Multiplicar o valor correspondente por 10 e atualizar no DataFrame
df.at[indice_aleatorio, 'saldo_atual'] *= 100

In [26]:
lr_model2 = LinearRegression()
lr_model2.fit(x_train, y_train)
y_pred = lr_model.predict(x_train)
y_pred

0       336.783121
1       343.434261
2       427.354210
3       448.303860
4       437.087727
           ...    
9495    479.688981
9496    442.191232
9497    356.230181
9498    460.030506
9499    439.491648
Length: 9500, dtype: float64

In [27]:
rmse = np.round(mt.root_mean_squared_error(y_train, y_pred), 2)
print(rmse)

739.85


Com 5% de valores outlier o rmse de 198.42 foi para 739.85, em um df com varios outliers teremos um rmse com um valor discrepante