# **Pré-processamento de dados usando Python com Pandas e Sklearn**

### **Import dos dados e estatísticas básicas**

In [1]:
import pandas as pd
import numpy as np
import statistics as sts 

In [2]:
# Import dos dados
base = pd.read_csv('credit_data.csv',)

In [3]:
# Dataframe
base.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [4]:
# Tipo de dados nas coluans
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   i#clientid  2000 non-null   int64  
 1   income      2000 non-null   float64
 2   age         1997 non-null   float64
 3   loan        2000 non-null   float64
 4   c#default   2000 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 78.2 KB


In [5]:
# Dados estatísticos dos campos
base.describe()

Unnamed: 0,i#clientid,income,age,loan,c#default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


### **Tratamento de dados inconsistentes:**

In [6]:
# Localizar idades fora do padrão (no caso, negativas e maiores que 110)
base.loc[base['age'] < 0]

Unnamed: 0,i#clientid,income,age,loan,c#default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [7]:
# Metodos: Apagar os registros com idades fora do padrão
# base.drop(base[base.age < 0].index, inplace=True)


In [8]:
# Preencher com a média da coluna Idade
# OBS: NÃO ESQUECER DE RETIRAR AS IDADES FORA DOS EIXOS.
base['age'][base.age > 0].mean()

40.92770044906149

In [9]:
# Implantando:
base.loc[base.age < 0, 'age'] = 40.92
# Buscando novamente valores negativos:
base.loc[base['age'] < 0]

Unnamed: 0,i#clientid,income,age,loan,c#default


### **Tratamento de dados faltantes**

In [10]:
# Criando dataframe de booleans para melhor visualizar dados faltantes
nulls = pd.isnull(base)

In [11]:
nulls[nulls['age']] == True

Unnamed: 0,i#clientid,income,age,loan,c#default
28,False,False,True,False,False
30,False,False,True,False,False
31,False,False,True,False,False


In [12]:
nulls[nulls['income']] == True

Unnamed: 0,i#clientid,income,age,loan,c#default


In [13]:
nulls[nulls['loan']] == True

Unnamed: 0,i#clientid,income,age,loan,c#default


In [14]:
nulls[nulls['c#default']] == True

Unnamed: 0,i#clientid,income,age,loan,c#default


In [15]:
nulls[nulls['i#clientid']] == True

Unnamed: 0,i#clientid,income,age,loan,c#default


In [16]:
base.loc[pd.isnull(base['age'])] 

Unnamed: 0,i#clientid,income,age,loan,c#default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


**METODO 1 PARA SUBSTITUIR:**

In [17]:
# base.loc[pd.isnull(base['age'])] = 40.92

In [18]:
# Conferindo se existem valores nulos ainda:
# base.loc[pd.isnull(base['age'])] 

### **Divisão de atributos de previsores x classes**

In [19]:
# Passando os atributos income, age e loan para os previsores
previsores = base.iloc[:, 1:4].values

In [20]:
previsores

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [21]:
# Passando o atributo default como classe a ser prevista
classes = base.iloc[:, 4].values

In [22]:
classes

array([0, 0, 0, ..., 1, 0, 0])

### **Import do sklearn para substituir valores null**

In [23]:
# Import do Imputer
from sklearn.impute import SimpleImputer

In [24]:
# Exemplo que estava com null:
previsores[28]

array([59417.80540626,            nan,  2082.62593812])

In [25]:
# Criando objeto Imputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [26]:
imputer = imputer.fit(previsores[:,0:3])

In [27]:
previsores[:,0:3] = imputer.transform(previsores[:,0:3])

In [28]:
# Exemplo que estava age = null 
# Mas agora com a média da coluna:
previsores[28]

array([5.94178054e+04, 4.09276889e+01, 2.08262594e+03])

### **Escalonamento de atributos**

In [29]:
# Podemos optar por padronização(quando há outliers) ou normalização(dados no geral)
# USAREI PADRONIZAÇÃO (STANDARSCALER)
from sklearn.preprocessing import StandardScaler

In [30]:
# Criando um objeto scaler
scaler = StandardScaler()

In [32]:
# Aplicando o scaler nos dados Previsores:
previsores = scaler.fit_transform(previsores)
previsores

array([[ 1.45393393,  1.36538093,  1.20281942],
       [-0.76217555,  0.5426602 ,  0.69642695],
       [ 0.83682073,  1.67417189,  1.17471147],
       ...,
       [-0.07122592, -0.97448519,  0.35420081],
       [-0.11000289,  1.73936739, -0.92675625],
       [ 1.682986  ,  1.14917639,  0.96381038]])