In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from IPython import display
# Wipe whatever this cell has printed so far (presumably import-time
# warnings/messages — confirm) so the notebook starts with a clean output area.
display.clear_output()

# **Data Processing**

In [None]:
# Read the semicolon-delimited credit dataset and preview its first rows.
data = pd.read_csv('credito4.csv', delimiter=';')
data.head(n=5)

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Number of missing values per column, before any imputation.
data.isna().sum()

In [None]:
# Impute missing SALDO_ATUAL values with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `data['SALDO_ATUAL']`: the in-place form mutates
# an intermediate object, is deprecated, and does not update `data` at all
# when pandas Copy-on-Write is enabled.
saldo_median = data['SALDO_ATUAL'].median()
data['SALDO_ATUAL'] = data['SALDO_ATUAL'].fillna(saldo_median)
# Sanity check: should display 0 after imputation.
data['SALDO_ATUAL'].isnull().sum()

In [None]:
# Row count per ESTADOCIVIL category (rows with a missing key are excluded by groupby).
data.groupby('ESTADOCIVIL').size()

In [None]:
# Fill missing ESTADOCIVIL values with a fixed category
# (presumably the most frequent one from the counts above — confirm).
# Assign back rather than using fillna(..., inplace=True) on the selected
# column: the chained in-place form is deprecated and is a no-op under
# pandas Copy-on-Write.
data['ESTADOCIVIL'] = data['ESTADOCIVIL'].fillna('masculino solteiro')

In [None]:
# Re-check missing values per column after the imputation steps.
data.isna().sum()

In [None]:
# Summary statistics for RESIDENCIADESDE.
data.loc[:, 'RESIDENCIADESDE'].describe()

In [None]:
# Summary statistics for IDADE, used to spot out-of-range values.
data.loc[:, 'IDADE'].describe()

In [None]:
# Replace IDADE values below 18 or above 120 with the column median.
# The median is taken over the whole column before the replacement, so it
# still includes the out-of-range values.
idade_median = data['IDADE'].median()
idade_out_of_range = data['IDADE'].lt(18) | data['IDADE'].gt(120)
data.loc[idade_out_of_range, 'IDADE'] = idade_median
data['IDADE'].describe()

In [None]:
# Row count per OUTROSPLANOSPGTO category.
data.groupby(by='OUTROSPLANOSPGTO').size()

In [None]:
# Row count per PROPOSITO category.
data.groupby(by='PROPOSITO').size()

In [None]:
# Row count per CLASSE value (the column later used as the target `y`).
data.groupby(by='CLASSE').size()

In [None]:
# Summary statistics for SALDO_ATUAL.
data.loc[:, 'SALDO_ATUAL'].describe()

In [None]:
# Replace high SALDO_ATUAL values with the column median.
# NOTE(review): the cut-off is 2 * std measured from zero, not mean + 2 * std —
# confirm this is the intended outlier rule.
std = data['SALDO_ATUAL'].std()
saldo_replacement = data['SALDO_ATUAL'].median()
saldo_too_high = data['SALDO_ATUAL'].ge(2 * std)
data.loc[saldo_too_high, 'SALDO_ATUAL'] = saldo_replacement

In [None]:
# Collapse three PROPOSITO categories into a single 'outros' bucket,
# then show the resulting category counts.
merged_purposes = ['qualificação', 'mobilia/equipamento', 'Eletrodomésticos']
data.loc[data['PROPOSITO'].isin(merged_purposes), 'PROPOSITO'] = 'outros'
data.groupby(by='PROPOSITO').size()

In [None]:
# Target: the CLASSE column. Features: the first seven columns of `data`.
y = data['CLASSE']
# Take an explicit copy: later cells add and overwrite columns on `x`, and
# doing that on a slice of `data` raises SettingWithCopyWarning and leaves it
# ambiguous whether `data` is modified as well.
x = data.iloc[:, 0:7].copy()

In [None]:
# Parse the DATA column from day/month/year strings into datetimes.
parsed_dates = pd.to_datetime(x.loc[:, 'DATA'], format='%d/%m/%Y')
x['DATA'] = parsed_dates
x['DATA']

In [None]:
# Derive calendar features from DATA.
# DAY holds the weekday *name* (day_name), not the day of the month.
date_parts = x['DATA'].dt
x['YEAR'] = date_parts.year
x['MONTH'] = date_parts.month
x['DAY'] = date_parts.day_name()

# **Label Encoder**

In [None]:
# Integer-encode the categorical columns, one fitted encoder per column so
# each mapping can be inverted later.
label1, label2, label3 = LabelEncoder(), LabelEncoder(), LabelEncoder()
for encoder, column in ((label1, 'ESTADOCIVIL'), (label2, 'PROPOSITO'), (label3, 'DAY')):
    x[column] = encoder.fit_transform(x[column])

In [None]:
# One-hot encode OUTROSPLANOSPGTO into PLANOS_* indicator columns.
z = pd.get_dummies(x.loc[:, 'OUTROSPLANOSPGTO'], prefix='PLANOS')
z

# **Standard Scaler**

In [None]:
# z-score standardisation of the first three feature columns.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
sc = StandardScaler()
numeric_block = x.iloc[:, :3]
m = sc.fit(numeric_block).transform(numeric_block)

In [None]:
# Append the one-hot columns and the standardised numeric columns to `x`.
# concat(axis=1) aligns on the index, so the scaled array is wrapped with
# x's own index; a default 0..n-1 index would misalign rows (and introduce
# NaNs) whenever x's index is not a plain RangeIndex.
scaled = pd.DataFrame(
    m,
    columns=['SALDO_ATUAL_N', 'RESIDENCIADESDE_N', 'IDADE_N'],
    index=x.index,
)
x = pd.concat([x, z, scaled], axis=1)

In [None]:
# Inspect the combined feature frame (original, one-hot and scaled columns).
x

In [None]:
# Drop the columns that have been replaced by engineered versions:
#   - SALDO_ATUAL / RESIDENCIADESDE / IDADE -> their *_N standardised copies
#   - DATA                                  -> YEAR / MONTH / DAY
#   - OUTROSPLANOSPGTO                      -> PLANOS_* indicator columns
#   - PLANOS_banco                          -> one indicator dropped as the reference level
# OUTROSPLANOSPGTO must go as well: it still holds the raw category strings,
# which RandomForestClassifier.fit cannot convert to float.
# Reassign instead of inplace=True so the cell produces a new frame.
x = x.drop(columns=[
    'SALDO_ATUAL',
    'RESIDENCIADESDE',
    'IDADE',
    'DATA',
    'OUTROSPLANOSPGTO',
    'PLANOS_banco',
])
x

In [None]:
# Hold out 30% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
# Seed the forest too: without random_state the bootstrap samples and feature
# subsets differ on every run, so the confusion matrix and accuracy below
# would not be reproducible.
forest = RandomForestClassifier(random_state = 0)
forest.fit(X_train, Y_train)
# Show the individual fitted trees.
forest.estimators_

In [None]:
# Predict on the held-out rows and tabulate true vs. predicted classes.
predict = forest.predict(X_test)
confusion = confusion_matrix(y_true=Y_test, y_pred=predict)
confusion

In [None]:
# Fraction of held-out rows classified correctly.
accuracy = accuracy_score(y_true=Y_test, y_pred=predict)
accuracy

# **PCA Decomposition**

In [None]:
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

In [None]:
# Load the iris dataset and split it into feature matrix and class labels.
iris = load_iris()
predictor, classes = iris.data, iris.target

In [None]:
# Standardise the iris features with a dedicated scaler.
# Reusing `sc` here would refit it and discard the statistics learned from the
# credit data. Scaling from `iris.data` (not from `predictor` itself) keeps the
# cell idempotent when it is re-run.
predictor = StandardScaler().fit_transform(iris.data)

In [None]:
# 70/30 train/test split of the scaled iris data with a fixed seed.
iris_split = train_test_split(predictor, classes, test_size=0.3, random_state=123)
X1_train, X1_test, Y1_train, Y1_test = iris_split

In [None]:
# Baseline model: a seeded 100-tree random forest on all scaled iris features.
forest_iris = RandomForestClassifier(random_state=1234, n_estimators=100)
forest_iris.fit(X=X1_train, y=Y1_train)

In [None]:
# Evaluate the baseline iris model on the held-out split.
predict_iris = forest_iris.predict(X1_test)
accuracy_iris = accuracy_score(y_true=Y1_test, y_pred=predict_iris)
confmatrix_iris = confusion_matrix(y_true=Y1_test, y_pred=predict_iris)
accuracy_iris

In [None]:
# Reduce the predictors to 3 principal components.
# NOTE: `predictor` is overwritten with its own projection, so re-running this
# cell on its own applies PCA to already-projected data.
pca = PCA(n_components = 3)
predictor = pca.fit_transform(predictor)

In [None]:
# Same 70/30 split and seed as the baseline split, now on the PCA-projected predictors.
pca_split = train_test_split(predictor, classes, test_size=0.3, random_state=123)
X2_train, X2_test, Y2_train, Y2_test = pca_split

In [None]:
# Train the same forest configuration on the PCA features and report its
# accuracy, for comparison with `accuracy_iris`.
forest1 = RandomForestClassifier(random_state=1234, n_estimators=100).fit(X2_train, Y2_train)
predict1 = forest1.predict(X2_test)
accuracy1 = accuracy_score(y_true=Y2_test, y_pred=predict1)
accuracy1