### Importa todas as libs necessárias para criar o modelo

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline

warnings.filterwarnings('ignore')

#### O dataset contém informações sobre clientes de um banco. A variável alvo é o Churn (se o cliente saiu do banco ou não).

Descrição de cada coluna:

1. customer_id, variável não utilizada.
2. credit_score, usado como entrada.
3. country (país), usado como entrada.
4. gender (gênero), usado como entrada.
5. age (idade), usada como entrada.
6. tenure (tempo de permanência como cliente do banco), usado como entrada.
7. balance (saldo), usado como entrada.
8. products_number (número de produtos), usado como entrada.
9. credit_card, usado como entrada.
10. active_member, usado como entrada.
11. estimated_salary (salário estimado), usado como entrada.
12. churn, usado como alvo. 1 se o cliente tiver saído do banco durante algum período ou 0 se não.

In [3]:
import os

# Location of the CSV file with the raw data.
# The original author's local path is kept as the default; set the
# CHURN_DATA_PATH environment variable to run the notebook on another machine.
DEFAULT_DATA_PATH = r"C:\Users\Notbook\Desktop\PUC\Eng. Software\MVP\MVP - Qualidade de Software e Sistemas Inteligentes\machineLearning\data\Bank Customer Churn Prediction.csv"
url = os.environ.get("CHURN_DATA_PATH", DEFAULT_DATA_PATH)

# Read the raw data into a DataFrame
df = pd.read_csv(url)

# Print the (rows, columns) shape of the DataFrame
print(df.shape)

# Show the first 5 rows
df.head()

(10000, 12)


Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


In [5]:
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe().round(decimals=2)

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.57,650.53,38.92,5.01,76485.89,1.53,0.71,0.52,100090.24,0.2
std,71936.19,96.65,10.49,2.89,62397.41,0.58,0.46,0.5,57510.49,0.4
min,15565701.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628528.25,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690738.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.92,0.0
75%,15753233.75,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.25,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


### Excluo a coluna que não vou precisar para o modelo

In [6]:
# Remove the identifier column, which is not used as a model input.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
df = df.drop(columns=['customer_id'], errors='ignore')

### Verifico se a coluna foi excluída

In [7]:
# Show the first 5 rows to confirm customer_id is no longer present.
df.head(n=5)

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Vejo se tem algum valor nulo

In [8]:
# Count missing values per column.
df.isna().sum()

credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [9]:
# List the distinct values of the country column.
df['country'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

### Classifico os dados de país em numérico

In [10]:
# Encode the country names as integer codes.
# NOTE(review): these codes impose an arbitrary order on a nominal feature;
# consider one-hot encoding if the downstream models are sensitive to it.
country_codes = {'France' : 0, 'Spain' : 1, 'Germany' : 2}

# Only encode while the column still holds names, so re-running the cell
# does not turn the already-encoded integers into NaN.
if not pd.api.types.is_numeric_dtype(df['country']):
    df['country'] = df['country'].map(country_codes)

# Series.map yields NaN for any value missing from the mapping; fail loudly.
assert df['country'].notna().all(), "country column contains unmapped values"

### Classifico os dados de sexo em numérico

In [11]:
# Encode gender as an integer flag.
gender_codes = {'Male' : 0, 'Female' : 1}

# Only encode while the column still holds labels, so re-running the cell
# does not turn the already-encoded integers into NaN.
if not pd.api.types.is_numeric_dtype(df['gender']):
    df['gender'] = df['gender'].map(gender_codes)

# Series.map yields NaN for any value missing from the mapping; fail loudly.
assert df['gender'].notna().all(), "gender column contains unmapped values"

### Converto a coluna BALANCE para INT

In [12]:
# Cast balance from float to integer; the fractional part is dropped
# (e.g. 83807.86 becomes 83807).
df = df.astype({'balance': int})

### Converto a coluna ESTIMATED_SALARY para INT

In [13]:
# Cast estimated_salary from float to integer; the fractional part is dropped
# (e.g. 101348.88 becomes 101348).
df = df.astype({'estimated_salary': int})

### Verifico se as alterações foram feitas

In [14]:
# Show the first 5 rows to confirm the encodings and integer casts.
df.head(n=5)

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,0,1,42,2,0,1,1,1,101348,1
1,608,1,1,41,1,83807,1,0,1,112542,0
2,502,0,1,42,8,159660,3,1,0,113931,1
3,699,0,1,39,1,0,2,0,0,93826,0
4,850,1,1,43,2,125510,1,1,1,79084,0


### Separação de dados para TREINO e conjunto de TESTE

In [15]:
# Target vector: the churn column
y = df.loc[:, 'churn']

# Feature matrix: every column except the target
x = df.drop('churn', axis='columns')

In [16]:
# Sanity check on the feature matrix: print its (rows, columns) shape,
# then display the first 5 rows.
print(x.shape)
x.head(n=5)

(10000, 10)


Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
0,619,0,1,42,2,0,1,1,1,101348
1,608,1,1,41,1,83807,1,0,1,112542
2,502,0,1,42,8,159660,3,1,0,113931
3,699,0,1,39,1,0,2,0,0,93826
4,850,1,1,43,2,125510,1,1,1,79084


In [17]:
# Sanity check on the target vector: print its shape,
# then display the first 5 values.
print(y.shape)
y.head(n=5)

(10000,)


0    1
1    0
2    1
3    0
4    0
Name: churn, dtype: int64

In [18]:
# Hold out 20% of the rows for testing. The split is shuffled with a fixed
# seed and stratified on the target so both sets keep the same churn ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    shuffle=True,
    stratify=y,
    random_state=7,
)

In [19]:
# Cross-validation settings
num_particoes = 10    # number of folds
scoring = 'accuracy'  # metric name

# Stratified k-fold splitter, shuffled with a fixed seed
kfold = StratifiedKFold(
    n_splits=num_particoes,
    shuffle=True,
    random_state=7,
)

### Criação, avaliação e comparação dos modelos (Modelagem e Inferência)

Esta seção cria, avalia e compara vários modelos de classificação utilizando validação cruzada e pipelines de escalonamento para selecionar o melhor desempenho.

In [None]:
# Candidate classifiers as (name, estimator) pairs.
# NOTE(review): the estimators are added bare, without a scaling Pipeline,
# although the section text mentions scaling pipelines - confirm the intent.
models = [
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
]

# The list was originally created under the name `modelos` while the appends
# targeted `models` (a NameError); both names now refer to the same list.
modelos = models



###### FIM DO CÓDIGO PARA O PROJETO ####################

In [None]:
# Display the training feature matrix.
x_train

In [None]:
# XGBoost classifier with hand-picked hyperparameters.
xgb = XGBClassifier(
    max_depth=8,
    learning_rate=0.2,
    n_estimators=480,
)

In [None]:
# Train the XGBoost classifier on the (unscaled) training split.
xgb.fit(X=x_train, y=y_train)

In [42]:
# Predict churn labels for the held-out test split.
xgb_y_pred = xgb.predict(X=x_test)

In [47]:
# Fraction of test rows predicted correctly, reported as a percentage.
xgb_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_y_pred)
print("Accuracy:", xgb_accuracy * 100)

Accuracy: 86.3


### o StandardScaler() é uma maneira simples e eficaz de colocar todas as variáveis na mesma escala, garantindo que o modelo não favoreça variáveis com maior magnitude

#### fit_transform(x_train): ajusta o transformador com base no conjunto de treino e, ao mesmo tempo, aplica a transformação nos dados de treino.
#### transform(x_test): apenas aplica a transformação ao conjunto de teste, usando os parâmetros aprendidos a partir do treino.

In [None]:
# Standardize the features.
sc = StandardScaler()

# Fit the scaler on the training split only, then reuse the fitted scaler
# to transform the test split.
x_train_rescaled = sc.fit_transform(X=x_train)
x_test_rescaled = sc.transform(X=x_test)