In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import optuna
from lightgbm import LGBMClassifier

pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 600)

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [8]:
# 1. Feature de derivadas

# Taxa de gastos (balance / tenure)
df['Gastos_Anuais'] = df['Balance'] / df['Tenure']

# Proporção de produtos por CreditScore
df['Prop_Produtos_Score'] = df['NumOfProducts'] / df['CreditScore']

# 2. Feature discretizadas

# Faixa etária (exemplos de intervalos)
def get_faixa_etaria(age):
  if age <= 30:
    return '20-30'
  elif age <= 40:
    return '31-40'
  else:
    return '40+'

df['Faixa_Etaria'] = df['Age'].apply(get_faixa_etaria)

# Nível de saldo (exemplos de intervalos)
def get_nivel_saldo(balance):
  if balance <= 10000:
    return 'Baixo'
  elif balance <= 50000:
    return 'Medio'
  else:
    return 'Alto'

df['Nivel_Saldo'] = df['Balance'].apply(get_nivel_saldo)

# 3. Feature codificadas

# Geography (One-Hot Encoding)
df_encoded = pd.get_dummies(df, columns=['Geography'])
df = pd.concat([df, df_encoded], axis=1)
df.drop('Geography', axis=1, inplace=True)

# Gender (Label Encoding)
gender_mapping = {'Male': 0, 'Female': 1}
df['Gender'] = df['Gender'].map(gender_mapping)

# 4. Feature combinadas

# Interação entre produtos e cartão de crédito
df['Prod_Cartao'] = df['HasCrCard'] * df['NumOfProducts']

# 5. Tratamento de valores ausentes

# Verifique se há valores ausentes (opcional)
print(df.isnull().sum())

# Preencha os valores ausentes (exemplo: com a média)
df['Balance'].fillna(df['Balance'].mean(), inplace=True)

# Salve o dataset com as features criadas
df.to_csv('churn_data_engineered.csv', index=False)



TypeError: the first argument must be callable