# Usando o dataset Bank Marketing do repositório UCI para prever a resposta de clientes a uma campanha de marketing por telefone de um banco.

## O dataset quer prever se o cliente fez ou não o depósito a prazo fixo após a campanha telefônica, com isso, dado o perfil do cliente e as informações da campanha, espera-se prever se o cliente vai contratar o depósito ou não.

In [1]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
  
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
bank_marketing = fetch_ucirepo(id=222) 
  
# data (as pandas dataframes) 
X = bank_marketing.data.features 
y = bank_marketing.data.targets 
  
# metadata 
print(bank_marketing.metadata) 
  
# variable information 
print(bank_marketing.variables) 



{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

# Analisar dados faltantes (ter certeza se não tem '?' '-' ' ')

In [2]:
X.replace({"?": np.nan, "-": np.nan, " ": np.nan})

# Analisar dados faltantes e duplicados
print("Dados faltantes:\n", X.isnull().sum())


Dados faltantes:
 age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
dtype: int64


# Verificar tipos

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          45211 non-null  int64 
 1   job          44923 non-null  object
 2   marital      45211 non-null  object
 3   education    43354 non-null  object
 4   default      45211 non-null  object
 5   balance      45211 non-null  int64 
 6   housing      45211 non-null  object
 7   loan         45211 non-null  object
 8   contact      32191 non-null  object
 9   day_of_week  45211 non-null  int64 
 10  month        45211 non-null  object
 11  duration     45211 non-null  int64 
 12  campaign     45211 non-null  int64 
 13  pdays        45211 non-null  int64 
 14  previous     45211 non-null  int64 
 15  poutcome     8252 non-null   object
dtypes: int64(7), object(9)
memory usage: 5.5+ MB


# Ver features

In [4]:
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,


# Ver target

In [5]:
y.head()

Unnamed: 0,y
0,no
1,no
2,no
3,no
4,no


# Tratar dados nulos categóricos

In [6]:
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Preencher dados nulos com a moda
for col in cat_cols:
    if X[col].isnull().any():
        moda = X[col].mode()[0]
        X.loc[:, col] = X[col].fillna(moda)

# Analisar se ainda tem dados nulos

In [7]:
X.replace({"?": np.nan, "-": np.nan, " ": np.nan})

# Analisar dados faltantes e duplicados
print("Dados faltantes:\n", X.isnull().sum())

Dados faltantes:
 age            0
job            0
marital        0
education      0
default        0
balance        0
housing        0
loan           0
contact        0
day_of_week    0
month          0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
dtype: int64


# Verificar balanceamento

In [8]:
print(y.value_counts(normalize=True) * 100)


y  
no     88.30152
yes    11.69848
Name: proportion, dtype: float64


# Tratamento de variáveis categóricas

In [9]:
# Tratar as features categóricas


# Identifica colunas categóricas e numéricas
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(exclude='object').columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ],
    remainder='passthrough'
)


# y pode ser um DataFrame de uma coluna, então usamos .values.ravel()
if y.dtypes.values[0] == 'object':
    le = LabelEncoder()
    y_encoded = le.fit_transform(y.values.ravel())
else:
    y_encoded = y.values.ravel()

# Aplica o encoder nas features
X_encoded = preprocessor.fit_transform(X)

# Converte o resultado para DataFrame com nomes de colunas
import pandas as pd

# Recupera os nomes das colunas geradas
encoded_cols = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
all_cols = list(encoded_cols) + num_cols

X_encoded_df = pd.DataFrame(X_encoded, columns=all_cols)


# Analisar para ter certeza se as variáveis categóricas foram tratadas

In [10]:
print(X_encoded_df.select_dtypes(include='object').columns.tolist())

[]


# Dividir em treino e teste e balancear

In [11]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_encoded_df, y_encoded)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)



# Treinamento e Avaliação

In [12]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

for name, model in models.items():
    print(f"----- {name} -----")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append({'Model': name, 'Accuracy': acc})

results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)
print(results_df)


----- Random Forest -----
Accuracy: 0.9452
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7908
           1       0.97      0.92      0.94      8061

    accuracy                           0.95     15969
   macro avg       0.95      0.95      0.95     15969
weighted avg       0.95      0.95      0.95     15969

Confusion Matrix:
[[7665  243]
 [ 632 7429]]


----- AdaBoost -----
Accuracy: 0.9169
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      7908
           1       0.94      0.90      0.92      8061

    accuracy                           0.92     15969
   macro avg       0.92      0.92      0.92     15969
weighted avg       0.92      0.92      0.92     15969

Confusion Matrix:
[[7414  494]
 [ 833 7228]]


----- Extra Trees -----
Accuracy: 0.9463
Classification Report:
              precision    recall  f1-score   support

      