In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import gradio as gr

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the raw dataset from the project's data folder.
df = pd.read_csv("instances/bank.csv")

In [None]:
# Columns kept for modelling; 'deposit' (last entry) is the prediction target.
important_features = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'loan', 'contact', 'duration', 'campaign', 'deposit'
]

# .copy() gives an independent frame instead of a slice that may alias `df`.
df_reduced = df[important_features].copy()

# Write into 'instances/' because that is where the following cell reloads
# 'bank_reduced.csv' from; writing to the working directory left the two
# paths out of sync and broke a fresh Restart & Run All.
df_reduced.to_csv('instances/bank_reduced.csv', index=False)



In [None]:
# Reload the reduced dataset from disk and preview its first rows.
df = pd.read_csv("instances/bank_reduced.csv")
df.head()


In [None]:
# General information: column dtypes and non-null counts.
df.info()

# Distribution of the target column.
sns.countplot(data=df, x='deposit').set_title('Distribuição do Target (deposit)')
plt.show()

# Example breakdown: target counts within each marital status.
sns.countplot(data=df, x='marital', hue='deposit').set_title('Estado civil vs Depósito')
plt.show()


In [None]:
# Encode every categorical (object-dtype) column as integer codes.
#
# Each column gets its OWN fitted encoder, stored in `label_encoders`, so the
# string <-> code mapping of every column stays available afterwards (e.g. to
# encode new inputs or decode predictions). Re-fitting one shared encoder for
# each column keeps only the mapping of the last column processed.
#
# NOTE: `df` is encoded in place, so re-running this cell without reloading
# `df` finds no object columns and resets `label_encoders` to an empty dict.
label_encoders = {}
label_encoder = LabelEncoder()  # rebound in the loop; ends as the last column's encoder
for column in df.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# Separate the features from the target.
X = df.drop('deposit', axis=1)
y = df['deposit']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)


In [None]:
# Model 1: decision tree, fitted on the training split and scored on the test split.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_dt))

# Model 2: random forest, same procedure.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))


In [None]:
def evaluate_model(model, X, y, n_splits=30):
    """Estimate a classifier's accuracy with stratified k-fold cross-validation.

    A fresh, unfitted clone of ``model`` is trained on every fold, so the
    estimator passed in is never refitted. Calling ``model.fit`` directly
    would leave the caller's model trained on the last fold only, silently
    replacing whatever it had been fitted on before.

    Args:
        model: scikit-learn estimator used as the template for each fold.
        X: feature table; must support positional ``.iloc`` indexing.
        y: target values; must support positional ``.iloc`` indexing.
        n_splits: number of stratified folds.

    Returns:
        Tuple ``(mean, std)`` of the per-fold accuracy scores, computed with
        ``np.mean`` and ``np.std``.
    """
    # Local import so this definition works without touching the import cell.
    from sklearn.base import clone

    # Shuffled with a fixed seed so the fold assignment is repeatable.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []

    for train_index, test_index in skf.split(X, y):
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        # clone() copies the hyper-parameters but none of the fitted state.
        fold_model = clone(model)
        fold_model.fit(X_train_fold, y_train_fold)
        y_pred = fold_model.predict(X_test_fold)
        scores.append(accuracy_score(y_test_fold, y_pred))

    return np.mean(scores), np.std(scores)

# Cross-validate both classifiers (decision tree first, then random forest).
(mean_dt, std_dt), (mean_rf, std_rf) = [
    evaluate_model(candidate, X, y) for candidate in (dt, rf)
]

# Report mean and spread of the per-fold accuracy for each model.
print(f"Decision Tree: Mean={mean_dt:.4f}, Std={std_dt:.4f}")
print(f"Random Forest: Mean={mean_rf:.4f}, Std={std_rf:.4f}")


In [None]:
# Save the best model: the random forest `rf` is the one written to disk.
with open('best_model.pkl', mode='wb') as model_file:
    pickle.dump(rf, model_file)


In [None]:
# Rebuild one encoder per categorical feature from the same file the training
# frame was loaded from. LabelEncoder assigns codes by sorted class order, so
# fitting on the same column reproduces the codes the model was trained on.
# (A single shared encoder cannot be used: it only remembers the last column
# it was fitted on, and it transforms one column at a time, not a DataFrame.)
_raw_features = pd.read_csv('instances/bank_reduced.csv').drop(columns='deposit')
feature_encoders = {
    column: LabelEncoder().fit(_raw_features[column])
    for column in _raw_features.select_dtypes(include=['object']).columns
}


def predict_bank(age, job, marital, education, balance, housing, loan, contact, duration, campaign=1):
    """Predict the 'deposit' outcome for one client with the random forest `rf`.

    Args:
        age, balance, duration, campaign: numeric features, passed to the
            model unchanged.
        job, marital, education, housing, loan, contact: categorical features
            given as text; each is mapped to the integer code used in training.
        campaign: defaults to 1 only so existing 9-argument callers keep
            working. NOTE(review): the default is a placeholder, confirm it is
            a sensible value for this dataset.

    Returns:
        'Yes' if the model predicts class 1, otherwise 'No'.

    Raises:
        ValueError: if a categorical value was not present in the training data.
    """
    row = {
        'age': age, 'job': job, 'marital': marital, 'education': education,
        'balance': balance, 'housing': housing, 'loan': loan,
        'contact': contact, 'duration': duration, 'campaign': campaign,
    }

    # Replace each categorical text value with its training-time integer code.
    for column, encoder in feature_encoders.items():
        value = str(row[column]).strip()
        if value not in encoder.classes_:
            valid = ', '.join(str(label) for label in encoder.classes_)
            raise ValueError(f"Unknown value {value!r} for '{column}'. Expected one of: {valid}")
        row[column] = int(encoder.transform([value])[0])

    # Order the columns exactly like the training matrix X; a missing column
    # raises KeyError here instead of producing a silently wrong prediction.
    input_data = pd.DataFrame([row])[list(X.columns)]

    prediction = rf.predict(input_data)
    # Presumably 'yes' was encoded as 1 (sorted order: 'no' < 'yes'), verify
    # against the labels present in the 'deposit' column.
    return 'Yes' if prediction[0] == 1 else 'No'


# One input widget per predict_bank parameter, in signature order; the last
# "number" is `campaign`, which the model was trained on.
gr.Interface(fn=predict_bank,
            inputs=["number", "text", "text", "text", "number", "text", "text", "text", "number", "number"],
            outputs="text").launch()
