In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Load the training table (presumably already preprocessed, judging by the
# file name -- TODO confirm what preprocessing was applied upstream).
# NOTE(review): this is an absolute path inside one user's home directory,
# and the submission cell further down reads test.csv from a different
# user's home directory, so the notebook cannot run end-to-end on a single
# machine without editing paths. Consider a configurable, relative DATA_DIR.
df = pd.read_csv("/home/ander/Documentos/Universidad/ML/proyecto/train_processed.csv")

def one_hot_encode_categorical(df, column):
    """One-hot encode a single categorical column of a DataFrame.

    A fresh ``OneHotEncoder`` is fitted on ``df[column]``; the resulting
    indicator columns are appended after the remaining columns and the
    original column is removed. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the categorical column to expand.

    Returns
    -------
    tuple[pandas.DataFrame, OneHotEncoder]
        The new frame and the fitted encoder, so the same encoding can be
        re-applied to other data with ``encoder.transform``. The encoder is
        created with ``handle_unknown='ignore'``.
    """
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_values = encoder.fit_transform(df[[column]])
    encoded_df = pd.DataFrame(
        encoded_values,
        columns=encoder.get_feature_names_out([column]),
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so
        # a default RangeIndex here would misalign rows (and add NaN-filled
        # rows) whenever df has been filtered, shuffled or otherwise re-indexed.
        index=df.index,
    )
    df = pd.concat([df.drop(columns=column), encoded_df], axis=1)
    return df, encoder

# Columns that get expanded into one-hot indicator columns.
categorical_columns = ['Geography']
df = df.copy()

# Keep every fitted encoder, keyed by its source column, so the identical
# encoding can be re-applied to other data later in the notebook.
encoders = {}
for column in categorical_columns:
    df, encoders[column] = one_hot_encode_categorical(df, column)

# Binary-encode Gender. Series.map turns any value that is not a key of the
# mapping into NaN.
gender_codes = {'Male': 0, 'Female': 1}
df['Gender'] = df['Gender'].map(gender_codes)


In [None]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# "Exited" is the prediction target; every other column is used as a feature.
y = df["Exited"]
X = df.drop("Exited", axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
from tpot import TPOTClassifier
import os


# Cap the native numeric thread pools at one thread each -- presumably so the
# 8 TPOT workers below do not oversubscribe the CPU (TODO confirm intent).
# NOTE(review): numpy was already imported in an earlier cell, so libraries
# loaded in this process may have read these variables before they were set.
for _thread_var in (
    'OMP_NUM_THREADS',
    'OPENBLAS_NUM_THREADS',
    'MKL_NUM_THREADS',
    'VECLIB_MAXIMUM_THREADS',
    'NUMEXPR_NUM_THREADS',
):
    os.environ[_thread_var] = '1'

# Search configuration handed to TPOT unchanged.
tpot_settings = dict(
    generations=60,
    population_size=20,
    random_state=52,
    verbose=2,
    n_jobs=8,
    cv=5,
    max_time_mins=30,
)
model = TPOTClassifier(**tpot_settings)
model.fit(X_train, y_train)



In [None]:
# Show the pipeline TPOT selected and fitted during the search above.
best_pipeline = model.fitted_pipeline_
print("Best pipeline:", best_pipeline)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the fitted model on the held-out split.
y_pred = model.predict(X_test)

holdout_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", holdout_accuracy)

holdout_report = classification_report(y_test, y_pred)
print("Classification Report:\n", holdout_report)

holdout_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", holdout_confusion)

# ROC AUC is computed from the second predict_proba column (index 1), not
# from the hard labels in y_pred.
positive_class_scores = model.predict_proba(X_test)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, positive_class_scores))

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pandas as pd

# ROC analysis on the held-out split. The scores are computed once (the cell
# previously ran predict_proba / roc_curve / auc twice with identical inputs).
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Save the curve to CSV; the scalar AUC is repeated on every row of the
# 'auc' column.
roc_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds})
roc_df['auc'] = roc_auc
roc_df.to_csv('roc_modelo.csv', index=False)

# Plot the ROC curve together with the diagonal from (0, 0) to (1, 1).
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.5f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - TPOT Model')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# Build the submission file: apply the training-time encodings to the raw
# test set, score it with the fitted model and write id / probability pairs.

# NOTE(review): absolute path under a different user's home directory than
# the training CSV loaded at the top of the notebook -- confirm and unify.
test = pd.read_csv("/home/cristian/courses/machine_learning_UNAL/customer_churn_prediction/data/raw/test.csv")
test1 = test.drop(columns=["Surname", "id", "CustomerId"])

# Re-use the encoders fitted on the training data (transform only, no refit)
# so the test set gets exactly the same indicator columns.
test_encoded = test1.copy()
for column in categorical_columns:
    one = encoders[column]
    encoded = one.transform(test_encoded[[column]])
    encoded_df = pd.DataFrame(encoded, columns=one.get_feature_names_out([column]), index=test_encoded.index)
    test_encoded = pd.concat([test_encoded.drop(column, axis=1), encoded_df], axis=1)

# Gender column: same mapping as used for the training frame.
test_encoded['Gender'] = test_encoded['Gender'].map({'Male': 0, 'Female': 1})

# Present the features in the exact column order the model was fitted on.
# Only done when every training column is available; otherwise the frame is
# left untouched so the model reports the mismatch itself.
train_columns = list(X_train.columns)
if set(train_columns).issubset(test_encoded.columns):
    test_encoded = test_encoded[train_columns]

predictions = model.predict_proba(test_encoded)[:, 1]

submission = pd.DataFrame({
    'id': test['id'],
    'Exited': predictions
})

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written.
submission_path = "submission_tpot2.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")


In [None]:
# Write the best pipeline found by the search to 'tpot_best_pipeline.py'
# (relative to the notebook's working directory).
# NOTE(review): `export` belongs to the classic TPOT API, while the
# `verbose=` keyword used when constructing the model looks like the newer
# API -- confirm the installed TPOT version provides this method.
model.export('tpot_best_pipeline.py')