Esta es una adaptación de un modelo modularizado que se puede encontrar documentado en: https://github.com/ManuelLagunas/Telecom.git

In [None]:
# Libraries ----------------------------------------

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [None]:
# Loading data ----------------------------------------

# Read the four provider tables (contract, internet, personal, phone),
# one CSV file each, into separate raw DataFrames.
contract_raw, internet_raw, personal_raw, phone_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/datasets/final_provider/contract.csv",
        "/datasets/final_provider/internet.csv",
        "/datasets/final_provider/personal.csv",
        "/datasets/final_provider/phone.csv",
    )
)

## Corrección de errores

In [None]:
# Correct name columns ----------------------------------------

def snake_case_columns(df):
    """Rename the columns of ``df`` to snake_case, in place.

    An underscore is inserted at every position where a lowercase letter is
    immediately followed by an uppercase letter, and the whole label is then
    lowercased (e.g. ``customerID`` -> ``customer_id``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with renamed columns.
    """
    def to_snake(label):
        # Zero-width match between a lowercase and an uppercase letter
        with_underscores = re.sub(r'(?<=[a-z])(?=[A-Z])', '_', label)
        return with_underscores.lower()

    df.columns = df.columns.map(to_snake)
    return df

# Normalise the column names of every raw table. snake_case_columns renames
# in place and returns its argument, so each *_df name refers to the same
# object as the corresponding *_raw name.
contract_df, internet_df, personal_df, phone_df = map(
    snake_case_columns,
    (contract_raw, internet_raw, personal_raw, phone_raw),
)

In [None]:
# Correct columns dtype ----------------------------------------

#-------- contract_df --------
# Contract start expressed as days elapsed since the earliest start date
contract_df['begin_date'] = pd.to_datetime(contract_df['begin_date']).pipe(
    lambda dates: (dates - dates.min()).dt.days
)

# Binary label: 0 when the raw end date is the literal 'No', 1 otherwise
contract_df['end_date'] = contract_df['end_date'].ne('No').astype('int64')

# One-hot encode the categorical columns, dropping the first level of each.
# The default prefix is the column name, and the dummies are appended in the
# order the columns are listed.
contract_df = pd.get_dummies(
    contract_df,
    columns=['type', 'paperless_billing', 'payment_method'],
    drop_first=True,
)

# Values that cannot be parsed as numbers become NaN
contract_df['total_charges'] = pd.to_numeric(contract_df['total_charges'], errors='coerce')
# contract_df.info()

In [None]:
#-------- internet_df --------
# One-hot encode every column except the customer key
cols_to_encode = [col for col in internet_df.columns if col != 'customer_id']
internet_df = pd.get_dummies(
    internet_df,
    columns=cols_to_encode,
    drop_first=True,
)
# internet_df.info()

In [None]:
#-------- personal_df --------
# One-hot encode everything except the customer key and 'senior_citizen',
# which is cast to a boolean column instead
cols_to_encode = [
    col
    for col in personal_df.columns
    if col not in ('customer_id', 'senior_citizen')
]
personal_df = pd.get_dummies(
    personal_df,
    columns=cols_to_encode,
    drop_first=True,
)
personal_df['senior_citizen'] = personal_df['senior_citizen'].astype(bool)
# personal_df.info()

In [None]:
#-------- phone_df --------
# One-hot encode 'multiple_lines'; the default dummy prefix is the column
# name itself, so it does not need to be passed explicitly
phone_df = pd.get_dummies(
    phone_df,
    columns=['multiple_lines'],
    drop_first=True,
)
# phone_df.info()

## Fusión de dataframes

In [None]:
# Dataframes fusion ----------------------------------------

# Outer-join the four tables on the customer key so that a customer missing
# from any one table is still kept (with NaN in that table's columns)
merged_df = (
    contract_df
    .merge(internet_df, on='customer_id', how='outer')
    .merge(personal_df, on='customer_id', how='outer')
    .merge(phone_df, on='customer_id', how='outer')
)

In [None]:
# Fill missing values ----------------------------------------

# Drop the rows whose total charges are missing. The previous
# merged_df['total_charges'].dropna(inplace=True) only acted on the selected
# column Series and left merged_df untouched, so those NaN values were then
# overwritten with False by the fill below.
merged_df = merged_df.dropna(subset=['total_charges'])

# Every remaining NaN is replaced by False (presumably these come from the
# outer merges, i.e. customers with no row in one of the service tables --
# TODO confirm). infer_objects() converts columns left as generic `object`
# after the fill back to a concrete dtype where possible.
merged_df = merged_df.fillna(False).infer_objects()

In [None]:
# Correct columns dtype ----------------------------------------

# merged_df['begin_date'] = pd.to_datetime(merged_df['begin_date'])
# Force both columns to a numeric dtype; unparseable values become NaN
merged_df = merged_df.assign(
    end_date=pd.to_numeric(merged_df['end_date'], errors='coerce'),
    total_charges=pd.to_numeric(merged_df['total_charges'], errors='coerce'),
)

## Estudio de desbalanceo de clases

In [None]:
# Check unbalanced data ----------------------------------------

# Tally how many rows of 'end_date' carry each of the two labels
count_1 = (merged_df['end_date'] == 1).sum()
count_0 = (merged_df['end_date'] == 0).sum()

# Report both tallies
print("Number of 1s:", count_1)
print("Number of 0s:", count_0)

In [None]:
# Plotting data ----------------------------------------
# Bar chart of the number of rows in each 'end_date' class. It is drawn with
# pandas/matplotlib because seaborn (`sns`) is never imported in this
# notebook, so the former sns.countplot call failed with NameError.
class_counts = merged_df['end_date'].value_counts().sort_index()
class_counts.plot(kind='bar', rot=0)
plt.xlabel('end_date')
plt.ylabel('count')
plt.show()

In [None]:
# Share of each 'end_date' label, as a percentage of all rows in merged_df
percentage_1, percentage_0 = (
    count / len(merged_df) * 100 for count in (count_1, count_0)
)

# Report both shares
print("Percentage of 1s:", percentage_1)
print("Percentage of 0s:", percentage_0)

In [None]:
# Class imbalance expressed as the number of rows labelled 0 per row
# labelled 1 in 'end_date' (count_0 and count_1 come from the class-count
# cell earlier in the notebook)
ratio_0_to_1 = count_0 / count_1
# Print the ratio
print("Ratio of 0s to 1s:", ratio_0_to_1)

## Creación de características y objetivo

In [None]:
# features creation ----------------------------------------

# Model inputs: every merged column except the customer key and the label.
# With `columns=` given, drop() already targets the column axis.
features = merged_df.drop(columns=['customer_id', 'end_date'])

In [None]:
# target creation ----------------------------------------

# Label to predict: the 'end_date' column, which was encoded earlier in the
# notebook as 0 where the raw value was 'No' and 1 otherwise
target = merged_df['end_date']

## Creación de conjunto de datos

In [None]:
# Split data ----------------------------------------

# Hold out 25% of the rows for testing; the fixed seed makes the split
# reproducible
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

## Sobremuestreo

In [None]:
# Upsampler creation ---------------------------------------- 

def upsample(features, target):
    """Balance a binary dataset by resampling class 1 with replacement.

    Rows labelled 1 are drawn with replacement until there are exactly as
    many of them as rows labelled 0; both groups are then concatenated and
    shuffled. The sampling and the shuffle use a fixed seed (123), so the
    result is reproducible.

    The draw is done with ``DataFrame.sample`` because ``resample`` (used
    here previously) is never imported in this notebook, which made every
    call fail with NameError.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows. Must share its index with ``target``.
    target : pandas.Series or single-column pandas.DataFrame
        Labels, expected to be 0 / 1. A one-column DataFrame is squeezed to
        a Series. Its name is used as the temporary label column, so it must
        not clash with a feature column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The resampled features and the matching labels, in shuffled order.

    Raises
    ------
    ValueError
        If there are rows labelled 0 but none labelled 1, since there is
        nothing to draw from.
    """
    # Convert target to a pandas Series if it's a one-column DataFrame
    if isinstance(target, pd.DataFrame):
        target = target.squeeze()

    # Combine features and target so rows stay aligned while resampling
    df = pd.concat([features, target], axis=1)

    # Class separation
    df_majority = df[target == 0]
    df_minority = df[target == 1]

    if df_minority.empty and not df_majority.empty:
        raise ValueError("Cannot upsample: no rows with target == 1")

    # Draw class-1 rows with replacement up to the size of class 0
    df_minority_upsampled = df_minority.sample(
        n=len(df_majority),
        replace=True,
        random_state=123,
    )

    # Combine both classes and shuffle so they are not grouped together
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])
    df_upsampled = df_upsampled.sample(frac=1, random_state=123)

    # Separate features and target again
    features_upsampled = df_upsampled.drop(target.name, axis=1)
    target_upsampled = df_upsampled[target.name]

    return features_upsampled, target_upsampled

In [None]:
# Upsampling ----------------------------------------

# Balance the training split only; the test split is left untouched
features_train_upsampled, target_train_upsampled = upsample(
    features=features_train,
    target=target_train,
)

## Submuestreo

In [None]:
# Downsampler creation ----------------------------------------

def downsample(features, target):
    """Balance a binary dataset by discarding part of class 0.

    A random subset of the rows labelled 0 is kept, sized as the fraction
    ``len(class 1) / len(class 0)`` of class 0; it is concatenated with all
    rows labelled 1 and the result is shuffled. Both random steps use the
    fixed seed 123.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows. Must share its index with ``target``.
    target : pandas.Series or single-column pandas.DataFrame
        Labels, expected to be 0 / 1. A one-column DataFrame is squeezed to
        a Series; its name is used as the temporary label column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The reduced features and the matching labels, in shuffled order.
    """
    labels = target.squeeze() if isinstance(target, pd.DataFrame) else target
    label_col = labels.name

    # Keep features and labels side by side so rows stay aligned
    combined = pd.concat([features, labels], axis=1)
    majority_rows = combined[labels == 0]
    minority_rows = combined[labels == 1]

    # Fraction of class-0 rows to keep so both classes end up the same size
    keep_ratio = len(minority_rows) / len(majority_rows)
    majority_kept = majority_rows.sample(frac=keep_ratio, random_state=123)

    # Stack both classes, then shuffle so they are not grouped together
    balanced = pd.concat([majority_kept, minority_rows]).sample(
        frac=1, random_state=123
    )

    return balanced.drop(label_col, axis=1), balanced[label_col]

In [None]:
# Downsampling ----------------------------------------

# Balance the training split only; the test split is left untouched
features_train_downsampled, target_train_downsampled = downsample(
    features=features_train,
    target=target_train,
)

## Modelos de control

In [None]:
# Tuning models ----------------------------------------

# ---------------- Logistic Regression ----------------
# Search space: regularisation type, inverse regularisation strength and
# solver
param_grid = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

# Baseline estimator, default settings
logreg = LogisticRegression()

# Exhaustive 5-fold search ranked by ROC AUC, run on the unbalanced
# training split
grid_search = GridSearchCV(logreg, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(features_train, target_train)

# Winning combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best AUC-ROC Score:", best_score)

In [None]:
# ---------------- Random Forest ----------------
# Define the hyperparameters grid
param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create the random forest model
rf = RandomForestClassifier()

# Create the GridSearchCV object.
# The built-in 'roc_auc' scorer ranks candidates on their continuous scores.
# make_scorer(roc_auc_score) with default arguments scores the hard labels
# returned by predict(), which is not a true ROC AUC and is not comparable
# with the 'roc_auc' scorer used for the logistic regression search.
grid_search = GridSearchCV(
    estimator=rf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(features_train, target_train)

# Get the best hyperparameters and the corresponding AUC-ROC score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and the corresponding AUC-ROC score
print("Best Hyperparameters:", best_params)
print("Best AUC-ROC Score:", best_score)

In [None]:
# ---------------- LightGBM ----------------
# Define the hyperparameters grid
param_grid = {
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [None, 10, 20, 30],
    'num_leaves': [31, 62, 93],
    'min_child_samples': [20, 30, 40]
}

# Create the LightGBM model
lgbm = LGBMClassifier()

# Create the GridSearchCV object.
# The built-in 'roc_auc' scorer ranks candidates on their continuous scores.
# make_scorer(roc_auc_score) with default arguments scores the hard labels
# returned by predict(), which is not a true ROC AUC and is not comparable
# with the 'roc_auc' scorer used for the logistic regression search.
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(features_train, target_train)

# Get the best hyperparameters and the corresponding AUC-ROC score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and the corresponding AUC-ROC score
print("Best Hyperparameters:", best_params)
print("Best AUC-ROC Score:", best_score)

## Modelos con balanceo

In [None]:
# Tuning models ----------------------------------------

# ---------------- Logistic Regression ----------------
# Same search space as the control run, with class weighting fixed to
# 'balanced'
param_grid = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
    class_weight=['balanced'],
)

# Estimator with default settings; class_weight is supplied by the grid
logreg = LogisticRegression()

# Exhaustive 5-fold search ranked by ROC AUC on the training split
grid_search = GridSearchCV(logreg, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(features_train, target_train)

# Winning combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best AUC-ROC Score:", best_score)

In [None]:
# ---------------- Random Forest ----------------
# Define the hyperparameters grid (includes both class-weighting modes)
param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Create the random forest model
rf = RandomForestClassifier()

# Create the GridSearchCV object.
# The built-in 'roc_auc' scorer ranks candidates on their continuous scores.
# make_scorer(roc_auc_score) with default arguments scores the hard labels
# returned by predict(), which is not a true ROC AUC and is not comparable
# with the 'roc_auc' scorer used for the logistic regression search.
grid_search = GridSearchCV(
    estimator=rf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(features_train, target_train)

# Get the best hyperparameters and the corresponding AUC-ROC score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and the corresponding AUC-ROC score
print("Best Hyperparameters:", best_params)
print("Best AUC-ROC Score:", best_score)

In [None]:
# ---------------- LightGBM ----------------
# Define the hyperparameters grid (class weighting fixed to 'balanced')
param_grid = {
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [None, 10, 20, 30],
    'num_leaves': [31, 62, 93],
    'min_child_samples': [20, 30, 40],
    'class_weight': ['balanced']
}

# Create the LightGBM model
lgbm = LGBMClassifier()

# Create the GridSearchCV object.
# The built-in 'roc_auc' scorer ranks candidates on their continuous scores.
# make_scorer(roc_auc_score) with default arguments scores the hard labels
# returned by predict(), which is not a true ROC AUC and is not comparable
# with the 'roc_auc' scorer used for the logistic regression search.
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(features_train, target_train)

# Get the best hyperparameters and the corresponding AUC-ROC score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and the corresponding AUC-ROC score
print("Best Hyperparameters:", best_params)
print("Best AUC-ROC Score:", best_score)

# Create the best model ----------------------------------------

# Refit a fresh classifier on the training split with the winning settings
lgbm_balanced = LGBMClassifier(**best_params)
lgbm_balanced.fit(features_train, target_train)

## Modelos con sobremuestreo

In [None]:
# Tuning models ----------------------------------------

# ---------------- Logistic Regression ----------------
# Search space: regularisation type, inverse regularisation strength and
# solver
param_grid = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

# Estimator with default settings
logreg = LogisticRegression()

# Exhaustive 5-fold search ranked by ROC AUC, run on the upsampled
# training data
grid_search = GridSearchCV(logreg, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(features_train_upsampled, target_train_upsampled)

# Winning combination and its mean cross-validated score
best_params_lr, best_score_lr = grid_search.best_params_, grid_search.best_score_

print("Best Hyperparameters:", best_params_lr)
print("Best AUC-ROC Score:", best_score_lr)

In [None]:
# ---------------- Random Forest ----------------
# Define the hyperparameters grid
param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create the random forest model
rf = RandomForestClassifier()

# Create the GridSearchCV object.
# The built-in 'roc_auc' scorer ranks candidates on their continuous scores.
# make_scorer(roc_auc_score) with default arguments scores the hard labels
# returned by predict(), which is not a true ROC AUC and is not comparable
# with the 'roc_auc' scorer used for the logistic regression search.
grid_search = GridSearchCV(
    estimator=rf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Fit the model to the upsampled training data
grid_search.fit(features_train_upsampled, target_train_upsampled)

# Get the best hyperparameters and the corresponding AUC-ROC score
best_params_rf = grid_search.best_params_
best_score_rf = grid_search.best_score_

# Print the best hyperparameters and the corresponding AUC-ROC score
print("Best Hyperparameters:", best_params_rf)
print("Best AUC-ROC Score:", best_score_rf)

In [None]:
# ---------------- LightGBM ----------------
# Define the hyperparameters grid. This cell previously had no grid of its
# own and silently reused the Random Forest grid left in `param_grid` by the
# preceding cell, so the search ran over min_samples_split / min_samples_leaf
# (not LightGBM parameters) and never tuned the LightGBM settings explored in
# the other LightGBM searches.
param_grid = {
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [None, 10, 20, 30],
    'num_leaves': [31, 62, 93],
    'min_child_samples': [20, 30, 40]
}

# Create the LightGBM model
lgbm = LGBMClassifier()

# Create the GridSearchCV object.
# The built-in 'roc_auc' scorer ranks candidates on their continuous scores.
# make_scorer(roc_auc_score) with default arguments scores the hard labels
# returned by predict(), which is not a true ROC AUC.
# NOTE(review): the upsampled set contains repeated minority rows, so copies
# of one row can fall in both the training and the validation folds; this
# cross-validated score is likely optimistic.
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Fit the model to the upsampled training data
grid_search.fit(features_train_upsampled, target_train_upsampled)

# Get the best hyperparameters and the corresponding AUC-ROC score
best_params_lgbm = grid_search.best_params_
best_score_lgbm = grid_search.best_score_

# Print the best hyperparameters and the corresponding AUC-ROC score
print("Best Hyperparameters:", best_params_lgbm)
print("Best AUC-ROC Score:", best_score_lgbm)

In [None]:
# Create the best model ----------------------------------------

# Build a fresh LightGBM classifier from the winning settings of the
# upsampled search and train it on the full upsampled training data
lgbm_upsampled = LGBMClassifier(**best_params_lgbm)
lgbm_upsampled.fit(
    X=features_train_upsampled,
    y=target_train_upsampled,
)

## Modelos con submuestreo

In [None]:
# ---------------- Logistic Regression ----------------
# Search space: regularisation type, inverse regularisation strength and
# solver
param_grid = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

# Estimator with default settings
logreg = LogisticRegression()

# Exhaustive 5-fold search ranked by ROC AUC, run on the downsampled
# training data
grid_search = GridSearchCV(logreg, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(features_train_downsampled, target_train_downsampled)

# Winning combination and its mean cross-validated score
best_params_lr, best_score_lr = grid_search.best_params_, grid_search.best_score_

print("Best Hyperparameters:", best_params_lr)
print("Best AUC-ROC Score:", best_score_lr)

In [None]:
# ---------------- Random Forest ----------------
# Define the hyperparameters grid
param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create the random forest model
rf = RandomForestClassifier()

# Create the GridSearchCV object.
# The built-in 'roc_auc' scorer ranks candidates on their continuous scores.
# make_scorer(roc_auc_score) with default arguments scores the hard labels
# returned by predict(), which is not a true ROC AUC and is not comparable
# with the 'roc_auc' scorer used for the logistic regression search.
grid_search = GridSearchCV(
    estimator=rf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Fit the model to the downsampled training data
grid_search.fit(features_train_downsampled, target_train_downsampled)

# Get the best hyperparameters and the corresponding AUC-ROC score
best_params_rf = grid_search.best_params_
best_score_rf = grid_search.best_score_

# Print the best hyperparameters and the corresponding AUC-ROC score
print("Best Hyperparameters:", best_params_rf)
print("Best AUC-ROC Score:", best_score_rf)

In [None]:
# ---------------- LightGBM ----------------
# Define the hyperparameters grid. This cell previously had no grid of its
# own and silently reused the Random Forest grid left in `param_grid` by the
# preceding cell, so the search ran over min_samples_split / min_samples_leaf
# (not LightGBM parameters) and never tuned the LightGBM settings explored in
# the other LightGBM searches.
param_grid = {
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [None, 10, 20, 30],
    'num_leaves': [31, 62, 93],
    'min_child_samples': [20, 30, 40]
}

# Create the LightGBM model
lgbm = LGBMClassifier()

# Create the GridSearchCV object.
# The built-in 'roc_auc' scorer ranks candidates on their continuous scores.
# make_scorer(roc_auc_score) with default arguments scores the hard labels
# returned by predict(), which is not a true ROC AUC.
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Fit the model to the downsampled training data
grid_search.fit(features_train_downsampled, target_train_downsampled)

# Get the best hyperparameters and the corresponding AUC-ROC score
best_params_lgbm = grid_search.best_params_
best_score_lgbm = grid_search.best_score_

# Print the best hyperparameters and the corresponding AUC-ROC score
print("Best Hyperparameters:", best_params_lgbm)
print("Best AUC-ROC Score:", best_score_lgbm)

## Evaluación del modelo

In [None]:
# Model application ----------------------------------------

# Hard 0/1 class predictions on the held-out test split
predictions = lgbm_upsampled.predict(features_test)

# ROC AUC measures how well a continuous score ranks the positives, so it is
# computed from the predicted probability of class 1. Feeding it the hard
# labels collapses the curve to a single operating point and misreports the
# metric.
positive_proba = lgbm_upsampled.predict_proba(features_test)[:, 1]
auc_roc = roc_auc_score(target_test, positive_proba)
print("AUC-ROC:", auc_roc)

In [None]:
# AUC-ROC graph ----------------------------------------

# The curve needs a continuous score to sweep thresholds over, so it is built
# from the predicted probability of class 1. With hard 0/1 labels roc_curve
# only yields a single intermediate point.
roc_scores = lgbm_upsampled.predict_proba(features_test)[:, 1]
fpr, tpr, thresholds = roc_curve(target_test, roc_scores)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.show()