# ELECCION DE HIPERPARAMETROS

In [127]:
import requests
from io import StringIO
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score as cv
from sklearn.model_selection import StratifiedKFold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  precision_score

In [128]:
# Enlace al archivo CSV en GitHub
github_link = "https://raw.githubusercontent.com/Martinerramuspe/02-DATA_SCIENCIE/main/01-TELECOM_CHURN_PROJECT/02-PREPROCESAMIENTO/Prepro01.csv"

# Descargar el archivo CSV desde GitHub
response = requests.get(github_link)

# Leer el contenido del CSV
csv_data = StringIO(response.text)

# Crear un DataFrame con los datos
df = pd.read_csv(csv_data, encoding='utf-8')

In [129]:
# Definimos las caracteristicas segun lo ya definido en la etapa de eleccion de modelo.
X= df[[ 'International plan', 'Voice mail plan', 'Number vmail messages',
       'Total day minutes', 'Total day charge', 'Total eve minutes',
        'Total eve charge', 'Total night minutes', 'Total night charge',
        'Total intl minutes', 'Total intl calls']]

y=df.Churn

In [130]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## RANDOM FOREST CON HIPERPARAMETROS PREESTABLECIDOS.

In [131]:
# Instanciamos.
forest_model = RandomForestClassifier()

In [132]:
# Fit.
forest_model.fit(X_train, y_train)

In [133]:
# Observamos los hiperparametros predefinidos por el modelo.
forest_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [134]:
# Presicion en conjunto de entrenamiento.
y_pred_train = forest_model.predict(X_train)
# Determinamos precisión en el conjunto de entrenamiento.
precision_train = precision_score(y_train, y_pred_train, average='binary')  # 'binary' contempla el balance.
precision_train

1.0

In [135]:
# Presicion en conjunto de validacion.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
forest_cv_scores = cv(forest_model, X_train, y_train, cv=skf)
forest_cv_scores

array([0.95821727, 0.94707521, 0.9637883 , 0.96100279, 0.95251397])

In [136]:
# # Presicion en conjunto de testeo.
y_pred_test = forest_model.predict(X_test)
# Determinamos precisión en el conjunto de test.
precision_test = precision_score(y_test, y_pred_test, average='binary') # 'binary' contempla el balance.
precision_test

0.9714285714285714

## GridSarch CON RANGO DE HIPERPARAMETROS

In [137]:
# Definimos rangos de  hiperparámetros
param_grid = {
    'n_estimators': [75, 100, 150],
    'max_depth': [None, 20, 30],
    'min_samples_split': [1, 2],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True],
    'ccp_alpha': [0.0],
    'class_weight': [None],
    'criterion': ['gini'],
    'max_features': ['sqrt'],
    'max_leaf_nodes': [None],
    'max_samples': [None],
    'min_impurity_decrease': [0.0],
    'min_weight_fraction_leaf': [0.0],
    'n_jobs': [None],
    'oob_score': [False],
    'random_state': [None],
    'verbose': [0],
    'warm_start': [False]
}

In [140]:
# Instanciamos GridSearchCV
grid_search = GridSearchCV(estimator=forest_model, param_grid=param_grid, cv=5, scoring='f1')  # Utilizamos "f1" para contemplar el balance.

In [141]:
#Fit
grid_search.fit(X_train, y_train)

90 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sk

In [142]:
grid_search.best_params_

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 150,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [144]:
mejor_modelo=grid_search.best_estimator_

In [145]:
y_pred = mejor_modelo.predict(X_test)

In [146]:
precision_mejor_modelo = accuracy_score(y_test, y_pred)
print("La precisión del mejor modelo es:", precision_mejor_modelo)

La precisión del mejor modelo es: 0.9732739420935412


In [30]:
grid_search.score(X_train, y_train)

1.0

In [32]:
print("Resultados de validación Stratified - Random Forest:")
print(forest_cv_scores)
print(f"Precisión media: {forest_cv_scores.mean()}\n")

Resultados de validación Stratified - Random Forest:
[0.71428571 0.84848485 0.84375    0.8        0.71428571]
Precisión media: 0.7841612554112555



In [33]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
forest_cv_scores = cv(forest_model, X_train, y_train, cv=skf)

In [34]:
print("Resultados de validación Stratified - Random Forest:")
print(forest_cv_scores)
print(f"Precisión media: {forest_cv_scores.mean()}\n")

Resultados de validación Stratified - Random Forest:
[0.95548961 0.96735905 0.97916667 0.9672619  0.95238095]
Precisión media: 0.9643316376995902



In [39]:
#presicion train
forest_model = RandomForestClassifier()
forest_model.fit(X_train,y_train)
y_pred=forest_model.predict(X_train)
accuracy_score(y_train, y_pred)

1.0

In [40]:
#presicion test
y_pred1=forest_model.predict(X_test)
accuracy_score(y_test, y_pred1)

0.9500891265597148

In [None]:
forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train)


In [35]:
best_model = grid_search.best_estimator_
best_model.score(X_train, y_train)

1.0

In [36]:
resultado=pd.DataFrame(grid_search.cv_results_)
resultado

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_features,...,param_warm_start,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001149,0.000565,0.0,0.0,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",,,,,,,,19
1,0.000706,2.3e-05,0.0,0.0,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",,,,,,,,19
2,0.000786,0.000177,0.0,0.0,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",,,,,,,,19
3,0.273891,0.006595,0.011571,0.001034,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.741935,0.758621,0.724138,0.825397,0.84375,0.778768,0.04721,13
4,0.339246,0.006716,0.013159,0.001211,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.758621,0.779661,0.766667,0.830769,0.892308,0.805605,0.050095,3
5,0.509195,0.011822,0.018145,0.00232,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.745763,0.8,0.714286,0.830769,0.857143,0.789592,0.052838,9
6,0.001087,0.000259,0.0,0.0,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",,,,,,,,19
7,0.000922,0.000109,0.0,0.0,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",,,,,,,,19
8,0.000699,1.5e-05,0.0,0.0,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",,,,,,,,19
9,0.326594,0.048855,0.013823,0.002273,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.724138,0.736842,0.714286,0.819672,0.852941,0.769576,0.055951,16


In [37]:
resultado.sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_features,...,param_warm_start,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,0.25114,0.008644,0.010875,0.000803,True,0.0,,gini,30.0,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.733333,0.819672,0.724138,0.878788,0.895522,0.810291,0.071259,1
17,0.524248,0.011305,0.019357,0.002682,True,0.0,,gini,20.0,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.733333,0.8,0.724138,0.865672,0.909091,0.806447,0.072395,2
4,0.339246,0.006716,0.013159,0.001211,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.758621,0.779661,0.766667,0.830769,0.892308,0.805605,0.050095,3
15,0.258696,0.009337,0.010847,0.000347,True,0.0,,gini,20.0,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.786885,0.8,0.736842,0.818182,0.875,0.803382,0.044854,4
29,0.493653,0.011725,0.018313,0.001721,True,0.0,,gini,30.0,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.733333,0.8,0.736842,0.830769,0.909091,0.802007,0.065209,5
28,0.336644,0.00508,0.012641,0.00046,True,0.0,,gini,30.0,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.745763,0.779661,0.736842,0.806452,0.909091,0.795562,0.061949,6
34,0.43107,0.072202,0.015541,0.002369,True,0.0,,gini,30.0,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.724138,0.819672,0.724138,0.825397,0.875,0.793669,0.059944,7
16,0.348214,0.015204,0.013476,0.000504,True,0.0,,gini,20.0,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.745763,0.779661,0.724138,0.806452,0.892308,0.789664,0.058547,8
5,0.509195,0.011822,0.018145,0.00232,True,0.0,,gini,,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.745763,0.8,0.714286,0.830769,0.857143,0.789592,0.052838,9
21,0.327572,0.064982,0.012955,0.002229,True,0.0,,gini,20.0,sqrt,...,False,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.711864,0.758621,0.724138,0.875,0.875,0.788925,0.071933,10
