# Diabetes prediction solution

## Random forest model

In [2]:
import pandas as pd

train_data = pd.read_csv("/workspaces/machine-learning-python-template/data/processed/clean_train.csv")
test_data = pd.read_csv("/workspaces/machine-learning-python-template/data/processed/clean_test.csv")

train_data.head()



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,Pregnancies,Glucose,BMI,Age,Outcome
0,2.0,84.0,0.0,21.0,0
1,9.0,112.0,28.2,50.0,1
2,1.0,139.0,28.7,22.0,0
3,0.0,161.0,21.9,65.0,0
4,6.0,134.0,46.2,46.0,1


In [3]:
X_train = train_data.drop(["Outcome"], axis = 1)
y_train = train_data["Outcome"]
X_test = test_data.drop(["Outcome"], axis = 1)
y_test = test_data["Outcome"]

In [4]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 60, random_state = 42)
model.fit(X_train, y_train)

In [5]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [6]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.7532467532467533

## **Model optimization**

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Definir el modelo
rf_classifier = RandomForestClassifier(random_state=42)

# Definir la cuadrícula de búsqueda de hiperparámetros
param_grid = {
    "criterion": ["gini", "entropy"],
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Configurar la búsqueda de cuadrícula
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Ajustar el modelo a los datos de entrenamiento
grid_search.fit(X_train, y_train)

# Obtener los mejores hiperparámetros
best_params = grid_search.best_params_
print("Mejores hiperparámetros:", best_params)



Mejores hiperparámetros: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


In [8]:
# Definir los mejores hiperparámetros con 'max_features' corregido
best_params = {'criterion': 'entropy','max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}

# Crear un nuevo modelo RandomForestClassifier con los mejores hiperparámetros
rf_classifier_best = RandomForestClassifier(**best_params)

# Ajustar el modelo a los datos de entrenamiento
rf_classifier_best.fit(X_train, y_train)

# Realizar predicciones en los datos de prueba
y_pred = rf_classifier_best.predict(X_test)


In [9]:
accuracy_score(y_test, y_pred)

0.7792207792207793

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Definir el modelo
rf_classifier = RandomForestClassifier()

# Definir la cuadrícula de búsqueda de hiperparámetros
param_grid = {
    "n_estimators": [50, 100, 150],
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "min_weight_fraction_leaf": [0.0],
    "max_features": ['sqrt', 'log2'],
    "max_leaf_nodes": [None],
    "min_impurity_decrease": [0.0],
    "bootstrap": [True],
    "oob_score": [False],
    "n_jobs": [None],
    "random_state": [None],
    "verbose": [0],
    "warm_start": [False],
    "class_weight": [None],
    "ccp_alpha": [0.0],
    "max_samples": [None],
    "monotonic_cst": [None]
}

# Configurar la búsqueda de cuadrícula
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Ajustar el modelo a los datos de entrenamiento
grid_search.fit(X_train, y_train)

# Obtener los mejores hiperparámetros
best_params = grid_search.best_params_
print("Mejores hiperparámetros:", best_params)


Mejores hiperparámetros: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [36]:


# Obtener los mejores hiperparámetros
best_params = grid_search.best_params_

# Crear un nuevo modelo RandomForestClassifier con los mejores hiperparámetros
rf_classifier_best = RandomForestClassifier(**best_params)

# Ajustar el modelo a los datos de entrenamiento
rf_classifier_best.fit(X_train, y_train)

# Realizar predicciones en los datos de prueba
y_pred = rf_classifier_best.predict(X_test)


In [37]:
from sklearn.metrics import accuracy_score

# Realizar predicciones en los datos de prueba
y_pred = rf_classifier_best.predict(X_test)

# Calcular la precisión
accuracy = accuracy_score(y_test, y_pred)

# Imprimir la precisión
print(f'Precisión del modelo: {accuracy:}')


Precisión del modelo: 0.7857142857142857


In [38]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import subprocess
import os

# Directorio donde se guardarán las imágenes .png
png_directory = "arboles"
os.makedirs(png_directory, exist_ok=True)
# Convertir archivos .dot a imágenes .png
for tree_number, tree in enumerate(model.estimators_):
    # Visualize the decision tree using Graphviz
    dot_data = export_graphviz(tree, out_file=None,
                               feature_names=X_train.columns,
                               class_names = list(map(str, y_train.unique())),
                               filled=True, rounded=True,
                               special_characters=True)

    # Guardar cada árbol como un archivo .dot
    dot_filename = f"tree_{tree_number}.dot"
    dot_path = os.path.join(png_directory, dot_filename)
    with open(dot_path, 'w') as dot_file:
        dot_file.write(dot_data)

    # Convertir .dot a .png
    png_filename = f"tree_{tree_number}.png"
    png_path = os.path.join(png_directory, png_filename)
    #subprocess.run(["dot", "-Tpng", dot_path, "-o", png_path])
    print(f"Tree {tree_number} saved as {png_filename}")

Tree 0 saved as tree_0.png
Tree 1 saved as tree_1.png
Tree 2 saved as tree_2.png
Tree 3 saved as tree_3.png
Tree 4 saved as tree_4.png
Tree 5 saved as tree_5.png
Tree 6 saved as tree_6.png
Tree 7 saved as tree_7.png
Tree 8 saved as tree_8.png
Tree 9 saved as tree_9.png
Tree 10 saved as tree_10.png
Tree 11 saved as tree_11.png
Tree 12 saved as tree_12.png
Tree 13 saved as tree_13.png
Tree 14 saved as tree_14.png
Tree 15 saved as tree_15.png
Tree 16 saved as tree_16.png
Tree 17 saved as tree_17.png
Tree 18 saved as tree_18.png
Tree 19 saved as tree_19.png
Tree 20 saved as tree_20.png
Tree 21 saved as tree_21.png
Tree 22 saved as tree_22.png
Tree 23 saved as tree_23.png
Tree 24 saved as tree_24.png
Tree 25 saved as tree_25.png
Tree 26 saved as tree_26.png
Tree 27 saved as tree_27.png
Tree 28 saved as tree_28.png
Tree 29 saved as tree_29.png
Tree 30 saved as tree_30.png
Tree 31 saved as tree_31.png
Tree 32 saved as tree_32.png
Tree 33 saved as tree_33.png
Tree 34 saved as tree_34.png
Tree 

The base random forest is better than the optimized decision tree.

In [54]:
from pickle import dump

dump(model, open("../models/ranfor_classifier_max_depth_10max_features_sqrt_min_samples_leaf_4min_samples_split_10n_estimators_50.sav", "wb"))