# Scikit- Learn

- **Estimadores**: StandarScaler, LogisticRegression
- **Transformadores**: StandarScaler.transform
- **Predictores**: LogisticRegression.predict

In [2]:
# Codigo de ejemplo
# Librerias
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
# Cargamos el dataset
data = load_iris()
X = data.data
y = data.target

In [4]:
# Dividmos el dataset en conjunto de entrenamiento y prueba
X_entrena, X_prueba, y_entrena, y_prueba = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [5]:
# Creamos una instancia del escalador
scaler = StandardScaler()

In [6]:
# Estimador (StandardScaler): Aprendemos los parametros deescalado fit
scaler.fit(X_entrena)

In [7]:
# Transformador (StandardScaler): Aplicamos la transformacion a los datos de entrenamiento y prueba
X_entrena_escalado = scaler.transform(X_entrena)
X_prueba_escalado = scaler.transform(X_prueba)

In [8]:
# Creamos una instancia del modelo
modelo = LogisticRegression()

In [9]:
# Estimador (LogisticRegression): Entrenamos el modelo con los datos escalados
modelo.fit(X_entrena_escalado, y_entrena)

In [10]:
# Predictor (LogisticRegression): Hacemos predicciones y evaluamos el modelo
y_pred = modelo.predict(X_prueba_escalado)
puntaje = modelo.score(X_prueba_escalado, y_prueba)
print(f'Las predicciones son: {y_pred}')
print(f'La precision del modelo es: {puntaje: .2f}')

Las predicciones son: [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
La precision del modelo es:  0.97


# Preprocesamiento de datos

In [11]:
import numpy as np 
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

In [12]:
data = np.array([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
data

array([[ 1, -1,  2],
       [ 2,  0,  0],
       [ 0,  1, -1]])

In [13]:
scaler = MinMaxScaler(feature_range=(0, 1))
data_escalada = scaler.fit_transform(data)
data_escalada

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [14]:
iris = load_iris()
X = iris.data

In [15]:
X_escalada = scaler.fit_transform(X)

In [16]:
print(X[:5])
print(X_escalada[:5])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[[0.22222222 0.625      0.06779661 0.04166667]
 [0.16666667 0.41666667 0.06779661 0.04166667]
 [0.11111111 0.5        0.05084746 0.04166667]
 [0.08333333 0.45833333 0.08474576 0.04166667]
 [0.19444444 0.66666667 0.06779661 0.04166667]]


In [17]:
scaler2 = StandardScaler()
data_escalada2 = scaler2.fit_transform(data)
data_escalada2


array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [18]:
np.std(data_escalada2)

np.float64(1.0)

In [19]:
categorias = np.array([["rojo"], ["verde"], ["azul"], ["verde"], ["verde"], ["azul"]])
categorias

array([['rojo'],
       ['verde'],
       ['azul'],
       ['verde'],
       ['verde'],
       ['azul']], dtype='<U5')

In [20]:
encoder = OneHotEncoder(sparse_output=False)
data_codificada = encoder.fit_transform(categorias)
data_codificada

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

## Seleccion y division

In [21]:
from sklearn.feature_selection import SelectKBest, SelectFromModel, chi2
from sklearn.ensemble import RandomForestClassifier

In [22]:
data = load_iris()
X = data.data
y = data.target

In [23]:
X_entrena, X_prueba, y_entrenamiento, y_prueba = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [24]:
print('Size del conjunto totla: ', len(X))
print('Size del conjunto entrenamiento: ', len(X_entrena))
print('Size del conjunto pruebas: ', len(X_prueba))

Size del conjunto totla:  150
Size del conjunto entrenamiento:  120
Size del conjunto pruebas:  30


In [39]:
selector = SelectKBest(score_func=chi2, k=2)
X_nuevo = selector.fit_transform(X_entrena, y_entrenamiento)

In [26]:
print(X_entrena[:5])
print(X_nuevo[:5])

[[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]]
[[1.  0.2]
 [1.5 0.4]
 [4.4 1.4]
 [1.6 0.2]
 [1.3 0.2]]


In [27]:
modelo = RandomForestClassifier(n_estimators=100, random_state=42)
selector2 = SelectFromModel(modelo)
X_importante = selector2.fit_transform(X_entrena, y_entrenamiento)

In [28]:
print(X_entrena[:5])
print(X_importante[:5])

[[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]]
[[1.  0.2]
 [1.5 0.4]
 [4.4 1.4]
 [1.6 0.2]
 [1.3 0.2]]


# Evaluacion de modelos

In [29]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [30]:
data = load_iris()
X =data.data
y = data.target

In [31]:
modelo = RandomForestClassifier(n_estimators=100, random_state=42)

In [32]:
puntaje = cross_val_score(modelo, X, y, cv=5)

In [33]:
print('Exactitud de cada particion: ', puntaje)
print('Media de exactitud: ', puntaje.mean())

Exactitud de cada particion:  [0.96666667 0.96666667 0.93333333 0.96666667 1.        ]
Media de exactitud:  0.9666666666666668


## Ajuste de Hiperparametros

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
parametros = {
    "n_estimators": [50, 100, 200],
    "max_features": ["auto", "sqrt", "log2"],
    "max_depth": [4, 5, 6, 7, 8],
    "criterion": ["gini", "entropy"],
}

In [36]:
mi_grid_search = GridSearchCV(
    estimator=modelo, param_grid=parametros, cv=5, scoring="accuracy"
)

In [37]:
mi_grid_search.fit(X, y)

150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "/home/anderson-josue/anaconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/anderson-josue/anaconda3/lib/python3.13/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "/home/anderson-josue/anaconda3/lib/python3.13/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
    ~~~~~~~~~~~~~~~~

In [38]:
print('Mejores parametros: ',mi_grid_search.best_params_)
print('Mejor exactitud: ', mi_grid_search.best_score_)

Mejores parametros:  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 50}
Mejor exactitud:  0.9666666666666668
