# Minicurso Análisis de Datos usando Python
## Parte 5 - Introducción al Aprendizaje Automático usando Python

In [5]:
import numpy as np

### Cargando los datos (características y etiquetas)

In [13]:
X = np.load('X.npy')
X_transf = np.load('X_transf.npy')
y = np.load('y.npy')

### Entrenando y evaluando el modelo

In [14]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [15]:
import sklearn
sklearn.__version__

'0.24.1'

In [17]:
num_folds = 10
seed = 7

i = 0
kfold = model_selection.KFold(n_splits=num_folds, random_state=seed)
accuracy = np.zeros(num_folds)
for train_idx, test_idx in kfold.split(X):
    print(len(train_idx), len(test_idx))
    
    # TREINO
    model = LogisticRegression()
    model.fit(X[train_idx], y[train_idx])    
    
    # FAZER PREDIÇÃO A PARTIR DO MODELO TREINADO
    y_pred = model.predict(X[test_idx])
    
    # AVALIAR A TÉCNICA ATRAVÉS DA COMPARAÇÃO DO DADO REAL COM O DADO PREVISTO 
    accuracy[i] = accuracy_score(y[test_idx], y_pred)
    i += 1

print(accuracy)
print(np.average(accuracy), np.std(accuracy))

ValueError: Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.

In [10]:
# 0.76951469583 0.0484105192457

#### De forma máa automatizada y compacta

In [11]:
reload(model_selection)

NameError: name 'reload' is not defined

In [12]:
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results)
print("Accuracy: {} {}".format(results.mean(), results.std()) )

NameError: name 'model' is not defined

In [12]:
# Accuracy: 0.7669685577580314 0.03542589693856446

In [7]:
results = model_selection.cross_val_score(model, X_transf, y, cv=kfold)
print(results)
print("Accuracy: {} {}".format(results.mean(), results.std()) )

[ 0.68831169  0.83116883  0.76623377  0.7012987   0.77922078  0.79220779
  0.84415584  0.83116883  0.76315789  0.80263158]
Accuracy: 0.779955570745 0.050088006076


### Usando otra técnica

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

In [9]:
model2 = GradientBoostingClassifier()

In [10]:
results = model_selection.cross_val_score(model2, X, y, cv=kfold)
print(results)
print("Accuracy: {} {}".format(results.mean(), results.std()) )

[ 0.74025974  0.83116883  0.74025974  0.63636364  0.79220779  0.79220779
  0.80519481  0.83116883  0.72368421  0.78947368]
Accuracy: 0.768198906357 0.0564162027509


In [11]:
results = model_selection.cross_val_score(model2, X_transf, y, cv=kfold)
print(results)
print("Accuracy: {} {}".format(results.mean(), results.std()) )

[ 0.74025974  0.83116883  0.72727273  0.63636364  0.79220779  0.79220779
  0.80519481  0.83116883  0.71052632  0.78947368]
Accuracy: 0.765584415584 0.0583081884459


### Cambio de hiperparámetros (ajuste de algoritmos)

In [12]:
model3 = GradientBoostingClassifier(n_estimators=50, max_depth=3, learning_rate=0.05)
results = model_selection.cross_val_score(model3, X_transf, y, cv=kfold)
print(results)
print("Accuracy: {} {}".format(results.mean(), results.std()) )

[ 0.71428571  0.85714286  0.72727273  0.63636364  0.79220779  0.81818182
  0.81818182  0.85714286  0.73684211  0.78947368]
Accuracy: 0.774709501025 0.0666101883031


### Grid Search

In [18]:
from itertools import product

n_estimators_ = [20, 40, 60, 80, 100]
max_depth_ = [2, 3, 5]
learning_rate_ = [0.05, 0.1]

results = []
for ne, md, lr in product(n_estimators_, max_depth_, learning_rate_):
    print(ne, md, lr)
    model_ = GradientBoostingClassifier(n_estimators=ne, max_depth=md, learning_rate=lr)
    result = model_selection.cross_val_score(model3, X_transf, y, cv=kfold)
    result_ = { 'n_estimators': ne, 'max_depth': md, 'learning_rate': lr, 
                'accuracy': result.mean(), 'std': result.std() } 

    results.append(result_)

20 2 0.05


NameError: name 'GradientBoostingClassifier' is not defined

In [19]:
import pandas as pd
df = pd.DataFrame(results, columns=['n_estimators', 'max_depth', 'learning_rate', 'accuracy', 'std'])
df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,n_estimators,max_depth,learning_rate,accuracy,std
