In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

print(__doc__)

Automatically created module for IPython interactive environment


In [2]:
# list of (estimator, param_grid), where param_grid is used in GridSearchCV
classifiers = [
    (LogisticRegression(solver='liblinear', random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'),
        LogisticRegression(solver='liblinear', random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 5),
            'logisticregression__C': np.logspace(-2, 7, 10),
        }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 5),
            'linearsvc__C': np.logspace(-2, 7, 10),
        }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0, gamma='scale'), {
        'C': np.logspace(-2, 7, 10)
    }),
]

In [3]:
def get_name(estimator):
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = ' + '.join(name)
    return name

In [4]:
# De la lista de tuplas Classifiers toma los elementos por separado
# e estiamtor y g gridParameter
names = [get_name(e) for e, g in classifiers]
print (names)

['LogisticRegression', 'LinearSVC', 'KBinsDiscretizer + LogisticRegression', 'KBinsDiscretizer + LinearSVC', 'GradientBoostingClassifier', 'SVC']


In [5]:
import pandas as pd
import os
import numpy as np
# set the path of the processed data
processed_data_path = os.path.join(os.path.pardir,'data','processed')
train_file_path = os.path.join(processed_data_path, 'train.csv')
test_file_path = os.path.join(processed_data_path, 'test.csv')
train_df = pd.read_csv(train_file_path, index_col='PassengerId')
test_df = pd.read_csv(test_file_path, index_col='PassengerId')
X = train_df.loc[:,'Age':].values.astype('float')
# uso ravel para convertir el dataframe en un arrego 1D 
# podria haber usado train_df['Survived'].values y obtenia el mismo resultado
# si en lugar de tener un dataset hubiese tenido un arreglo no hubiese tenido la propiedad values

y = train_df['Survived'].ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)
print ('mean survival in train : {0:.3f}'.format(np.mean(y_train)))
print ('mean survival in test : {0:.3f}'.format(np.mean(y_test)))

(712, 32) (712,)
(179, 32) (179,)
mean survival in train : 0.383
mean survival in test : 0.385


In [13]:
# iterate over classifiers
estimador = GradientBoostingClassifier(random_state=0)
estimador.get_params().keys()
parameters = {'learning_rate': np.logspace(-4, 0, 10) ,'n_estimators' : np.arange(40,60),'max_depth':np.arange(2,10)}
clf = GridSearchCV(estimator=estimador, param_grid=parameters, cv=5, iid=False)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print('%s: %.2f' % (estimator.__class__.__name__, score))
    

SVC: 0.79


In [17]:
score = clf.score(X_test, y_test)
print('%s: %.2f' % (estimador.__class__.__name__, score))


GradientBoostingClassifier: 0.79


In [22]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=0)
estimador.get_params().keys()
#parameters = {'learning_rate': np.logspace(-4, 0, 10) ,'n_estimators' : np.arange(40,60),'max_depth':np.arange(2,10)}
#clf = GridSearchCV(estimator=estimador, param_grid=parameters, cv=5, iid=False)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print('%s: %.2f' % (estimador.__class__.__name__, score))

MLPClassifier: 0.61
