In [6]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics
import pandas as pd
from datetime import datetime
start_time = datetime.now()  # wall-clock start; the elapsed time is printed at the end of the script


# LOAD DATASET
#---------------------------------------------------------------------------------------------
clase_name = 'survived'  # name of the target column to predict
data = pd.read_csv('https://www.dropbox.com/s/cugxdc9mhau4nw1/titanic2.csv?dl=1')

# Feature columns: every column of the CSV except the target.
# list.remove raises ValueError if the target column is not present in the file.
headers = list(data.columns)
headers.remove(clase_name)


# TRAIN / TEST SPLIT
#---------------------------------------------------------------------------------------------
# Each row is assigned to the training set with probability 0.7, so the split
# is roughly (not exactly) 70/30. The draw comes from a local, seeded
# generator so repeated runs evaluate the same partition, and NumPy's global
# random state is left untouched.
split_seed  = 1
m_train     = np.random.RandomState(split_seed).rand(len(data)) < 0.7
data_train  = data.loc[m_train, headers].values
data_test   = data.loc[~m_train, headers].values
clase_train = data.loc[m_train, clase_name].values
clase_test  = data.loc[~m_train, clase_name].values


# CONVERT TO PLAIN NUMPY ARRAYS
# -----------------------------------------------------------------------------------------------
# np.matrix is discouraged by NumPy and rejected by recent scikit-learn
# releases (TypeError asking for np.asarray), and it brings no speed-up.
# np.asarray returns a regular ndarray, without copying when the input
# already is one.
data_train = np.asarray(data_train)
data_test  = np.asarray(data_test)


# MODEL
#---------------------------------------------------------------------------------------------
# Build the random forest and train it on the training split in one
# expression (fit returns the fitted estimator itself).
modelo = RandomForestClassifier(
    n_estimators=666,       # number of trees to grow
    min_samples_split=2,    # minimum number of observations needed to split a node
    min_samples_leaf=1,     # minimum number of observations a leaf may hold
    random_state=1,         # initial seed for the algorithm's randomness
    n_jobs=1,               # parallel jobs; use -1 for all available cores
).fit(data_train, clase_train)


# PREDICTION
#---------------------------------------------------------------------------------------------
# Predicted class for every row of the test split.
prediccion = modelo.predict(X=data_test)


# METRICS
#---------------------------------------------------------------------------------------------
# where:
#   precision = (correct predictions of class_x) / (total cases predicted as class_x)
#   recall    = (correct predictions of class_x) / (total cases that exist in class_x)
#   f1-score  = (2 * precision * recall) / (precision + recall)
#   support   = total cases in each row
reporte = metrics.classification_report(clase_test, prediccion)
print(reporte)

# Cross-tabulation of real classes (rows) against predicted classes (columns).
tabla_cruzada = pd.crosstab(
    index=clase_test,
    columns=prediccion,
    rownames=['REAL'],
    colnames=['PREDICCION'],
)
print(tabla_cruzada)


# VARIABLE IMPORTANCE
#---------------------------------------------------------------------------------------------
# One row per feature column with the importance stored on the fitted model,
# printed in descending order of importance.
importancias = modelo.feature_importances_.tolist()
var_imp = pd.DataFrame({'feature': headers, 'v_importance': importancias})
ordenado = var_imp.sort_values(by='v_importance', ascending=False)
print(ordenado)


# END
#---------------------------------------------------------------------------------------------
# Report the wall-clock time elapsed since start_time was recorded.
end_time = datetime.now()
duracion = end_time - start_time
print('duracion: {}'.format(duracion))

              precision    recall  f1-score   support

           0       0.74      0.85      0.79       123
           1       0.77      0.64      0.70       100

    accuracy                           0.75       223
   macro avg       0.76      0.74      0.75       223
weighted avg       0.76      0.75      0.75       223

PREDICCION    0   1
REAL               
0           104  19
1            36  64
       feature  v_importance
1         male      0.255444
2          age      0.225408
5         fare      0.219182
0       pclass      0.100786
6    cant_flia      0.066993
3        sibsp      0.044071
4        parch      0.035033
7   frecuencia      0.016628
8   embarked_c      0.016555
9   embarked_s      0.013380
10  embarked_q      0.006520
duracion: 0:00:04.476461
