In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer, confusion_matrix, recall_score, precision_score
from sklearn.svm import SVC
from scipy.stats import randint
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler
import pickle

import os

In [2]:
os.chdir(os.path.dirname(os.getcwd()))

In [3]:
df = pd.read_csv("data/train.csv").reset_index(drop=True)

In [4]:
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver', 'Descargas', 'Descargas2', 'Categoria', 'tamaño',
       'tamaño2', 'Gratuito', 'tipo_contenido', 'Genero1',
       'dias_sin_actualizar', 'meses_sin_actualizar', 'grupomeses',
       'version_actual', 'grupoversiones', 'version_android', 'Genero2',
       'media_inst_grp_ver', 'mediana_inst_grp_ver', 'med_versandroid'],
      dtype='object')

In [5]:
X = df[["Reviews",'Rating','Categoria', 'tamaño',
       'tamaño2', 'Gratuito', 'tipo_contenido', 'Genero1',
       'dias_sin_actualizar', 'meses_sin_actualizar', 'grupomeses',
       'version_actual', 'grupoversiones', 'version_android', 'Genero2',
       'media_inst_grp_ver', 'mediana_inst_grp_ver', 'med_versandroid']]

y = df['Descargas2']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [7]:
final_test = pd.read_csv("data/test.csv")
final_test_x = final_test[["Reviews",'Rating','Categoria', 'tamaño',
       'tamaño2', 'Gratuito', 'tipo_contenido', 'Genero1',
       'dias_sin_actualizar', 'meses_sin_actualizar', 'grupomeses',
       'version_actual', 'grupoversiones', 'version_android', 'Genero2',
       'media_inst_grp_ver', 'mediana_inst_grp_ver', 'med_versandroid']]

final_test_y = final_test['Descargas2']

In [8]:
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)


In [9]:
model = RandomForestClassifier()
model.fit(X_train,y_train)
predictions = model.predict(X_test)
print(model.score(X_train,y_train))
print("Accuracy Score:",accuracy_score(y_test, predictions))
print("Precision Score:",precision_score(y_test,predictions, average="macro"))
print("Recall Score:",recall_score(y_test, predictions, average="macro"))
print(confusion_matrix(y_test, predictions))

1.0
Accuracy Score: 0.759493670886076
Precision Score: 0.7681378141154274
Recall Score: 0.7565756841984658
[[154  40   0   0   0   0]
 [ 23 129  29   0   0   0]
 [  0  32 178  45   0   0]
 [  0   0  30 175  49   0]
 [  0   0   0  36 265  16]
 [  0   0   0   0  42 179]]


In [10]:
model2 = RandomForestClassifier()
model2.fit(X_train_resampled,y_train_resampled)
predictions2 = model2.predict(X_test)
print(model2.score(X_train_resampled,y_train_resampled))
print("Accuracy Score:",accuracy_score(y_test, predictions2))
print("Precision Score:",precision_score(y_test,predictions2, average="macro"))
print("Recall Score:",recall_score(y_test, predictions2, average="macro"))
print(confusion_matrix(y_test, predictions2))

1.0
Accuracy Score: 0.7609001406469761
Precision Score: 0.7680414867849362
Recall Score: 0.7641774259478585
[[156  38   0   0   0   0]
 [ 20 141  20   0   0   0]
 [  0  44 165  46   0   0]
 [  0   0  34 186  34   0]
 [  0   0   0  42 249  26]
 [  0   0   0   0  36 185]]


In [11]:
predictions2 = model2.predict(final_test_x)
print("Accuracy Score:",accuracy_score(final_test_y, predictions2))
print("Precision Score:",precision_score(final_test_y,predictions2, average="macro"))
print("Recall Score:",recall_score(final_test_y, predictions2, average="macro"))
print(confusion_matrix(final_test_y, predictions2))

Accuracy Score: 0.7677165354330708
Precision Score: 0.7701136855864864
Recall Score: 0.7705144199079595
[[216  39   1   0   0   0]
 [ 15 165  23   2   0   0]
 [  1  38 163  47   0   0]
 [  1   0  47 252  42   0]
 [  0   0   2  61 336  45]
 [  0   0   0   0  49 233]]


In [11]:
y_test

3298    3.0
5027    5.0
2585    4.0
3513    1.0
2338    0.0
       ... 
443     1.0
2805    1.0
1544    5.0
3282    3.0
6829    2.0
Name: Descargas2, Length: 1422, dtype: float64

In [12]:
pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_}).sort_values(by="Importance", ascending=False)

Unnamed: 0,Feature,Importance
0,Reviews,0.451191
1,Rating,0.117069
8,dias_sin_actualizar,0.062551
3,tamaño,0.059453
14,Genero2,0.040879
7,Genero1,0.039862
13,version_android,0.034528
9,meses_sin_actualizar,0.034359
2,Categoria,0.034114
4,tamaño2,0.020915


# RandomForest

#### Con Train Completo, dejamos PCA

In [13]:
pipe = Pipeline(steps=[("scaler", StandardScaler()),
                       ('pca', PCA(n_components=12)),
                       ('classifier', RandomForestClassifier())
])

rf_params = {
    "pca__n_components": randint(6,12),
    'scaler': [StandardScaler(), MinMaxScaler()],
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': randint(100, 1000),
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__max_depth': [None] + list(randint(3, 50).rvs(10)),
    'classifier__min_samples_split': randint(2, 21),
    'classifier__min_samples_leaf': randint(1, 21),
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_leaf_nodes': [None] + list(randint(2, 50).rvs(10)),
    'classifier__min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__random_state': [42],
    'classifier__warm_start': [True, False],
    'classifier__oob_score': [True, False]
}

search_space = [
    rf_params  ]

random_search_rf = RandomizedSearchCV(pipe,
                           search_space,
                           n_iter=500,
                           scoring='f1_macro', 
                           refit='f1_macro',
                           cv=5,
                           n_jobs=-1,
                           verbose=1,
                           random_state=42)


random_search_rf.fit(X_train, y_train)
print(random_search_rf.best_params_)
best_model_rf = random_search_rf.best_estimator_
predictions_rf = best_model_rf.predict(X_test)

print(best_model_rf.score(X_train, y_train))
print(accuracy_score(y_test, predictions_rf))
print(precision_score(y_test,predictions_rf, average="macro"))
print(recall_score(y_test, predictions_rf, average="macro"))
print(confusion_matrix(y_test, predictions_rf))

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
{'classifier': RandomForestClassifier(), 'classifier__class_weight': None, 'classifier__criterion': 'gini', 'classifier__max_depth': 28, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 17, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 963, 'classifier__oob_score': False, 'classifier__random_state': 42, 'classifier__warm_start': True, 'pca__n_components': 11, 'scaler': StandardScaler()}
0.6325597749648383
0.4338959212376934
0.4664213427118003
0.42617144093162773
[[116  32  12  13  20   1]
 [ 16  47  56  28  32   2]
 [  5  44 106  45  49   6]
 [  8  13  69  45 110   9]
 [  4  11  46  33 193  30]
 [  0   0  11   8  92 110]]


#### Con el RandomUnderSampler, dejamos PCA

In [14]:
pipe = Pipeline(steps=[("scaler", StandardScaler()),
                       ('pca', PCA(n_components=12)),
                       ('classifier', RandomForestClassifier())
])

rf_params = {
    "pca__n_components": randint(6,12),
    'scaler': [StandardScaler(), MinMaxScaler()],
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': randint(100, 1000),
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__max_depth': [None] + list(randint(3, 50).rvs(10)),
    'classifier__min_samples_split': randint(2, 21),
    'classifier__min_samples_leaf': randint(1, 21),
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_leaf_nodes': [None] + list(randint(2, 50).rvs(10)),
    'classifier__min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__random_state': [42],
    'classifier__warm_start': [True, False],
    'classifier__oob_score': [True, False]
}

search_space = [
    rf_params  ]

random_search_rf2 = RandomizedSearchCV(pipe,
                           search_space,
                           n_iter=100,
                           scoring='f1_macro', 
                           refit='f1_macro',
                           cv=5,
                           n_jobs=-1,
                           verbose=1,
                           random_state=42)


random_search_rf2.fit(X_train_resampled, y_train_resampled)
print(random_search_rf2.best_params_)
best_model_rf2 = random_search_rf2.best_estimator_
predictions_rf2 = best_model_rf2.predict(X_test)

print(best_model_rf2.score(X_train_resampled, y_train_resampled))
print(accuracy_score(y_test, predictions_rf2))
print(precision_score(y_test,predictions_rf2, average="macro"))
print(recall_score(y_test, predictions_rf2, average="macro"))
print(confusion_matrix(y_test, predictions_rf2))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  warn(


{'classifier': RandomForestClassifier(), 'classifier__class_weight': 'balanced', 'classifier__criterion': 'gini', 'classifier__max_depth': 15, 'classifier__max_features': None, 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 14, 'classifier__n_estimators': 831, 'classifier__oob_score': False, 'classifier__random_state': 42, 'classifier__warm_start': True, 'pca__n_components': 10, 'scaler': MinMaxScaler()}
0.8475745438362261
0.4050632911392405
0.43090147081951885
0.41398878221847824
[[120  38  11  12  10   3]
 [ 23  56  48  29  20   5]
 [  5  65  87  63  24  11]
 [  4  36  69  77  50  18]
 [  2  20  43  69 114  69]
 [  0   5  17  21  56 122]]


In [15]:
best_model_rf2.fit(X,y)

  warn(
  warn(


In [28]:
best_model_rf2.fit(X,y)
predictions_rf2_final = best_model_rf2.predict(final_test_x)
print(best_model_rf2.score(X, y))
print(accuracy_score(final_test_y, predictions_rf2_final))
print(precision_score(final_test_y,predictions_rf2_final, average="macro"))
print(recall_score(final_test_y, predictions_rf2_final, average="macro"))
print(confusion_matrix(final_test_y, predictions_rf2_final))

  warn(
  warn(


0.30253164556962026
0.2013498312710911
0.24449891909402446
0.21168308662074983
[[ 31  99  74  36  14   2]
 [ 26  77  62  27  12   1]
 [ 31  87  67  37  21   6]
 [ 57  62  95  64  55   9]
 [109  51  97  80  81  26]
 [ 98  11  37  40  58  38]]


# XGB

In [17]:
pipe = Pipeline(steps=[("scaler", StandardScaler()),
                       ('pca', PCA(n_components=12)),
                       ('classifier', RandomForestClassifier())
])

xgb_params = {
    "pca__n_components": randint(6,12),
    "scaler": [StandardScaler(), MinMaxScaler()],
    'classifier': [XGBClassifier()],
    'classifier__n_estimators': randint(100, 1000),
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth': [None] + list(randint(3, 50).rvs(10)),
    'classifier__min_samples_split': randint(2, 21),
    'classifier__min_samples_leaf': randint(1, 21),
    'classifier__bootstrap': [True, False],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_leaf_nodes': [None] + list(randint(2, 50).rvs(10)),
    'classifier__min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__random_state': [42],
    'classifier__warm_start': [True, False],
    'classifier__oob_score': [True, False]
}

search_space = [
    xgb_params  ]

random_search_xgb = RandomizedSearchCV(pipe,
                           search_space,
                           n_iter=500,
                           scoring='f1_macro', 
                           refit='f1_macro',
                           cv=5,
                           n_jobs=-1,
                           verbose=1,
                           random_state=42)


random_search_xgb.fit(X_train, y_train)
print(random_search_xgb.best_params_)
best_model_xgb = random_search_xgb.best_estimator_
predictions_xgb = best_model_xgb.predict(X_test)

print(best_model_xgb.score(X_train, y_train))
print(accuracy_score(y_test, predictions_xgb))
print(precision_score(y_test,predictions_xgb, average="macro"))
print(recall_score(y_test, predictions_xgb, average="macro"))
print(confusion_matrix(y_test, predictions_xgb))

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


Parameters: { "bootstrap", "class_weight", "criterion", "max_features", "max_leaf_nodes", "min_impurity_decrease", "min_samples_leaf", "min_samples_split", "oob_score", "warm_start" } are not used.



{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...), 'classifier__bootstrap': True, 'classifier__class_weight': 'balanced', 'classifier__criterion': 'gini', 'classifier__max_depth': 3, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': 14, 'classifier__min_impurit

In [18]:
best_model_xgb.fit(X,y)
predictions_xgb_final = best_model_xgb.predict(final_test_x)
print(best_model_xgb.score(X, y))
print(accuracy_score(final_test_y, predictions_xgb_final))
print(precision_score(final_test_y,predictions_xgb_final, average="macro"))
print(recall_score(final_test_y, predictions_xgb_final, average="macro"))
print(confusion_matrix(final_test_y, predictions_xgb_final))

Parameters: { "bootstrap", "class_weight", "criterion", "max_features", "max_leaf_nodes", "min_impurity_decrease", "min_samples_leaf", "min_samples_split", "oob_score", "warm_start" } are not used.



0.6974683544303798
0.43250843644544434
0.44908000977977336
0.4249663903120731
[[164  27  30  13  20   2]
 [ 15  46  63  36  40   5]
 [ 10  43  88  36  67   5]
 [  7  32  76  58 152  17]
 [  1  17  60  71 234  61]
 [  0   1  13  13  76 179]]


In [9]:
pipe = Pipeline(steps=[("scaler", StandardScaler()),
                       ('pca', PCA(n_components=12)),
                       ('classifier', RandomForestClassifier())
])

xgb_params = {
    "pca__n_components": randint(6,12),
    "scaler": [StandardScaler(), MinMaxScaler()],
    'classifier': [XGBClassifier()],
    'classifier__n_estimators': randint(100, 1000),
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth': [None] + list(randint(3, 50).rvs(10)),
    'classifier__min_samples_split': randint(7, 50),
    'classifier__min_samples_leaf': randint(7, 50),
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_leaf_nodes': [None] + list(randint(2, 50).rvs(10)),
}

search_space = [
    xgb_params  ]

random_search_xgb2 = RandomizedSearchCV(pipe,
                           search_space,
                           n_iter=100,
                           scoring='f1_macro', 
                           refit='f1_macro',
                           cv=5,
                           n_jobs=-1,
                           verbose=1,
                           random_state=42)


random_search_xgb2.fit(X_train_resampled, y_train_resampled)
print(random_search_xgb2.best_params_)
best_model_xgb2 = random_search_xgb2.best_estimator_
predictions_xgb2 = best_model_xgb2.predict(X_test)

print(best_model_xgb2.score(X_train_resampled, y_train_resampled))
print(accuracy_score(y_test, predictions_xgb2))
print(precision_score(y_test,predictions_xgb2, average="macro"))
print(recall_score(y_test, predictions_xgb2, average="macro"))
print(confusion_matrix(y_test, predictions_xgb2))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Parameters: { "criterion", "max_features", "max_leaf_nodes", "min_samples_leaf", "min_samples_split" } are not used.



{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...), 'classifier__criterion': 'entropy', 'classifier__max_depth': 9, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': 13, 'classifier__min_samples_leaf': 26, 'classifier__min_samples_split': 41, 'classifier__n_esti

In [20]:
best_model_xgb2.fit(X,y)
predictions_xgb2_final = best_model_xgb2.predict(final_test_x)
print(best_model_xgb2.score(X, y))
print(accuracy_score(final_test_y, predictions_xgb2_final))
print(precision_score(final_test_y,predictions_xgb2_final, average="macro"))
print(recall_score(final_test_y, predictions_xgb2_final, average="macro"))
print(confusion_matrix(final_test_y, predictions_xgb2_final))

Parameters: { "bootstrap", "criterion", "max_features", "max_leaf_nodes", "min_impurity_decrease", "min_samples_leaf", "min_samples_split", "oob_score", "warm_start" } are not used.



0.99971870604782
0.42913385826771655
0.4612785444249807
0.4215949811088553
[[160  32  30  17  17   0]
 [ 18  57  51  35  42   2]
 [  8  58  68  51  61   3]
 [  7  42  66  77 138  12]
 [  5  24  66  94 227  28]
 [  0   2  14  20  72 174]]


In [21]:
'''import pickle

filename = 'models/xgb_+rating_MS099_ACC39_PS41_RS40_.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best_model_xgb2, archivo_salida)'''

"import pickle\n\nfilename = 'models/xgb_+rating_MS099_ACC39_PS41_RS40_.pkl'\n\nwith open(filename, 'wb') as archivo_salida:\n    pickle.dump(best_model_xgb2, archivo_salida)"

# Pipeline con varios modelos para ver si mejoramos el 0.4

In [22]:
pipe_multi = Pipeline(steps=[("scaler", StandardScaler()),
                             ('pca', PCA(n_components=12)),
                             ('classifier', RandomForestClassifier())
])

xgb_params = {
    "pca__n_components": randint(6,12),
    "scaler": [StandardScaler(), MinMaxScaler()],
    'classifier': [XGBClassifier()],
    'classifier__n_estimators': randint(50, 500),
    "classifier__learning_rate": [0.001, 0.01, 0.1, 0.5, 1.0],
    "classifier__max_depth": [3, 4, 5, 6, 7],
    "classifier__min_child_weight": randint(1, 11),
    "classifier__subsample": [0.5, 0.7, 0.9, 1.0],
    "classifier__colsample_bytree": [0.5, 0.7, 0.9, 1.0]
    
}

knn_params = {
    "pca__n_components": randint(6,12),
    "scaler": [StandardScaler(), MinMaxScaler()],
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors':randint(1, 20),
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]
}


gb_params = {
    "pca__n_components": randint(6,12),
    "scaler": [StandardScaler(), MinMaxScaler()],
    'classifier': [GradientBoostingClassifier()],
    'classifier__n_estimators': randint(50, 500),
    'classifier__learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0],
    'classifier__max_depth': [3, 4, 5, 6, 7],
    'classifier__min_samples_split': randint(2, 11),
    'classifier__min_samples_leaf': randint(1, 11),
    'classifier__subsample': [0.5, 0.7, 0.9, 1.0]
}

search_space_multi = [
    xgb_params,
    knn_params,
    gb_params  ]


rsm_multi = RandomizedSearchCV(pipe_multi,
                           search_space_multi,
                           n_iter=200,
                           scoring='f1_macro', 
                           refit='f1_macro',
                           cv=5,
                           n_jobs=-1,
                           verbose=1,
                           random_state=42)

rsm_multi.fit(X_train, y_train)
print(rsm_multi.best_params_)
best_model_multi = rsm_multi.best_estimator_
predictions_multi = best_model_multi.predict(X_test)

print(best_model_multi.score(X_train, y_train))
print(accuracy_score(y_test, predictions_multi))
print(precision_score(y_test, predictions_multi, average="macro"))
print(recall_score(y_test, predictions_multi, average="macro"))
print(confusion_matrix(y_test, predictions_multi))

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
{'classifier': GradientBoostingClassifier(), 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 7, 'classifier__n_estimators': 363, 'classifier__subsample': 0.7, 'pca__n_components': 11, 'scaler': StandardScaler()}
0.8570675105485233
0.4381153305203938
0.46293031833654363
0.43914718741762937
[[122  27  14  13  17   1]
 [ 26  55  36  35  28   1]
 [ 15  48  83  60  44   5]
 [  9  24  64  56  96   5]
 [  5  16  47  54 170  25]
 [  0   1   8  21  54 137]]


In [23]:
best_model_multi.fit(X,y)
predictions_multi_final = best_model_multi.predict(final_test_x)
print(best_model_multi.score(X, y))
print(accuracy_score(final_test_y, predictions_multi_final))
print(precision_score(final_test_y,predictions_multi_final, average="macro"))
print(recall_score(final_test_y, predictions_multi_final, average="macro"))
print(confusion_matrix(final_test_y, predictions_multi_final))

0.8174402250351618
0.43194600674915634
0.47127741154163455
0.4213731320662921
[[158  37  27  15  19   0]
 [ 18  55  58  37  37   0]
 [  8  47  78  49  67   0]
 [  8  30  83  64 151   6]
 [  4  18  56  92 249  25]
 [  0   2  15  19  82 164]]


In [24]:
pipe_multi = Pipeline(steps=[("scaler", StandardScaler()),
                             ('pca', PCA(n_components=12)),
                             ('classifier', RandomForestClassifier())
])

xgb_params = {
    "pca__n_components": randint(6,12),
    "scaler": [StandardScaler(), MinMaxScaler()],
    'classifier': [XGBClassifier()],
    'classifier__n_estimators': randint(50, 500),
    "classifier__learning_rate": [0.001, 0.01, 0.1, 0.5, 1.0],
    "classifier__max_depth": [3, 4, 5, 6, 7],
    "classifier__min_child_weight": randint(1, 11),
    "classifier__subsample": [0.5, 0.7, 0.9, 1.0],
    "classifier__colsample_bytree": [0.5, 0.7, 0.9, 1.0]
    
}

knn_params = {
    "pca__n_components": randint(6,12),
    "scaler": [StandardScaler(), MinMaxScaler()],
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors':randint(1, 20),
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]
}


gb_params = {
    "pca__n_components": randint(6,12),
    "scaler": [StandardScaler(), MinMaxScaler()],
    'classifier': [GradientBoostingClassifier()],
    'classifier__n_estimators': randint(50, 500),
    'classifier__learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0],
    'classifier__max_depth': [3, 4, 5, 6, 7],
    'classifier__min_samples_split': randint(2, 11),
    'classifier__min_samples_leaf': randint(1, 11),
    'classifier__subsample': [0.5, 0.7, 0.9, 1.0]
}

search_space_multi2 = [
    xgb_params,
    knn_params,
    gb_params  ]


rsm_multi2 = RandomizedSearchCV(pipe_multi,
                           search_space_multi2,
                           n_iter=200,
                           scoring='f1_macro', 
                           refit='f1_macro',
                           cv=5,
                           n_jobs=-1,
                           verbose=1,
                           random_state=42)

rsm_multi2.fit(X_train_resampled, y_train_resampled)
print(rsm_multi2.best_params_)
best_model_multi2 = rsm_multi2.best_estimator_
predictions_multi2 = best_model_multi2.predict(X_test)

print(best_model_multi2.score(X_train_resampled, y_train_resampled))
print(accuracy_score(y_test, predictions_multi2))
print(precision_score(y_test, predictions_multi2, average="macro"))
print(recall_score(y_test, predictions_multi2, average="macro"))
print(confusion_matrix(y_test, predictions_multi2))

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
{'classifier': GradientBoostingClassifier(), 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 7, 'classifier__n_estimators': 363, 'classifier__subsample': 0.7, 'pca__n_components': 11, 'scaler': StandardScaler()}
0.9080996884735203
0.42545710267229253
0.44908430717208825
0.4339740358141601
[[120  38  16   9  10   1]
 [ 19  60  44  38  18   2]
 [ 11  71  90  38  33  12]
 [ 14  36  75  65  57   7]
 [  3  33  48  62 129  42]
 [  0   5  13  20  42 141]]


In [25]:
'''import pickle

filename = 'models/xgb_+rating_MS078_ACC42_PS44_RS43_.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best_model_multi2, archivo_salida)'''

"import pickle\n\nfilename = 'models/xgb_+rating_MS078_ACC42_PS44_RS43_.pkl'\n\nwith open(filename, 'wb') as archivo_salida:\n    pickle.dump(best_model_multi2, archivo_salida)"

In [26]:
'''{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...), 'classifier__colsample_bytree': 0.9, 
              'classifier__learning_rate': 0.01, 'classifier__max_depth': 7, 'classifier__min_child_weight': 1,
                'classifier__n_estimators': 459, 'classifier__subsample': 0.5, 'pca__n_components': 9, 'scaler': MinMaxScaler()}'''

"{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,\n              colsample_bylevel=None, colsample_bynode=None,\n              colsample_bytree=None, device=None, early_stopping_rounds=None,\n              enable_categorical=False, eval_metric=None, feature_types=None,\n              gamma=None, grow_policy=None, importance_type=None,\n              interaction_constraints=None, learning_rate=None, max_bin=None,\n              max_cat_threshold=None, max_cat_to_onehot=None,\n              max_delta_step=None, max_depth=None, max_leaves=None,\n              min_child_weight=None, missing=nan, monotone_constraints=None,\n              multi_strategy=None, n_estimators=None, n_jobs=None,\n              num_parallel_tree=None, random_state=None, ...), 'classifier__colsample_bytree': 0.9, \n              'classifier__learning_rate': 0.01, 'classifier__max_depth': 7, 'classifier__min_child_weight': 1,\n                'classifier__n_estimators': 459, 'classifier__s

In [27]:
best_model_multi2.fit(X,y)
predictions_multi_final2 = best_model_multi2.predict(final_test_x)
print(best_model_multi2.score(X, y))
print(accuracy_score(final_test_y, predictions_multi_final2))
print(precision_score(final_test_y,predictions_multi_final2, average="macro"))
print(recall_score(final_test_y, predictions_multi_final2, average="macro"))
print(confusion_matrix(final_test_y, predictions_multi_final2))

0.8205344585091421
0.4431946006749156
0.4782091299221034
0.4319264299701994
[[163  33  27  15  18   0]
 [ 18  50  63  36  38   0]
 [  6  41  80  60  60   2]
 [  7  35  74  72 147   7]
 [  1  14  49 105 248  27]
 [  1   1  15  11  79 175]]
