In [1]:
!pip install hyperopt
!pip install xgboost

In [333]:
from process_meli import *

In [414]:
import pandas as pd
import numpy as np
from datetime import date
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [337]:
def get_item_data_evaluate(json_dic_i):
    '''
    Esta función sirve para extraer los datos de un diccionario con la información
    de un item. Transforma los Tags en variables dummies y filtra solamente las 
    variables que se van a usar en el modelo
    
    '''
    tag_list = ['ahora-12','brand_verified','cart_eligible','dragged_bids_and_visits',
                'good_quality_picture','good_quality_thumbnail','immediate_payment',
                'incomplete_technical_specs','loyalty_discount_eligible','shipping_guaranteed',
                'poor_quality_picture','poor_quality_thumbnail','catalog_listing_eligible',
                'extended_warranty_eligible','lightning_deal','under_infractions','supermarket_eligible',
                'dragged_visits','deal_of_the_day','catalog_forewarning','only_html_description','hirable']

    seller_list = ['brand','credits_active_borrower','credits_priority_2','credits_priority_4','credits_profile',
                   'developer','eshop','large_seller','medium_seller','messages_as_buyer','messages_as_seller',
                   'mshops','normal','user_info_verified','car_dealer','medium_seller_advanced','credits_priority_3',
                   'credits_priority_1','credits_open_market','ngo','from_facebook','real_estate_agency']

    ship_list = ['fs_removed_by_tagger','fulfillment','mandatory_free_shipping','self_service_in','self_service_out',
                'fbm_in_process','is_flammable','me2_blocked','me2_available', 'fbm_in_progress', 'fbm_me2_frozen',
                'adoption_required']
    
    df = pd.DataFrame()
            

    seller_url = 'https://api.mercadolibre.com/users/' + str(json_dic_i.get('seller_id'))
    api_request_seller = requests.get(seller_url)   
    json_dic_i_seller = json.loads(api_request_seller.content) 
        

    dummies_tag_dic = {i:int(i in json_dic_i.get('tags')) for i in tag_list}

    dummies_seller_dic = {i:int(i in json_dic_i_seller.get('tags')) for i in seller_list}

    dummies_ship_dic = {i:int(i in json_dic_i.get('shipping').get('tags')) for i in ship_list}

    if json_dic_i.get('original_price') is None:
        descuento = 0
    else:
        descuento = (json_dic_i.get('original_price')-json_dic_i.get('original_price'))/json_dic_i.get('original_price')
    
    if json_dic_i.get('condition') is None:
        condition = 'not_specified'
    else:
        condition = json_dic_i.get('condition')
    
    if json_dic_i.get('shipping').get('logistic_type') is None:
        shipping_logistic_type = 'not_specified'
    else:
        shipping_logistic_type = json_dic_i.get('shipping').get('logistic_type')    
        
    dic = {'category_id2': json_dic_i.get('category_id'),
           'price': json_dic_i.get('price'),
           'available_quantity':json_dic_i.get('available_quantity'),
           'sold_quantity':json_dic_i.get('sold_quantity'),
           'buying_mode':json_dic_i.get('buying_mode'),
           'listing_type_id':json_dic_i.get('listing_type_id'),
           'condition' : condition,
           'accepts_mercadopago':int(json_dic_i.get('accepts_mercadopago')),
           'descuento':descuento,
           'free_shipping':int(json_dic_i.get('shipping').get('free_shipping')),
           'shipping_mode':json_dic_i.get('shipping').get('mode'),
           'shipping_logistic_type': shipping_logistic_type,
           'shipping_store_pick_up': int(json_dic_i.get('shipping').get('store_pick_up')),
           'seller_transactions_ratings_negative': json_dic_i_seller.get('seller_reputation').get('transactions').get('ratings').get('negative'),
           'seller_transactions_ratings_neutral':json_dic_i_seller.get('seller_reputation').get('transactions').get('ratings').get('neutral'),
           'seller_transactions_ratings_positive':json_dic_i_seller.get('seller_reputation').get('transactions').get('ratings').get('positive'),
           'seller_transactions_total':json_dic_i_seller.get('seller_reputation').get('transactions').get('total'),
           'seller_level_id':json_dic_i_seller.get('seller_reputation').get('level_id'),
           'seller_power_seller_status':json_dic_i_seller.get('seller_reputation').get('power_seller_status'),
           'seller_transactions_canceled': json_dic_i_seller.get('seller_reputation').get('transactions').get('canceled'),
           'seller_transactions_completed':json_dic_i_seller.get('seller_reputation').get('transactions').get('completed'),
           'seller_years_from_registration':years_between_date_and_today(json_dic_i_seller.get('registration_date'))}	

    tmp = pd.DataFrame({**dic, **dummies_tag_dic, **dummies_seller_dic, **dummies_ship_dic}, index=[json_dic_i.get('id')])
    df = pd.concat([df,tmp])
    return df

In [102]:
def inpute_prices(df, price_by_category):
  '''
  Inputa los valores nulos de la variable de precios con la media de
  por categoría de X_Train
  '''
  for i in range(len(price_by_category)):
    price_i = price_by_category.loc[i,'price']
    df.loc[(df.category_id == price_by_category.loc[i,'category_id'])&(df.price.isnull()), 'price'] = price_i
  
  df.loc[df.seller_power_seller_status.isnull(), 'seller_power_seller_status'] = 'None'
  df.loc[df.seller_level_id.isnull(), 'seller_level_id'] = 'None'

  return df

In [377]:
#Encode X_train
def encode_x_train(x):
    '''
    Transforma las variables categóricas en dummies.
    También crea dos listas con los nombres de las variables dummies para que puedan
    ser usadas para encodear el dataset de testing
    '''
    #Columnas categoricas y string
    columns_to_encode = list(x.select_dtypes(include=['category','object']))
    
    #Dataframe encodeado
    encoded_x_train = pd.get_dummies(x[columns_to_encode])
    
    #Columnas
    encoded_columns = encoded_x_train.columns
    
    #Concatenación
    x_encoded = pd.concat([x.drop(columns_to_encode, axis=1), encoded_x_train], axis=1)
    
    total_columns = x_encoded.columns
    
    return (x_encoded, columns_to_encode, encoded_columns, total_columns)
    
def encode_x_test(x, columns_to_encode, encoded_columns, total_columns):
    '''
    Encodea el dataset de testing con las listas generadas por
    la función "encode_x_train" para asegurar que ambos datasets
    tengan las mismas variables
    '''
    #Transformación de variables categoricas en dummies
    x_dummies = pd.get_dummies(x[columns_to_encode])

    #Nombres de las nuevas variables dummies
    x_test_encoded_columns = x_dummies.columns

    #Variables dummies que se encuentran en el dataset de train pero no de test
    missing_columns = [x for x in encoded_columns if x not in x_test_encoded_columns]

    for columns in missing_columns:
        x_dummies[columns] = 0
    
    
    #Concatenación de los dataset
    x_test = pd.concat([x.drop(columns_to_encode, axis=1), x_dummies], axis=1).fillna(0)
    
    #Eliminación de todas las columnas que no estén en X train
    x_test = x_test.loc[:, x_test.columns.isin(total_columns)]
    
    #Dejar el mismo orden
    x_test = x_test[total_columns]
    
    return x_test

In [340]:
def process_dataframe(dataset_path):
    '''
    Lee un dataframe, lo procesa y lo divide en Training y Testing
    
    '''
    
    #Lectura del dataframe
    df = pd.read_csv(dataset_path)
    
    #Crea una variable con las categorías para hacer un split balanceado
    categories = df.category_id
    
    X = df.drop(['id','category_id2', 'sold_quantity'], axis=1, errors='ignore')
    y = df.sold_quantity    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9, test_size=0.2, stratify = categories)
    
    #Calcula el precio mediano por categoría
    median_by_category = X_train.groupby('category_id').price.median().reset_index()
    
    #Imputa los precios medianos 
    X_train_imputed = inpute_prices(X_train, median_by_category)
    X_test_imputed = inpute_prices(X_test, median_by_category)
    
    #One Hot Encoding
    X_train_encoded, columns_to_encode, encoded_columns, total_columns = encode_x_train(X_train_imputed)
    X_test_encoded = encode_x_test(X_test_imputed, columns_to_encode, encoded_columns, total_columns)
    
    return X_train_encoded, X_test_encoded, y_train, y_test, categories, columns_to_encode, encoded_columns, total_columns

In [412]:
def score_cv(params):
    '''
    Entrena un modelo XGBoost Regressor con Cross-Validation
    '''
    model = xgb.XGBRegressor(**params, silent=True)
    kfold = KFold(n_splits=5, random_state=9, shuffle=True)
    score = -1*cross_val_score(model, X_train_encoded, y_train, cv=kfold,scoring='neg_root_mean_squared_error').mean()
    return {'loss': score, 'status': STATUS_OK}    
    xg
def optimize(trials, space, max_evals):
    '''
    Optimización de hiper parámetros
    '''  
    best = fmin(score_cv, space, algo=tpe.suggest, max_evals=max_evals)
    return best

def get_optimum_parameters(max_evals, parameters_path):
    '''
    Obtiene los hiperparámetros óptimos
    '''
    trials = Trials()
    best_params = optimize(trials, space, max_evals)
    
    #Guarda los parametros con Pickle
    f = open(r"Models/"+parameters_path+".pkl","wb")
    pickle.dump(best_parameters,f)
    f.close()


In [417]:
def load_xgboost_model(parameters_path):
    '''
    Lee los parámetros obtenidos por la función "get_optimum_parameters"
    y devuelve un modelo con esos parámetros
    '''
    pickle_in = open(parameters_path,"rb")
    best_parameters = pickle.load(pickle_in)
    model = xgb.XGBRegressor(**best_parameters, silent=True)
    model.fit(X_train_encoded, y_train)
    
    return model

In [410]:
def score_xgboost(model):
    '''
    Calcula el MAE y RMSE por categoría de un modelo XGBoostRegressor
    '''
    
    reporte_final = pd.DataFrame()
    
    for i in categories.unique():

        indice_train_i = X_train.loc[X_train.category_id==i].index

        indice_test_i = X_test.loc[X_test.category_id==i].index

        y_train_predict_i = model.predict(X_train_encoded.loc[indice_train_i, :])

        y_test_predict_i  = model.predict(X_test_encoded.loc[indice_test_i, :])

        y_train_i = y_train.loc[indice_train_i]

        y_test_i = y_test.loc[indice_test_i]

        rmse_train_i = np.sqrt(mean_squared_error(y_train_predict_i, y_train_i))

        rmse_test_i  = np.sqrt(mean_squared_error(y_test_predict_i, y_test_i))

        mae_train_i  = mean_absolute_error(y_train_predict_i, y_train_i)

        mae_test_i   = mean_absolute_error(y_test_predict_i, y_test_i)

        tmp = pd.DataFrame({'category_id': i, 'modelo': 'XGBoostRegressor',
                            'rmse_train': rmse_train_i, 'rmse_test': rmse_test_i,
                            'mae_train' : mae_train_i, 'mae_test' : mae_test_i}, index=[i])

        reporte_final = pd.concat([reporte_final, tmp], axis=0)
    
    return model, reporte_final

In [402]:
def evaluate_item(item):
    '''
    Obtiene la predicción de Sold Quantity para un Item e
    '''
    URL = "https://api.mercadolibre.com/items/" + item
    request = requests.get(URL) 
    json_dic_i = json.loads(request.content)

    category_url = "https://api.mercadolibre.com/categories/"+ json_dic_i.get('category_id')
    request_category = requests.get(category_url) 
    json_dic_category = json.loads(request_category.content)

    tmp = get_item_data_evaluate(json_dic_i)
    tmp['category_id'] = json_dic_category.get('path_from_root')[0].get('id')

    X = tmp.drop(['id','category_id2', 'sold_quantity'], axis=1, errors='ignore')
    y = tmp.sold_quantity  

    item_imputed = inpute_prices(X, median_by_category)
    item_encoded = encode_x_test(item_imputed, columns_to_encode, encoded_columns, total_columns)

    y_pred = model.predict(item_encoded)

    rmse  = np.sqrt(mean_squared_error(y_pred, y))

    mae   = mean_absolute_error(y_pred, y)

    results = {'item': item, 'y': y[0], 'y_pred': y_pred[0], 'rmse': rmse, 'mae': mae}

    return results

## Calculo de Hiperparámetros Óptimos

In [421]:
X_train_encoded, X_test_encoded, y_train, y_test, categories, columns_to_encode, encoded_columns, total_columns = process_dataframe("Data/dataset.csv")

space = {
        'max_depth':hp.choice('max_depth', np.arange(5, 30, 1, dtype=int)),
        'n_estimators':hp.choice('n_estimators', np.arange(10, 200, 10, dtype=int)),
        'colsample_bytree':hp.quniform('colsample_bytree', 0.5, 1.0, 0.1),
        'min_child_weight':hp.choice('min_child_weight', np.arange(5, 55, 6, dtype=int)),
        'subsample':hp.quniform('subsample', 0.7, 0.9, 0.1),
        'eta':hp.quniform('eta', 0.1, 0.3, 0.1),       
        'objective':'reg:squarederror',         
        'eval_metric': 'rmse',
        'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.40, 0.05)),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'n_estimators': hp.choice('n_estimators', np.arange(10, 200, 10, dtype=int)),
        'seed': 9,
    }

get_optimum_parameters(1, "XGBoost_parameters_prueba")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

## Evaluación de performance del modelo

In [420]:
model = load_xgboost_model("Models/XGBoost_parameters.pkl")
reporte_final2 = score_xgboost(model)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




## Evaluación de Item en particular

In [None]:
item = "MLA842101865"
evaluate_item(item)