In [1]:
# imprescindible
import pandas as pd
import numpy as np

# to avoid some warnings messages
import warnings
# warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# set seaborn and matplotlib default theme
sns.set_theme()
_sns_plotting_contex_ = sns.plotting_context()
sns.plotting_context('poster')

# set seaborn and matplotlib style to ...
# plt.style.use('classic')
sns.mpl.rcParams['axes.titlesize'] = 18
sns.mpl.rcParams['axes.labelsize'] = 14

# to use HTML codes within IPpython.display function
from IPython.display import HTML

import os


In [2]:

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# to tune hiperparameters
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay

In [3]:
# to keep only 4 digits
ROUND = lambda v : round(v, 4)


In [4]:
def set_figure(row, col, suptitle=None, y=0.98, fontsize='xx-large', fontweight='extra bold') :
    u''' Activate matplot figure setting size and super title
    '''
    
    fig = plt.figure(figsize=(row, col));
    if suptitle != None :
        fig.suptitle(suptitle, y=y, 
                     verticalalignment='center', fontsize=fontsize, fontweight=fontweight);
    return fig


### Data

In [5]:
def read_blogData_train() :
    u''' Reads and prepare data from blog feedback data train set
    
    '''

    data = pd.read_csv("./data/blogData_train.csv", header=None)
    data.drop_duplicates(inplace=True)
    data.reset_index(drop=True, inplace=True)
    
    header = pd.read_csv("./data/blogData_label.csv", header=None)
    header = list(header[0])
    
    if len(header) != data.shape[1] :
        raise Exception('Los encabezados y la cantidad de características NO COINCIDE !!!')

    data.columns = header
    
    return data


In [6]:
# to_classes = lambda v : 0 if v < 30 else (1 if v < 90 else (2 if v < 150 else (3 if v < 210 else 4)))
# to_classes = lambda v : 0 if v < 30 else (1 if v < 90 else 2)
to_classes = lambda v : 0 if v < 30 else 1

---

In [7]:
import pickle
SAVE_MODELS = True

---


In [8]:
class Eval_Estimator :
    u'''
    '''

    def __init__(self, name, estimator, params=None, gs_param_grid=None) :
        self.name = name
        self.estimator = estimator
        self.params = params
        # attributes used in GridSearchCV
        self.gs_param_grid = gs_param_grid
        self.gs_estimator = None

        return        
# ---

In [9]:
def get_classification_model(pkl_file, X_train, y_train) :
    u'''
    '''
# ---
    # pkl_file = './models/XGBoostClassifier.pkl'
# ---
    try :
        with open(pkl_file, 'rb') as pkl_hand :
            model = pickle.load(pkl_hand)
        print('Se cargo el modelo', model.name, 'de', pkl_file, '...')
    except :
        model = Eval_Estimator(
            name='XGBoost Classifier', 
            estimator=xgb.XGBClassifier(), 
            params={
                'eval_metric' : 'auc', # 
                'gamma' : 0, # (min_split_loss) minimum loss reduction
                'learning_rate' : 0.0001, # (eta) step size shrinkage
                'max_delta_step' : 1e6, # extremely imbalanced
                'max_depth' : 20, # maximum depth of tree
                'n_estimators' : 500, 
                'n_jobs' : -1, # use all processors
                'objective' : 'binary:logistic', # for binary classification 
                'random_state' : 127, 
                'verbosity' : 0, 
            }
        )
    # ---
        model.estimator.set_params(**model.params)
        print('Entrenando modelo', model.name, '...')
        # print(model.estimator.get_params())
        model.estimator.fit(X_train, y_train)
    # ---
        if SAVE_MODELS :
            with open(pkl_file, 'wb') as pkl_hand :
                pickle.dump(model, pkl_hand)
    # ---
    return model
# ---
    

In [10]:
# def evaluate_classification_model() :
#     u'''
#     '''
if True :
# ---    
    data_raw = read_blogData_train()
    # test_raw = blogData_test_read()

    X_train = data_raw.iloc[:, 0:280]
    y_train = data_raw.iloc[:, -1].copy()
    y_train = y_train.apply(to_classes)

    # X_test = test_raw.iloc[:, 0:280]
    # y_test = test_raw.iloc[:, -1].copy()

    X_train = X_train.iloc[:, 0:62].copy()
    # X_test = X_test.iloc[:, 0:62].copy()

    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    # X_test = scaler.transform(X_test)

    model = get_classification_model('./models/XGBoostClassifier_min.pkl', X_train, y_train)
# ---

    class Eval_TestCase :
        u'''
        '''
        def __init__(self, case, data) :
            self.case = case
            self.data = data
            self.y_test = None
            self.y_pred = None
            return    
# ---
    filepath = './data/test/'
    filelist = [os.path.join(filepath, filename) for filename in os.listdir(filepath) if os.path.isfile(os.path.join(filepath, filename))]

    caselist = []
    for filename in filelist :
        temp_raw = pd.read_csv(filename, header=None)
        temp_raw.drop_duplicates(inplace=True)
        
        pos = filename.index('2012')
        caselist.append( Eval_TestCase(filename[pos: pos+10], temp_raw) )
# ---
    results = pd.DataFrame(columns=['case', 'count', 'ROC AUC', 'y_test', 'y_pred', 'f1-score'])

    for e in caselist :
        # X = e.data.iloc[:, 0:280]
        X = e.data.iloc[:, 0:62]
        y = e.data.iloc[:, -1]
        y = y.apply(to_classes)
        
        X = scaler.transform(X)
        p = model.estimator.predict(X)
        
        e.y_test = y
        e.y_pred = p
        
        d = classification_report(y_true=y, y_pred=p, output_dict=True)
        
        try :
            roc_auc = ROUND(roc_auc_score(y, p))
            f1 = ROUND(d['1']['f1-score'])
        except :
            rec_auc = -1
            f1 = np.NaN

        results = results.append(
            pd.Series(
                data=[e.case, 
                      y.shape[0], 
                      roc_auc, 
                      y, p, 
                      f1 
                     ], 
                index=results.columns), 
            ignore_index=True
        )
        
    # return results
# ---    


# comment to hide evaluate of regression model
# results = evaluate_classification_model()
display(results[['case', 'count', 'ROC AUC']].sample(5))
display(results[['ROC AUC']].describe().transpose())
display(results[['f1-score']].describe().transpose())

Entrenando modelo XGBoost Classifier ...


Unnamed: 0,case,count,ROC AUC
43,2012.03.15,135,0.8333
22,2012.02.23,122,0.7458
5,2012.02.06,82,0.75
53,2012.03.25,121,1.0
8,2012.02.09,144,0.875


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ROC AUC,60.0,0.742108,0.149739,0.491,0.6551,0.73805,0.842775,1.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
f1-score,59.0,0.541512,0.276659,0.0,0.3818,0.5714,0.697,1.0


In [11]:
def show_roc_curves(results) :
    
    fig = set_figure(row=24, col=40, suptitle='Areas bajo la curva', y=0.99, fontsize='x-large')
    cols = 6
    rows = (results.shape[0] // cols) + 1

    for i in range(results.shape[0]) :
        ax = plt.subplot(rows, cols, i+1)
        y = results.iloc[i]['y_test']
        p = results.iloc[i]['y_pred']

        d = classification_report(y_true=y, y_pred=p, output_dict=True)

        RocCurveDisplay.from_predictions(y_true=y, y_pred=p, name='', ax=ax)
        plt.plot(np.arange(0, 1, step=0.01), np.arange(0, 1, step=0.01), linestyle='-.' )
        try :
            plt.xlabel('precision: {:2.2f} recall: {:2.2f}\n       f1-score: {:2.2f}'.format( d['1']['precision'], d['1']['recall'], d['1']['f1-score'] ))
        except :
            plt.xlabel('')
        plt.ylabel('')
        plt.tight_layout(pad=1.10)
    plt.show()
    return

# show_roc_curves(results)

In [12]:
# roc_auc_limit = 0.5
# mask = results['ROC AUC'] > roc_auc_limit
# print('Hay', results[mask].shape[0], 'Casos con ROC AUC mayor a', roc_auc_limit)
# show_roc_curves(results[mask])

In [13]:
# f1_score_limit = 0.60
# mask = results['f1-score'] > f1_score_limit
# print('Hay', results[mask].shape[0], 'Casos con f1-score mayor a', f1_score_limit)
# show_roc_curves(results[mask])

In [14]:
# roc_auc_limit = 0.5
# mask = results['ROC AUC'] <= roc_auc_limit
# print('Hay ', results[mask].shape[0], 'Casos con ROC AUC menor o igual a', roc_auc_limit)
# show_roc_curves(results[mask])

In [15]:
# fig = set_figure(row=16, col=16, suptitle='Areas bajo la curva', y=0.89, fontsize='x-large')

# cols = 1
# rows = (len(caselist) // cols) + 1
# ax = plt.subplot(1, cols, 1)

# for i, e in enumerate(caselist[:30]) :
#     # ax = plt.subplot(rows, cols, i+1)
#     y = e.y_test
#     p = e.y_pred
#     RocCurveDisplay.from_predictions(y_true=y, y_pred=p, name=None, ax=ax)
#     plt.xlabel('')
#     plt.ylabel('')
    
# plt.show()