In [1]:
# essentials
import pandas as pd
import numpy as np

# suppress warning messages
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# set seaborn and matplotlib default theme
sns.set_theme()
# remember the plotting context that set_theme() installed
_sns_plotting_contex_ = sns.plotting_context()
# sns.plotting_context('poster') only *returns* the parameter dict (it is meant
# to be used in a `with` block); set_context() is the call that applies it globally
sns.set_context('poster')

# override title / axis-label font sizes on top of the selected context
# plt.style.use('classic')
sns.mpl.rcParams['axes.titlesize'] = 18
sns.mpl.rcParams['axes.labelsize'] = 14

# to render raw HTML through IPython.display
from IPython.display import HTML

import os


In [2]:
def set_figure(row, col, suptitle=None):
    """Create a matplotlib figure with a given size and an optional super title.

    Parameters
    ----------
    row, col : float
        Forwarded to ``plt.figure`` as ``figsize=(row, col)``. matplotlib
        interprets that tuple as (width, height) in inches, so despite the
        names ``row`` is the figure width and ``col`` its height.
    suptitle : str, optional
        Text for the figure-level title; no title is added when ``None``.

    Returns
    -------
    The newly created figure.
    """
    fig = plt.figure(figsize=(row, col))
    # identity test: only a missing title (None) skips the suptitle call
    if suptitle is not None:
        fig.suptitle(
            suptitle,
            verticalalignment='center',
            fontsize='xx-large',
            fontweight='extra bold',
        )
    return fig

In [3]:

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

### Data

In [4]:
# load the headerless training CSV and discard exact duplicate rows
# (the original row labels are kept; later cells index by position)
data_raw = (
    pd.read_csv("./data/blogData_train.csv", header=None)
    .drop_duplicates()
)

In [5]:
# sanity check: (rows, columns) of the training frame after duplicate removal
data_raw.shape

(49203, 281)

In [6]:
# alternative multi-class binnings (disabled):
# to_classes = lambda v : 0 if v < 30 else (1 if v < 90 else (2 if v < 150 else (3 if v < 210 else 4)))
# to_classes = lambda v : 0 if v < 30 else (1 if v < 90 else 2)
def to_classes(v, threshold=30):
    """Map a numeric target value to a binary class label.

    Parameters
    ----------
    v : number
        Raw target value.
    threshold : number, default 30
        Cut-off between the two classes.

    Returns
    -------
    int
        0 when ``v < threshold``, otherwise 1. NaN compares false against
        the threshold and therefore maps to 1.
    """
    return 0 if v < threshold else 1

In [7]:
# first 280 columns are the features; the last column is the raw target,
# which is turned into class labels via to_classes
X_train = data_raw.iloc[:, :280]
y_train = data_raw.iloc[:, -1].apply(to_classes)

In [8]:
# standardise the training features; the fitted scaler is reused later for the test set.
# NOTE: X_train is overwritten with its scaled version, so re-running this cell
# on its own would refit the scaler on already-scaled data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [9]:
# RMSE = lambda v: round(np.sqrt(v), 4)

class Model:
    """Pair a display name with an estimator object.

    Attributes
    ----------
    name : label used when reporting results for this entry
    model : the wrapped estimator instance
    """

    def __init__(self, name, model):
        self.name, self.model = name, model


In [10]:

# A baseline entry named 'XGB Classifier default *' (XGBClassifier with only
# eval_metric, n_jobs, objective and random_state set) is currently disabled.

# keyword arguments shared by every XGBoost variant below
_XGB_COMMON = dict(
    eval_metric='auc',
    learning_rate=0.1,            # (eta) step size shrinkage
    max_depth=6,                  # maximum depth of tree
    n_jobs=-1,                    # use all processors
    objective='binary:logistic',  # binary classification ('multi:softprob' would be the multi-class choice)
    random_state=127,
    # subsample is left at its default in all variants
)

# (display name, gamma i.e. min_split_loss, n_estimators)
_XGB_VARIANTS = (
    ('XGB Classifier special 1', 0, 100),
    ('XGB Classifier special 2', 0, 500),
    ('XGB Classifier special 3', 10, 100),
)

models = [
    Model(label, xgb.XGBClassifier(gamma=min_split_loss, n_estimators=n_trees, **_XGB_COMMON))
    for label, min_split_loss, n_trees in _XGB_VARIANTS
]

# fit each candidate on the training set and print its metrics on that same data
for entry in models:
    estimator = entry.model
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_train)
    display(HTML('<b>' + entry.name + ' TRAIN</b>'))
    print(classification_report(y_train, y_pred, digits=6))


              precision    recall  f1-score   support

           0   0.985262  0.996764  0.990980     47284
           1   0.888076  0.632621  0.738892      1919

    accuracy                       0.982562     49203
   macro avg   0.936669  0.814693  0.864936     49203
weighted avg   0.981472  0.982562  0.981148     49203



              precision    recall  f1-score   support

           0   0.985262  0.996764  0.990980     47284
           1   0.888076  0.632621  0.738892      1919

    accuracy                       0.982562     49203
   macro avg   0.936669  0.814693  0.864936     49203
weighted avg   0.981472  0.982562  0.981148     49203



              precision    recall  f1-score   support

           0   0.984893  0.996849  0.990835     47284
           1   0.889219  0.623241  0.732843      1919

    accuracy                       0.982278     49203
   macro avg   0.937056  0.810045  0.861839     49203
weighted avg   0.981161  0.982278  0.980773     49203



In [11]:
# models[0].model.get_params()

In [12]:
# fig = set_figure(20, 10, 'Matrices de confusión')
# cols = 3
# rows = (len(models) // cols) + 1

# for i, m in enumerate(models) :
#     y_pred = m.model.predict(X_train)
    
#     plt.subplot(rows, cols, i+1)
#     ax=sns.heatmap(confusion_matrix(y_train, y_pred), annot=True, fmt="d", cmap='Blues', cbar=False)
#     plt.tight_layout();
#     plt.title(m.name);
#     plt.ylabel('Actual Values')
#     plt.xlabel('Predicted Values')
   


---

In [13]:

filepath = './data/test/'
# sorted(): os.listdir gives no ordering guarantee, so sort for a reproducible load order
filelist = sorted(
    os.path.join(filepath, filename)
    for filename in os.listdir(filepath)
    if os.path.isfile(os.path.join(filepath, filename))
)
if not filelist:
    raise FileNotFoundError('no test files found in ' + repr(filepath))

# Read every headerless test CSV, dropping duplicate rows within each file
# (duplicates across files are kept), then concatenate once.
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame
# on every iteration; a single pd.concat avoids both problems.
test_raw = pd.concat(
    [pd.read_csv(filename, header=None).drop_duplicates() for filename in filelist]
)

# same column layout as the training data: 280 feature columns, target last
X_test = test_raw.iloc[:, 0:280]
y_test = test_raw.iloc[:, -1].apply(to_classes)

# using train scaler
X_test = scaler.transform(X_test)

# report held-out metrics for every fitted model
for m in models:
    y_pred = m.model.predict(X_test)
    display(HTML('<b>' + m.name + ' TEST</b>'))
    print(classification_report(y_test, y_pred, digits=6, target_names=None, output_dict=False))


              precision    recall  f1-score   support

           0   0.979161  0.995879  0.987449      6794
           1   0.777778  0.404959  0.532609       242

    accuracy                       0.975554      7036
   macro avg   0.878469  0.700419  0.760029      7036
weighted avg   0.972234  0.975554  0.971805      7036



              precision    recall  f1-score   support

           0   0.979161  0.995879  0.987449      6794
           1   0.777778  0.404959  0.532609       242

    accuracy                       0.975554      7036
   macro avg   0.878469  0.700419  0.760029      7036
weighted avg   0.972234  0.975554  0.971805      7036



              precision    recall  f1-score   support

           0   0.979574  0.995290  0.987369      6794
           1   0.759398  0.417355  0.538667       242

    accuracy                       0.975412      7036
   macro avg   0.869486  0.706323  0.763018      7036
weighted avg   0.972001  0.975412  0.971937      7036



---
