# Preparação do ambiente

## Bibliotecas

In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import os.path as osp
import pandas as pd
import re
import unidecode

from datetime import datetime
from joblib import load, dump
from os import cpu_count

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier

from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier


from string import punctuation

In [2]:
import warnings

# Silence every warning raised inside this kernel process.
warnings.filterwarnings(action="ignore")

# On POSIX systems the setting is also exported through the environment;
# presumably so that worker processes started later inherit it -- TODO confirm.
if os.name == 'posix':
    os.environ.update({'PYTHONWARNINGS': 'ignore'})


## Constantes e funções auxiliares

In [3]:
# Folder that holds every file read or written by this grid-search experiment.
DATA_DIR = '../datasets/experimento_gs'

# Input data sets and the stored model file (presumably the model of experiment 3 -- confirm).
FILE_MARKETPLACES = f'{DATA_DIR}/marketplaces.parquet'
FILE_SUPERVISAO_MERCADO = f'{DATA_DIR}/supervisao_mercado.xlsx'
FILE_MODEL_CLF3 = f'{DATA_DIR}/exp_clf3_model.joblib'

# JSON files with grid-search hyper-parameter results.
FILE_HYPER_PARAMETERS_VECTORIZERS = f'{DATA_DIR}/gs_hyper_paramenters_vectorizers.json'
FILE_HYPER_PARAMETERS_CLASSIFIERS = f'{DATA_DIR}/gs_hyper_paramenters_classifiers.json'
FILE_HYPER_PARAMETERS_CLF_VECTORIZERS = f'{DATA_DIR}/gs_hyper_paramenters_clf_vectorizer.json'

# Use half of the logical CPUs, but never fewer than one worker:
# os.cpu_count() may return None, and a single-CPU host would otherwise
# yield n_jobs=0, which joblib rejects.
N_JOBS = max(1, (os.cpu_count() or 1) // 2)

In [4]:
def load_file_mercado(file_mercado=FILE_SUPERVISAO_MERCADO):
    """Load the market-supervision workbook into one labelled DataFrame.

    Every sheet of the workbook is read, keeping the columns at positions
    2, 8 and 10 under the names ``texto_busca``, ``titulo`` and
    ``passivel_homologacao``. 'Sim'/'sim' and 'Não'/'não' are parsed as
    booleans and '-' as a missing value. Each row is tagged with the name of
    the sheet it came from, rows with any missing value are dropped, the label
    is cast to int and the sheet name is translated to a display name.

    Parameters
    ----------
    file_mercado : str
        Path of the Excel workbook (defaults to FILE_SUPERVISAO_MERCADO).

    Returns
    -------
    pandas.DataFrame
        Columns ``texto_busca``, ``titulo``, ``passivel_homologacao`` and
        ``marketplace``. Sheet names missing from the mapping below end up
        as NaN in ``marketplace``.
    """
    # sheet name -> marketplace display name
    display_names = {
        'Amazon': 'Amazon',
        'Americanas': 'Lojas Americanas',
        'CasasBahia': 'Casas Bahia',
        'Magalu': 'Magazine Luiza',
        'MercadoLivre': 'Mercado Livre',
    }

    sheets = pd.read_excel(
        file_mercado,
        sheet_name=None,  # None -> dict with one DataFrame per sheet
        usecols=[2, 8, 10],
        names=['texto_busca', 'titulo', 'passivel_homologacao'],
        true_values=['Sim', 'sim'],
        false_values=['Não', 'não'],
        na_values=['-'],
    )

    # Tag each sheet with its own name before stacking them
    tagged_sheets = [
        sheet.assign(marketplace=sheet_name)
        for sheet_name, sheet in sheets.items()
    ]

    df_mercado = pd.concat(tagged_sheets).dropna()
    df_mercado['passivel_homologacao'] = df_mercado['passivel_homologacao'].astype(int)
    df_mercado['marketplace'] = df_mercado['marketplace'].map(display_names)

    return df_mercado

In [5]:
def load_best_params(file):
    """Load a JSON file of grid-search results into a DataFrame.

    Parameters
    ----------
    file : str or path-like
        Path of a JSON document in any layout accepted by the
        ``pandas.DataFrame`` constructor (e.g. a list of records).

    Returns
    -------
    pandas.DataFrame
        The decoded JSON content as a table.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, so non-ASCII content loads the same way on every system.
    with open(file, encoding='utf-8') as f:
        clf_best_params = json.load(f)

    return pd.DataFrame(clf_best_params)

# Carga e preparação dos dados

In [6]:
# Raw inputs: marketplace listings and the market-supervision workbook
df_marketplaces = pd.read_parquet(FILE_MARKETPLACES)
df_mercado = load_file_mercado()

# Shrink the data set while developing the notebook
# df_marketplaces = df_marketplaces.sample(10)

# Data set: keep only the rows whose label is below 2
# NOTE(review): presumably labels >= 2 mark items without a usable label -- confirm.
df_labeled = df_marketplaces.loc[df_marketplaces['passivel_homologacao'] < 2]
docs_marketplaces = df_labeled['titulo'].values
targets_marketplaces = df_labeled['passivel_homologacao'].values

# A fixed seed keeps the split identical so experiment results stay comparable
X_train, X_test, y_train, y_test = train_test_split(
    docs_marketplaces,
    targets_marketplaces,
    test_size=0.25,
    random_state=724,
)

# Separate validation set taken from the market-supervision data
X_valid = df_mercado['titulo'].values
y_valid = df_mercado['passivel_homologacao'].values

# Análise


https://levity.ai/blog/text-classifiers-in-machine-learning-a-practical-guide

## Ajustando hiperparâmetros do vetorizador

In [7]:
# Timestamp taken before the grid searches start; presumably used further
# down to report the elapsed time -- TODO confirm.
start_time = datetime.now()

### Pré-processamento do texto

Em experimentos anteriores, não extensamente documentados, observei que utilizar uma função para pré-processar o texto é mais lento que processar o texto junto com o vetorizador do scikit-learn. Assim, será preparada apenas a lista de stop_words, deixando o processo de normalização (passar todas as palavras para minúsculas e remover acentos) para o vetorizador.

In [8]:
# Base list: NLTK stop words for Portuguese and English plus punctuation marks
stop_words = stopwords.words('portuguese')
stop_words.extend(stopwords.words('english'))
stop_words.extend(list(punctuation))
# domain-specific stop words
stop_words.extend(['cm', 'feature', 'features', 'informações', 'itens', 'leve', 'list', 'nulo', 'package', 'pacote', 'pacotes', 'recurso', 'tamanho', 'ver', 'unidades', 'fio', 'universal'])
# most common colours
stop_words.extend(['preto', 'cinza', 'branco', 'rosa', 'vermelho', 'laranja', 'amarelo', 'verde', 'azul', 'roxo', 'marrom'])
# Optional (currently disabled): drop "sem" from the stop words so that the
# bigram "sem fio", which may be relevant for the domain, can be formed
# stop_words.remove('sem')

# The vectorizers in this notebook run with strip_accents='unicode', so the
# documents are accent-stripped before stop words are filtered and an accented
# entry such as 'informações' or 'não' would never match a token.
# Store accent-free entries and drop the resulting duplicates (order is kept).
stop_words = list(dict.fromkeys(unidecode.unidecode(word) for word in stop_words))

### Vectorizer Grid Search

In [9]:
%%time
vectorizer = 'passthrough'
clf = LogisticRegression(C=10)
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = [
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__tokenizer': (None, word_tokenize),
        'vectorizer__strip_accents': ['unicode'],
        'vectorizer__stop_words': [None, stop_words],
        'vectorizer__ngram_range': ((1,1),(1,2),(1,3)),
        'vectorizer__max_df': np.linspace(0,1,5),
        'vectorizer__min_df': np.linspace(0,1,5),
    },
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__tokenizer': (None, word_tokenize),
        'vectorizer__use_idf': (True, False),
        'vectorizer__tokenizer': (None, word_tokenize),
        'vectorizer__strip_accents': ['unicode'],
        'vectorizer__stop_words': [None, stop_words],
        'vectorizer__ngram_range': ((1,1),(1,2),(1,3)),
        'vectorizer__max_df': np.linspace(0,1,5),
        'vectorizer__min_df': np.linspace(0,1,5),
        'vectorizer__norm': (None, 'l1', 'l2'),
    }
]

scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC',n_jobs=N_JOBS,verbose=1)
_=gs.fit(X_train, y_train)

Fitting 5 folds for each of 2100 candidates, totalling 10500 fits
CPU times: user 1min 54s, sys: 11.4 s, total: 2min 5s
Wall time: 7min 21s


In [10]:
# Parameter combination of the best-scoring candidate of the grid search fitted above
gs.best_params_

{'vectorizer': TfidfVectorizer(),
 'vectorizer__max_df': 0.25,
 'vectorizer__min_df': 0.0,
 'vectorizer__ngram_range': (1, 2),
 'vectorizer__norm': 'l2',
 'vectorizer__stop_words': None,
 'vectorizer__strip_accents': 'unicode',
 'vectorizer__tokenizer': None,
 'vectorizer__use_idf': True}

### Vectorizer/Classifier Grid Search

In [11]:
# Load the stored classifier results and keep only the classifiers that beat
# experiment 3 on both the test AUC and the validation AUC
df_best_clf = (
    load_best_params(FILE_HYPER_PARAMETERS_CLASSIFIERS)
    .query('test_auc > 0.957 and valid_auc > 0.926')
)
df_best_clf

Unnamed: 0,name,best_params,mean_fit_time,timestamp,train_auc,train_accuracy,test_auc,test_accuracy,valid_auc,valid_accuracy,confusion
0,NuSVC,"{""clf__kernel"": ""rbf"", ""clf__nu"": 0.25, ""clf__...",11.354448,1727125000.0,0.969549,0.971635,0.971635,0.927799,0.957585,0.907801,"{""tn"": 103, ""fn"": 18, ""tp"": 665, ""fp"": 60}"
1,RandomForestClassifier,"{""clf__criterion"": ""log_loss"", ""clf__max_featu...",57.431853,1727127000.0,0.969653,0.971072,0.971072,0.915153,0.964888,0.888889,"{""tn"": 82, ""fn"": 13, ""tp"": 670, ""fp"": 81}"
2,SGDClassifier,"{""clf__alpha"": 1e-05, ""clf__loss"": ""log_loss"",...",0.39511,1727724000.0,0.967485,0.969976,0.969976,0.917452,0.953911,0.886525,"{""tn"": 86, ""fn"": 19, ""tp"": 664, ""fp"": 77}"
3,LogisticRegression,"{""clf__C"": 10.0, ""clf__penalty"": ""l2""}",0.549632,1727725000.0,0.96777,0.969649,0.969649,0.916762,0.955196,0.891253,"{""tn"": 92, ""fn"": 21, ""tp"": 662, ""fp"": 71}"
4,LinearSVC,"{""clf__C"": 1.0, ""clf__loss"": ""squared_hinge"", ...",0.277607,1727724000.0,0.966858,0.968364,0.968364,0.919522,0.951881,0.895981,"{""tn"": 96, ""fn"": 21, ""tp"": 662, ""fp"": 67}"
5,RidgeClassifier,"{""clf__solver"": ""sparse_cg""}",4.685139,1727725000.0,0.966041,0.96669,0.96669,0.915153,0.94774,0.894799,"{""tn"": 98, ""fn"": 24, ""tp"": 659, ""fp"": 65}"
6,ComplementNB,"{""clf__alpha"": 1.0}",0.415025,1727725000.0,0.958971,0.95704,0.95704,0.902046,0.954657,0.880615,"{""tn"": 72, ""fn"": 10, ""tp"": 673, ""fp"": 91}"
7,MultinomialNB,"{""clf__alpha"": 1.0}",0.492348,1727725000.0,0.958971,0.95704,0.95704,0.894229,0.954657,0.875887,"{""tn"": 62, ""fn"": 4, ""tp"": 679, ""fp"": 101}"


In [12]:
# One dict per selected classifier: its name under 'clf' followed by the
# hyper-parameters decoded from the JSON string stored in 'best_params'
best_clf = df_best_clf[['name','best_params']].to_dict('records')
best_clf_params = [
    {'clf': record['name'], **json.loads(record['best_params'])}
    for record in best_clf
]
best_clf_params

[{'clf': 'NuSVC', 'clf__kernel': 'rbf', 'clf__nu': 0.25, 'clf__tol': 0.01},
 {'clf': 'RandomForestClassifier',
  'clf__criterion': 'log_loss',
  'clf__max_features': 'log2',
  'clf__n_estimators': 500},
 {'clf': 'SGDClassifier',
  'clf__alpha': 1e-05,
  'clf__loss': 'log_loss',
  'clf__penalty': 'l2'},
 {'clf': 'LogisticRegression', 'clf__C': 10.0, 'clf__penalty': 'l2'},
 {'clf': 'LinearSVC',
  'clf__C': 1.0,
  'clf__loss': 'squared_hinge',
  'clf__tol': 1.0},
 {'clf': 'RidgeClassifier', 'clf__solver': 'sparse_cg'},
 {'clf': 'ComplementNB', 'clf__alpha': 1.0},
 {'clf': 'MultinomialNB', 'clf__alpha': 1.0}]

In [13]:
%%time
vectorizer = TfidfVectorizer(
    max_df=0.25, 
    min_df=0.0, 
    ngram_range=(1, 2),
    strip_accents='unicode')

clf = 'passthrough'

pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = [
    {'clf': [NuSVC()], 'clf__kernel': ['rbf'], 'clf__nu': [0.25], 'clf__tol': [0.01]},
    {'clf': [RandomForestClassifier()],  'clf__criterion': ['log_loss'],  'clf__max_features': ['log2'],  'clf__n_estimators': [500]},
    {'clf': [SGDClassifier()],  'clf__alpha': [1e-05],  'clf__loss': ['log_loss'],  'clf__penalty': ['l2']},
    {'clf': [LogisticRegression()], 'clf__C': [10.0], 'clf__penalty': ['l2']}, 
    {'clf': [LinearSVC()],  'clf__C': [1.0],  'clf__loss': ['squared_hinge'],  'clf__tol': [1.0]},
    {'clf': [RidgeClassifier()], 'clf__solver': ['cholesky']}, 
    {'clf': [ComplementNB()], 'clf__alpha': [1.0]}, 
    {'clf': [MultinomialNB()], 'clf__alpha': [1.0]}]

scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC',n_jobs=N_JOBS,verbose=1)
_=gs.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
CPU times: user 8.3 s, sys: 739 ms, total: 9.04 s
Wall time: 3min 44s


In [14]:
# Summarise the cross-validation results: one row per classifier, labelled
# with the estimator's class name, best mean AUC first
df_vectorizer_results = (
    pd.DataFrame(gs.cv_results_)
    .assign(name=lambda results: results['param_clf'].apply(lambda estimator: type(estimator).__name__))
    .loc[:, ['name', 'params', 'mean_fit_time', 'mean_test_AUC', 'rank_test_AUC']]
    .sort_values(by='mean_test_AUC', ascending=False)
    .reset_index(drop=True)
)
df_vectorizer_results

Unnamed: 0,name,params,mean_fit_time,mean_test_AUC,rank_test_AUC
0,LogisticRegression,"{'clf': LogisticRegression(), 'clf__C': 10.0, ...",1.922004,0.97088,1
1,LinearSVC,"{'clf': LinearSVC(), 'clf__C': 1.0, 'clf__loss...",1.48149,0.970833,2
2,SGDClassifier,"{'clf': SGDClassifier(), 'clf__alpha': 1e-05, ...",1.525433,0.970828,3
3,NuSVC,"{'clf': NuSVC(), 'clf__kernel': 'rbf', 'clf__n...",70.06885,0.970747,4
4,RandomForestClassifier,"{'clf': RandomForestClassifier(), 'clf__criter...",185.299971,0.966551,5
5,ComplementNB,"{'clf': ComplementNB(), 'clf__alpha': 1.0}",1.48586,0.961284,6
6,MultinomialNB,"{'clf': MultinomialNB(), 'clf__alpha': 1.0}",1.482766,0.961284,6
7,RidgeClassifier,"{'clf': RidgeClassifier(), 'clf__solver': 'cho...",1.461512,,8


In [15]:
# Compare each classifier's stored test AUC ("previous") with the mean
# cross-validated AUC of the latest grid search ("actual").
# NOTE(review): previous_rank is the row position after the merge, so it
# assumes df_best_clf is already ordered by descending test AUC -- confirm.
df_best_clf_final = (
    df_best_clf[['name', 'test_auc']]
    .merge(df_vectorizer_results[['name', 'mean_test_AUC', 'rank_test_AUC']], how='left')
    .rename(columns={
        'test_auc': 'previous_test_auc',
        'mean_test_AUC': 'actual_test_auc',
        'rank_test_AUC': 'actual_rank',
    })
    .assign(
        previous_rank=lambda table: table.index.values + 1,
        delta_test_auc=lambda table: table['actual_test_auc'] - table['previous_test_auc'],
        # positive delta_rank = the classifier moved up in the ranking
        delta_rank=lambda table: table['previous_rank'] - table['actual_rank'],
    )
    .loc[:, [
        'name', 'previous_test_auc', 'actual_test_auc',
        'previous_rank', 'actual_rank',
        'delta_test_auc', 'delta_rank',
    ]]
    .sort_values(by='actual_test_auc', ascending=False)
    .reset_index(drop=True)
)

df_best_clf_final

Unnamed: 0,name,previous_test_auc,actual_test_auc,previous_rank,actual_rank,delta_test_auc,delta_rank
0,LogisticRegression,0.969649,0.97088,4,1,0.001231,3
1,LinearSVC,0.968364,0.970833,5,2,0.002468,3
2,SGDClassifier,0.969976,0.970828,3,3,0.000853,0
3,NuSVC,0.971635,0.970747,1,4,-0.000888,-3
4,RandomForestClassifier,0.971072,0.966551,2,5,-0.004521,-3
5,ComplementNB,0.95704,0.961284,7,6,0.004244,1
6,MultinomialNB,0.95704,0.961284,8,6,0.004244,2
7,RidgeClassifier,0.96669,,6,8,,-2


In [16]:
# Display the comparison table once more (same content as the previous cell)
df_best_clf_final

Unnamed: 0,name,previous_test_auc,actual_test_auc,previous_rank,actual_rank,delta_test_auc,delta_rank
0,LogisticRegression,0.969649,0.97088,4,1,0.001231,3
1,LinearSVC,0.968364,0.970833,5,2,0.002468,3
2,SGDClassifier,0.969976,0.970828,3,3,0.000853,0
3,NuSVC,0.971635,0.970747,1,4,-0.000888,-3
4,RandomForestClassifier,0.971072,0.966551,2,5,-0.004521,-3
5,ComplementNB,0.95704,0.961284,7,6,0.004244,1
6,MultinomialNB,0.95704,0.961284,8,6,0.004244,2
7,RidgeClassifier,0.96669,,6,8,,-2
