# Preparação do ambiente

## Bibliotecas

In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os.path as osp
import pandas as pd
import re
import unidecode

from datetime import datetime
from joblib import dump

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier

from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from string import punctuation

In [None]:
import warnings

# Silence every warning category for the rest of the notebook session.
warnings.simplefilter("ignore")

## Constantes e funções auxiliares

In [None]:
# All experiment artifacts live under a single dataset folder, so the
# individual paths are derived from it.
_EXPERIMENT_FOLDER = '../datasets/experimento_gs/'

# Destination folder for the serialized best estimators.
MODELS_FOLDER = _EXPERIMENT_FOLDER + 'best_models/'
# Input datasets.
FILE_MARKETPLACES = _EXPERIMENT_FOLDER + 'marketplaces.parquet'
FILE_SUPERVISAO_MERCADO = _EXPERIMENT_FOLDER + 'supervisao_mercado.xlsx'
# JSON file associated with the grid-search hyper-parameters.
FILE_HYPER_PARAMETERS_MODEL = _EXPERIMENT_FOLDER + 'gs_hyper_paramenters_model.json'

# Number of parallel workers handed to GridSearchCV.
N_JOBS = 6

In [None]:
def load_file_mercado(file_mercado=FILE_SUPERVISAO_MERCADO):
    """Load the market-supervision workbook into a single labelled DataFrame.

    Every sheet of the workbook is read (``sheet_name=None``), keeping the
    columns at positions 2, 8 and 10 and naming them ``texto_busca``,
    ``titulo`` and ``passivel_homologacao``. ``Sim``/``sim`` and
    ``Não``/``não`` are parsed as booleans and ``-`` as a missing value.

    Args:
        file_mercado: Path of the Excel workbook to read.

    Returns:
        A DataFrame with the three columns above plus ``marketplace``.
        Rows with any missing value are dropped and
        ``passivel_homologacao`` is cast to ``int``. ``marketplace`` holds
        the display name mapped from the sheet name; sheet names missing
        from the mapping end up as NaN.
    """
    # sheet name -> marketplace display name
    display_names = {
        'Amazon': 'Amazon',
        'Americanas': 'Lojas Americanas',
        'CasasBahia': 'Casas Bahia',
        'Magalu': 'Magazine Luiza',
        'MercadoLivre': 'Mercado Livre',
    }

    sheets = pd.read_excel(
        file_mercado,
        sheet_name=None,
        usecols=[2, 8, 10],
        names=['texto_busca', 'titulo', 'passivel_homologacao'],
        true_values=['Sim', 'sim'],
        false_values=['Não', 'não'],
        na_values=['-'],
    )

    # Tag each sheet's rows with the sheet they came from before stacking.
    frames = [
        frame.assign(marketplace=sheet_name)
        for sheet_name, frame in sheets.items()
    ]

    merged = pd.concat(frames).dropna()
    merged['passivel_homologacao'] = merged['passivel_homologacao'].astype(int)
    merged['marketplace'] = merged['marketplace'].map(display_names)
    return merged

In [None]:
def _json_default(value):
    """Encode grid-search values that ``json`` cannot serialize natively."""
    if callable(value):
        # e.g. a tokenizer function selected through ``vectorizer__tokenizer``
        return getattr(value, '__name__', repr(value))
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(
        f'Object of type {type(value).__name__} is not JSON serializable'
    )


def _evaluate_split(estimator, X, y):
    """Return (accuracy, ROC AUC, confusion counts) of ``estimator`` on X, y."""
    # Use probabilities when the classifier offers them, else raw margins.
    if hasattr(estimator['clf'], 'predict_proba'):
        y_score = estimator.predict_proba(X)[:, 1]
    else:
        y_score = estimator.decision_function(X)

    cm = confusion_matrix(y, estimator.predict(X))
    confusion = {
        'tn': int(cm[0][0]),
        'fn': int(cm[1][0]),
        'tp': int(cm[1][1]),
        'fp': int(cm[0][1]),
    }
    return estimator.score(X, y), roc_auc_score(y, y_score), confusion


def extract_best_params():
    """Summarize the current grid search ``gs`` as a flat dictionary.

    Reads the notebook globals ``gs`` (a fitted ``GridSearchCV``),
    ``X_train``/``y_train``, ``X_test``/``y_test`` and
    ``X_valid``/``y_valid``.

    Returns:
        A dict with the classifier class name, the best parameters as a
        JSON string, the mean fit time over all candidates, a timestamp,
        and accuracy / ROC AUC / confusion-matrix counts (JSON strings) for
        the test and validation sets. ``train_auc`` is ``gs.best_score_``
        and ``train_accuracy`` is the accuracy of the refit best estimator
        on the training split.
    """
    best_estimator = gs.best_estimator_

    summary = {
        'name': best_estimator['clf'].__class__.__name__,
        # best_params_ may hold a callable (tokenizer), which plain
        # json.dumps rejects with TypeError.
        'best_params': json.dumps(gs.best_params_, default=_json_default),
        'mean_fit_time': gs.cv_results_['mean_fit_time'].mean(),
        'timestamp': datetime.now().timestamp(),
        'train_auc': gs.best_score_,
        # Previously gs.score(X_test, y_test), i.e. the refit metric (AUC)
        # on the test set stored under a "train accuracy" label.
        'train_accuracy': best_estimator.score(X_train, y_train),
    }

    test_accuracy, test_auc, test_confusion = _evaluate_split(
        best_estimator, X_test, y_test
    )
    valid_accuracy, valid_auc, valid_confusion = _evaluate_split(
        best_estimator, X_valid, y_valid
    )

    summary['test_auc'] = test_auc
    summary['test_accuracy'] = test_accuracy
    summary['valid_auc'] = valid_auc
    summary['valid_accuracy'] = valid_accuracy
    summary['test_confusion'] = json.dumps(test_confusion)
    summary['valid_confusion'] = json.dumps(valid_confusion)

    return summary

In [None]:
def save_best_estimator():
    """Serialize the best pipeline of the global grid search ``gs``.

    The file is written with joblib to ``MODELS_FOLDER`` and named after
    the class of the pipeline's ``clf`` step, with a ``.joblib`` suffix.
    """
    winner = gs.best_estimator_
    target_path = f"{MODELS_FOLDER}{type(winner['clf']).__name__}.joblib"
    dump(winner, target_path)

# Carga e preparação dos dados

In [None]:
# Raw inputs: the marketplace listings and the market-supervision workbook.
df_marketplaces = pd.read_parquet(FILE_MARKETPLACES)
df_mercado = load_file_mercado()

# shrink the dataset while building the notebook
# df_marketplaces = df_marketplaces.sample(10)

# dataset: only rows whose label is below 2 are used
# (presumably the binary 0/1 labels; confirm against the parquet schema)
_labelled = df_marketplaces[df_marketplaces['passivel_homologacao'] < 2]
docs_marketplaces = _labelled['titulo'].values
targets_marketplaces = _labelled['passivel_homologacao'].values

# Fixed seed so the split stays the same and experiment results remain comparable
X_train, X_test, y_train, y_test = train_test_split(
    docs_marketplaces,
    targets_marketplaces,
    test_size=0.25,
    random_state=724,
)

# Separate validation set taken from the market-supervision workbook
X_valid = df_mercado['titulo'].values
y_valid = df_mercado['passivel_homologacao'].values

# Análise


## Seleção do melhor modelo

In [None]:
# Accumulates one summary dict (from extract_best_params) per fitted grid search.
models_best_params = []

### Palavras mais frequentes

In [None]:
import unicodedata


def _strip_accents(text):
    """Remove combining marks after NFKD normalization.

    Mirrors the NFKD-based accent stripping that the vectorizer applies
    with ``strip_accents='unicode'``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


stop_words = stopwords.words('portuguese')
stop_words.extend(stopwords.words('english'))
stop_words.extend(list(punctuation))
# domain-specific stop words
stop_words.extend(['cm', 'feature', 'features', 'informações', 'itens', 'leve', 'list', 'nulo', 'package', 'pacote', 'pacotes', 'recurso', 'tamanho', 'ver', 'unidades', 'fio', 'universal'])
# most common colors
stop_words.extend(['preto', 'cinza', 'branco', 'rosa', 'vermelho', 'laranja', 'amarelo', 'verde', 'azul', 'roxo', 'marrom'])

# The vectorizer strips accents before it filters stop words, so accented
# entries such as 'não' or 'informações' would never match a token. Add the
# accent-free variants (sorted for a deterministic list); the accented
# originals are kept for vectorizers that do not strip accents.
stop_words.extend(sorted({_strip_accents(word) for word in stop_words} - set(stop_words)))

# drop the word "sem" from the stop words so the bigram "sem fio" (wireless),
# which may be relevant for the domain, can be formed;
# keep it as a separate list to check whether it affects classifier performance
stop_words_wo_sem = [word for word in stop_words if word != 'sem']

### LogisticRegression

In [None]:
%%time
# TF-IDF features feeding a logistic-regression classifier.
vectorizer = TfidfVectorizer(strip_accents='unicode')
clf = LogisticRegression()
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('clf', clf)])

parameter_grid = {
    'vectorizer__max_df': [0.2, 0.4, 0.6, 0.8, 1.0],
    'vectorizer__min_df': [1, 3, 5, 10],
    # unigrams only, or unigrams plus bigrams
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__tokenizer': [None, word_tokenize],
    'vectorizer__stop_words': [None, stop_words, stop_words_wo_sem],
    # 'vectorizer__norm': ['l1', 'l2'],
    # 'clf__penalty': ['l1', 'l2', 'elasticnet', None],
    # 'clf__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'clf__tol': np.logspace(-5, 0, 6),
    'clf__C': np.logspace(-4, 4, 9),
}

# Single metric, keyed by name so the search can refit on it.
scoring = {"AUC": "roc_auc"}
gs = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring=scoring,
    refit='AUC',
    n_jobs=N_JOBS,
    verbose=1,
)
_ = gs.fit(X_train, y_train)

# Persist the winner and record its metrics next to the other models.
save_best_estimator()
models_best_params.append(extract_best_params())
df = pd.DataFrame(models_best_params).sort_values(by='valid_auc', ascending=False)

### LinearSVC

In [None]:
# %%time
# vectorizer = TfidfVectorizer()
# clf = LinearSVC()

# pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

# parameter_grid = {
#     # 'vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
#     # 'vectorizer__min_df': (1, 3, 5, 10),
#     # 'vectorizer__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams or trigrams
#     # 'vectorizer__tokenizer': (None, word_tokenize),
#     'vectorizer__strip_accents': ['ascii', 'unicode'],
#     # 'vectorizer__stop_words': [None, stop_words, stop_words_wo_sem],
#     # 'vectorizer__norm': ("l1", "l2"),
#     'clf__penalty': ('l1', 'l2'),
# }

# scoring = {"AUC": "roc_auc"}
# gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC',n_jobs=N_JOBS,verbose=1)
# _=gs.fit(X_train, y_train)

# models_best_params.append(extract_best_params())

### SGDClassifier

In [None]:
# %%time
# vectorizer = TfidfVectorizer()
# clf = SGDClassifier()

# pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

# parameter_grid = {
#     # 'vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
#     # 'vectorizer__min_df': (1, 3, 5, 10),
#     # 'vectorizer__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams or trigrams
#     # 'vectorizer__tokenizer': (None, word_tokenize),
#     'vectorizer__strip_accents': ['ascii', 'unicode'],
#     # 'vectorizer__stop_words': [None, stop_words, stop_words_wo_sem],
#     # 'vectorizer__norm': ("l1", "l2"),
#     'clf__alpha': (0.5, 1),
# }

# scoring = {"AUC": "roc_auc"}

# gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC',n_jobs=N_JOBS,verbose=1)
# _=gs.fit(X_train, y_train)
# models_best_params.append(extract_best_params())

### NuSVC

In [None]:
# %%time
# vectorizer = TfidfVectorizer()
# clf = NuSVC()

# pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

# parameter_grid = {
#     # 'vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
#     # 'vectorizer__min_df': (1, 3, 5, 10),
#     # 'vectorizer__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams or trigrams
#     # 'vectorizer__tokenizer': (None, word_tokenize),
#     'vectorizer__strip_accents': ['ascii', 'unicode'],
#     # 'vectorizer__stop_words': [None, stop_words, stop_words_wo_sem],
#     # 'vectorizer__norm': ("l1", "l2"),
#     'clf__nu': (0.25, 0.5),  # NuSVC has no `alpha` parameter; it is controlled by `nu` in (0, 1]
# }

# scoring = {"AUC": "roc_auc"}

# gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC',n_jobs=N_JOBS,verbose=1)
# _=gs.fit(X_train, y_train)
# models_best_params.append(extract_best_params())

In [None]:
# Tabulate the collected per-model summaries; the bare expression on the last
# line makes the notebook render the table.
df_models_best_params = pd.DataFrame(models_best_params)
df_models_best_params