# Preparação do ambiente

## Bibliotecas

In [33]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from datetime import datetime
from joblib import load

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier

from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" presumably also hides warnings raised by the
# grid searches below (e.g. failed fits) — consider narrowing the filter.
warnings.filterwarnings("ignore") 

## Constantes e funções auxiliares

In [3]:
# Parquet file read into `df_marketplaces` by the data-loading cell.
FILE_MARKETPLACES = '../datasets/label_issues/20240423/results/marketplaces.parquet'
# Excel workbook used as the default input of `load_file_mercado`.
FILE_SUPERVISAO_MERCADO = '../datasets/supervisao_mercado/supervisao_mercado.xlsx'
# Serialized baseline model, loaded with joblib in the baseline cell.
FILE_MODEL_CLF3 = '../experimentos/assets/exp_clf3_model.joblib'

In [4]:
def load_file_mercado(file_mercado=FILE_SUPERVISAO_MERCADO):
    """Load every sheet of the market-supervision workbook into one DataFrame.

    Parameters
    ----------
    file_mercado : str
        Path of the Excel workbook. Defaults to ``FILE_SUPERVISAO_MERCADO``.

    Returns
    -------
    pandas.DataFrame
        Columns ``texto_busca``, ``titulo``, ``passivel_homologacao`` (cast to
        int) and ``marketplace``. Rows containing any missing value are dropped
        before the marketplace names are mapped, so a sheet whose name is not
        in the mapping ends up with a missing ``marketplace``.
    """
    # sheet_name=None makes pandas return a {sheet name: DataFrame} mapping.
    sheets = pd.read_excel(
        file_mercado,
        sheet_name=None,
        usecols=[2, 8, 10],
        names=['texto_busca', 'titulo', 'passivel_homologacao'],
        true_values=['Sim', 'sim'],
        false_values=['Não', 'não'],
        na_values=['-'],
    )

    # Tag each sheet's rows with the sheet name, then stack all sheets.
    tagged_frames = [
        frame.assign(marketplace=sheet_name)
        for sheet_name, frame in sheets.items()
    ]
    df_mercado = pd.concat(tagged_frames).dropna()
    df_mercado['passivel_homologacao'] = df_mercado['passivel_homologacao'].astype(int)

    # Sheet names -> marketplace display names.
    map_marketplaces = {
        'Amazon': 'Amazon',
        'Americanas': 'Lojas Americanas',
        'CasasBahia': 'Casas Bahia',
        'Magalu': 'Magazine Luiza',
        'MercadoLivre': 'Mercado Livre',
    }
    df_mercado['marketplace'] = df_mercado['marketplace'].map(map_marketplaces)

    return df_mercado

In [5]:
def tokenizer(doc):
    """Lower-case, strip accents, normalise a few terms and drop stop words.

    Parameters
    ----------
    doc : str
        Raw text (e.g. a product title).

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize`` that are not in the stop list.

    Notes
    -----
    NOTE(review): ``unidecode``, ``re``, ``stopwords``, ``punctuation`` and
    ``word_tokenize`` are not imported in the visible import cell — confirm
    they are in scope before this function is called.
    NOTE(review): accents are stripped from ``doc`` but not from the stop
    words, so accented entries (e.g. 'informações') can never match. Left
    unchanged here because a previously trained model may depend on the exact
    token output — confirm before changing.
    """
    doc = doc.lower()
    doc = unidecode.unidecode(doc)

    # Normalise spelling variants. Raw string avoids the invalid "\s" escape.
    patterns = [(r'wi\s?.?fi', 'wifi'),
                ('2,4', '2.4'),
                ('5,8', '5.8')]
    for pattern, repl in patterns:
        doc = re.sub(pattern, repl, doc)

    # A set gives constant-time membership checks; contents match the original list.
    stop_words = set(stopwords.words('portuguese'))
    stop_words.update(stopwords.words('english'))
    stop_words.update(punctuation)
    # Domain-specific stop words.
    stop_words.update(['cm', 'feature', 'features', 'informações', 'itens', 'leve', 'list', 'nulo', 'package', 'pacote', 'pacotes', 'recurso', 'tamanho', 'ver', 'unidades', 'fio', 'universal'])
    # Most common colours.
    stop_words.update(['preto', 'cinza', 'branco', 'rosa', 'vermelho', 'laranja', 'amarelo', 'verde', 'azul', 'roxo', 'marrom'])
    # Idea kept for reference: remove 'sem' from the stop words so the bigram
    # "sem fio" can be formed, which may be relevant for the domain.
    # stop_words.discard('sem')

    tokens = [token for token in word_tokenize(doc) if token not in stop_words]

    # Alternative tokenisation kept for reference:
    # tokens = [token for token in re.findall(r'\b\w\w+\b',doc) if token not in stop_words]

    return tokens

def clean_text(doc):
    """Return ``doc`` tokenised by ``tokenizer`` and re-joined with single spaces."""
    return ' '.join(tokenizer(doc))

In [6]:
def clf_score(clf, X_test, y_test):
    """Compute the ROC AUC of a fitted pipeline on a held-out set.

    Parameters
    ----------
    clf : pipeline-like
        Fitted object exposing ``predict_proba`` or ``decision_function`` and
        a final step reachable as ``clf['clf']``.
    X_test : array-like
        Samples to score.
    y_test : array-like
        True binary labels.

    Returns
    -------
    dict
        ``{'Model name': <class name of the 'clf' step>, 'AUC': <float>}``.
    """
    try:
        # Column 1 holds the probability of the positive class.
        y_score = clf.predict_proba(X_test)[:, 1]
    except (AttributeError, NotImplementedError):
        # Only fall back when probabilities are unavailable; any other error
        # (bad input, interruption, ...) must surface instead of being hidden.
        y_score = clf.decision_function(X_test)

    score = {
        'Model name': clf['clf'].__class__.__name__,
        'AUC': roc_auc_score(y_test, y_score),
    }

    return score

In [7]:
def save_best_params(search=None, pipeline=None, results=None, path='best_params.json'):
    """Record the best result of a grid search and persist all results as JSON.

    Parameters
    ----------
    search : fitted search object, optional
        Must expose ``best_score_`` and ``best_params_``. Defaults to the
        notebook-level ``gs``.
    pipeline : pipeline-like, optional
        Object whose ``['clf']`` item is the classifier; only its class name
        is recorded. Defaults to the notebook-level ``pipe``.
    results : list, optional
        List that receives the new summary dict (mutated in place). Defaults
        to the notebook-level ``gs_best_params``.
    path : str, optional
        JSON file overwritten with the whole ``results`` list.

    Returns
    -------
    None
        The summary is printed, appended to ``results`` and written to ``path``.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if search is None:
        search = gs
    if pipeline is None:
        pipeline = pipe
    if results is None:
        results = gs_best_params

    clf_best_params = {
        'name': pipeline['clf'].__class__.__name__,
        'auc': search.best_score_,
        # Stored as a JSON string, so the file holds a string inside JSON.
        'best_params': json.dumps(search.best_params_)
    }

    results.append(clf_best_params)
    print(clf_best_params)

    with open(path, 'w') as f:
        json.dump(results, f)

# Carga e preparação dos dados

In [8]:
df_marketplaces = pd.read_parquet(FILE_MARKETPLACES)
df_mercado = load_file_mercado()

# To iterate quickly while editing the notebook, shrink the data set:
# df_marketplaces = df_marketplaces.sample(10)

# Training data: keep only rows whose label is below 2
# (presumably 0/1 are the usable classes — confirm against the labelling step).
is_labelled = df_marketplaces['passivel_homologacao'] < 2
docs_marketplaces = df_marketplaces[is_labelled]['titulo'].values
targets_marketplaces = df_marketplaces[is_labelled]['passivel_homologacao'].values

# The grid search does not need a separate test subset,
# so every labelled marketplace row is used for training.
X_train = docs_marketplaces
y_train = targets_marketplaces

# The market-supervision data is used for validation.
X_valid = df_mercado['titulo'].values
y_valid = df_mercado['passivel_homologacao'].values

# Análise


https://levity.ai/blog/text-classifiers-in-machine-learning-a-practical-guide

In [9]:
# Accumulates one summary dict per grid search; save_best_params() appends to it.
# Re-running this cell resets the collected results.
gs_best_params = []

## Classificador 3.1 do experimento de classificação 3 (linha base)

In [10]:
# Load the previously trained baseline model from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
baseline_clf = load(FILE_MODEL_CLF3)

# Class name of the model's 'clf' step (not read again in the visible cells).
clf_name = baseline_clf['clf'].__class__.__name__
# AUC of the baseline on the validation set (X_valid / y_valid).
baseline_score = clf_score(baseline_clf, X_valid, y_valid)
baseline_score

{'Model name': 'SGDClassifier', 'AUC': 0.9592648815672467}

In [11]:
# fig, ax = plt.subplots(1,1,figsize=(5,5))

# x = baseline_score['Accuracy']
# y = baseline_score['Area under the ROC curve']

# ax.set_xlim(0,1)
# ax.set_ylim(0,1)

# _=ax.scatter(x,y, label='SGD')
# _=ax.scatter(x*0.9,y*0.5, label='NB')

# ax.legend()

## Buscar outros modelos de classificadores

In [12]:
# Timestamp taken right before the model search starts.
# NOTE(review): not read again in the visible cells — presumably meant for a total-runtime report.
start_time = datetime.now()

### Naive-Bayes

In [13]:
%%time
clf = BernoulliNB()

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    "clf__alpha": np.logspace(-6, 6, 13),
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)

save_best_params()

{'name': 'BernoulliNB', 'auc': 0.9558841708247019, 'best_params': '{"clf__alpha": 1.0}'}
CPU times: total: 984 ms
Wall time: 22.2 s


In [14]:
%%time
clf = ComplementNB()

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    "clf__alpha": np.logspace(-6, 6, 13),
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)
save_best_params()

{'name': 'ComplementNB', 'auc': 0.9590248791557359, 'best_params': '{"clf__alpha": 1.0}'}
CPU times: total: 562 ms
Wall time: 4.17 s


In [15]:
%%time
clf = MultinomialNB()

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    "clf__alpha": np.logspace(-6, 6, 13),
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe, parameter_grid, scoring=scoring, refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)
save_best_params()

{'name': 'MultinomialNB', 'auc': 0.9590248791557359, 'best_params': '{"clf__alpha": 1.0}'}
CPU times: total: 734 ms
Wall time: 4.34 s


### kNN

In [16]:
%%time
clf = KNeighborsClassifier()

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    "clf__n_neighbors": (3, 5, 10),
    "clf__p": (1, 2)
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)
save_best_params()

{'name': 'KNeighborsClassifier', 'auc': 0.9560730060566677, 'best_params': '{"clf__n_neighbors": 10, "clf__p": 2}'}
CPU times: total: 266 ms
Wall time: 27.6 s


### Linear models

#### SGD

In [17]:
%%time
clf = SGDClassifier()

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    "clf__alpha": np.logspace(-6, 6, 13),
    "clf__loss": ('hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron'),
    "clf__penalty": ('l1', 'l2', 'elasticnet'),
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)
save_best_params()

{'name': 'SGDClassifier', 'auc': 0.9682695039811327, 'best_params': '{"clf__alpha": 1e-05, "clf__loss": "log_loss", "clf__penalty": "l2"}'}
CPU times: total: 7.25 s
Wall time: 1min 13s


#### Logistic Regression

In [18]:
%%time
clf = LogisticRegression(solver='liblinear')

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    "clf__C": np.logspace(-5, 5, 11),
    "clf__penalty": ('l1', 'l2'),
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)
save_best_params()

{'name': 'LogisticRegression', 'auc': 0.9681783664152812, 'best_params': '{"clf__C": 10.0, "clf__penalty": "l2"}'}
CPU times: total: 1.86 s
Wall time: 11.4 s


#### Ridge

In [19]:
%%time
clf = RidgeClassifier(fit_intercept=False)

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    "clf__solver": ('cholesky', 'sparse_cg', 'sag', 'saga'),
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)
save_best_params()

{'name': 'RidgeClassifier', 'auc': 0.9660543202429157, 'best_params': '{"clf__solver": "sparse_cg"}'}
CPU times: total: 938 ms
Wall time: 3min 34s


### SVM

#### LinearSVC

In [20]:
%%time
clf = LinearSVC()

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    "clf__loss": ('hinge', 'squared_hinge'),
    "clf__C": np.logspace(-5,0,6),
    "clf__tol": np.logspace(-5,0,6),
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)
save_best_params()

{'name': 'LinearSVC', 'auc': 0.9674668783920553, 'best_params': '{"clf__C": 0.1, "clf__loss": "squared_hinge", "clf__tol": 0.1}'}
CPU times: total: 2.53 s
Wall time: 21.7 s


#### NuSVC

In [21]:
%%time
clf = NuSVC()

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    "clf__nu": (0.1, 0.25, 0.5, 0.75, 1),
    "clf__kernel": ('linear', 'poly', 'rbf', 'sigmoid', 'precomputed'),
    "clf__tol": np.logspace(-5,0,6),
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)
save_best_params()

{'name': 'NuSVC', 'auc': 0.9704121866805029, 'best_params': '{"clf__kernel": "rbf", "clf__nu": 0.25, "clf__tol": 1e-05}'}
CPU times: total: 1min
Wall time: 44min 29s


### Tree

#### Decision Tree

In [22]:
# Grid search: TF-IDF + single decision tree, tuning splitter, criterion and max_features.
clf = DecisionTreeClassifier()
vectorizer = TfidfVectorizer()
pipe = Pipeline([("vectorizer", vectorizer), ("clf", clf)])

# Only the classifier is tuned; vectorizer candidates are kept for reference.
parameter_grid = dict(
    # vectorizer__max_df=(0.2, 0.4, 0.6, 0.8, 1.0),
    # vectorizer__min_df=(1, 3, 5, 10),
    # vectorizer__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    # vectorizer__norm=("l1", "l2"),
    clf__splitter=("best", "random"),
    clf__criterion=("gini", "entropy", "log_loss"),
    clf__max_features=("sqrt", "log2"),
)
scoring = dict(AUC="roc_auc")

gs = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring=scoring,
    refit="AUC",
    n_jobs=10,
)
gs.fit(X_train, y_train)
# Reads the notebook-level `pipe` and `gs` set above.
save_best_params()

{'name': 'DecisionTreeClassifier', 'auc': 0.837621335002909, 'best_params': '{"clf__criterion": "log_loss", "clf__max_features": "sqrt", "clf__splitter": "best"}'}


#### Extremely randomized tree 

In [23]:
# Grid search: TF-IDF + extremely randomized tree, tuning splitter, criterion and max_features.
clf = ExtraTreeClassifier()
vectorizer = TfidfVectorizer()
pipe = Pipeline([("vectorizer", vectorizer), ("clf", clf)])

# Only the classifier is tuned; vectorizer candidates are kept for reference.
parameter_grid = dict(
    # vectorizer__max_df=(0.2, 0.4, 0.6, 0.8, 1.0),
    # vectorizer__min_df=(1, 3, 5, 10),
    # vectorizer__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    # vectorizer__norm=("l1", "l2"),
    clf__splitter=("best", "random"),
    clf__criterion=("gini", "entropy", "log_loss"),
    clf__max_features=("sqrt", "log2"),
)
scoring = dict(AUC="roc_auc")

gs = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring=scoring,
    refit="AUC",
    n_jobs=10,
)
gs.fit(X_train, y_train)
# Reads the notebook-level `pipe` and `gs` set above.
save_best_params()

{'name': 'ExtraTreeClassifier', 'auc': 0.8413073840482257, 'best_params': '{"clf__criterion": "entropy", "clf__max_features": "sqrt", "clf__splitter": "best"}'}


### Ensemble

#### RandomForest

In [24]:
# Grid search: TF-IDF + random forest, tuning the tree count, criterion and max_features.
clf = RandomForestClassifier()
vectorizer = TfidfVectorizer()
pipe = Pipeline([("vectorizer", vectorizer), ("clf", clf)])

# Only the classifier is tuned; vectorizer candidates are kept for reference.
parameter_grid = dict(
    # vectorizer__max_df=(0.2, 0.4, 0.6, 0.8, 1.0),
    # vectorizer__min_df=(1, 3, 5, 10),
    # vectorizer__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    # vectorizer__norm=("l1", "l2"),
    clf__n_estimators=(10, 100, 500),
    clf__criterion=("gini", "entropy", "log_loss"),
    clf__max_features=("sqrt", "log2"),
)
scoring = dict(AUC="roc_auc")

gs = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring=scoring,
    refit="AUC",
    n_jobs=10,
)
gs.fit(X_train, y_train)
# Reads the notebook-level `pipe` and `gs` set above.
save_best_params()

{'name': 'RandomForestClassifier', 'auc': 0.9708797237824582, 'best_params': '{"clf__criterion": "entropy", "clf__max_features": "log2", "clf__n_estimators": 500}'}


#### AdaBoost

In [25]:
# Grid search: TF-IDF + AdaBoost, tuning the number of estimators and the learning rate.
clf = AdaBoostClassifier()
vectorizer = TfidfVectorizer()
pipe = Pipeline([("vectorizer", vectorizer), ("clf", clf)])

# Only the classifier is tuned; vectorizer candidates are kept for reference.
parameter_grid = dict(
    # vectorizer__max_df=(0.2, 0.4, 0.6, 0.8, 1.0),
    # vectorizer__min_df=(1, 3, 5, 10),
    # vectorizer__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    # vectorizer__norm=("l1", "l2"),
    clf__n_estimators=(50, 100, 500),
    clf__learning_rate=np.logspace(-5, 0, 6),
)
scoring = dict(AUC="roc_auc")

gs = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring=scoring,
    refit="AUC",
    n_jobs=10,
)
gs.fit(X_train, y_train)
# Reads the notebook-level `pipe` and `gs` set above.
save_best_params()

{'name': 'AdaBoostClassifier', 'auc': 0.9491320075241912, 'best_params': '{"clf__learning_rate": 0.1, "clf__n_estimators": 500}'}


### Dummy

In [34]:
# Grid search: TF-IDF + dummy classifier, as a chance-level reference for the other models.
clf = DummyClassifier()

vectorizer = TfidfVectorizer()
pipe = Pipeline(steps = [('vectorizer',vectorizer),('clf',clf)])

parameter_grid = {
    # Vectorizer candidates kept for reference; only the classifier is tuned.
    # "vectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    # "vectorizer__min_df": (1, 3, 5, 10),
    # "vectorizer__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # "vectorizer__norm": ("l1", "l2"),
    # 'constant' was dropped: it requires the `constant` parameter, which is not
    # set here, so every fit with that strategy fails.
    "clf__strategy": ('most_frequent', 'prior', 'stratified', 'uniform'),
}
scoring = {"AUC": "roc_auc"}

gs = GridSearchCV(pipe,parameter_grid,scoring=scoring,refit='AUC', n_jobs=10)
gs.fit(X_train, y_train)
# Reads the notebook-level `pipe` and `gs` set above.
save_best_params()

{'name': 'DummyClassifier', 'auc': 0.5067833660734065, 'best_params': '{"clf__strategy": "stratified"}'}


### Results

In [35]:
# One row per recorded grid search, highest AUC first.
df = pd.DataFrame(gs_best_params).sort_values(by='auc', ascending=False)
df

Unnamed: 0,name,auc,best_params
11,RandomForestClassifier,0.97088,"{""clf__criterion"": ""entropy"", ""clf__max_featur..."
8,NuSVC,0.970412,"{""clf__kernel"": ""rbf"", ""clf__nu"": 0.25, ""clf__..."
4,SGDClassifier,0.96827,"{""clf__alpha"": 1e-05, ""clf__loss"": ""log_loss"",..."
5,LogisticRegression,0.968178,"{""clf__C"": 10.0, ""clf__penalty"": ""l2""}"
7,LinearSVC,0.967467,"{""clf__C"": 0.1, ""clf__loss"": ""squared_hinge"", ..."
6,RidgeClassifier,0.966054,"{""clf__solver"": ""sparse_cg""}"
1,ComplementNB,0.959025,"{""clf__alpha"": 1.0}"
2,MultinomialNB,0.959025,"{""clf__alpha"": 1.0}"
3,KNeighborsClassifier,0.956073,"{""clf__n_neighbors"": 10, ""clf__p"": 2}"
0,BernoulliNB,0.955884,"{""clf__alpha"": 1.0}"
