# Preparação do ambiente

## Bibliotecas

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import unidecode
import warnings 

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.decomposition import TruncatedSVD
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from string import punctuation

In [2]:
# Settings the warnings to be ignored 
warnings.filterwarnings('ignore')

## Constantes e funções auxiliares

In [3]:
def tokenizer(doc):
    doc = doc.lower()
    doc = unidecode.unidecode(doc)
    
    # uniformizar termos
    patterns = [('wi\s?.?fi', 'wifi'), 
                ('2,4', '2.4'), 
                ('5,8', '5.8')]
    for pattern, repl in patterns:
        doc = re.sub(pattern, repl, doc)
    
    stop_words = stopwords.words('portuguese')
    stop_words.extend(stopwords.words('english'))
    stop_words.extend(list(punctuation))
    # stopwords específicas do domínio
    stop_words.extend(['cm', 'feature', 'features', 'informações', 'itens', 'leve', 'list', 'nulo', 'package', 'pacote', 'pacotes', 'recurso', 'tamanho', 'ver', 'unidades', 'fio', 'universal'])
    # cores mais comumns
    stop_words.extend(['preto', 'cinza', 'branco', 'rosa', 'vermelho', 'laranja', 'amarelo', 'verde', 'azul', 'roxo', 'marrom'])
    # remover da lista de stopwords a palavra sem para formar o bigrama "sem fio", que pode ser relevante para o domínio
    # stop_words.remove('sem') 
    
    tokens = [token for token in word_tokenize(doc) if token not in stop_words]
    return tokens

def doc_cleaner(doc):
    tokens = tokenizer(doc)
    return ' '.join(tokens) 

# Carga e prepação dos dados

In [4]:
file_labeled_samples = '../datasets/amazon/targets/amazon_labeled_samples.xlsx'
df_labeled_samples = pd.read_excel(file_cat_sample)

df_labeled_samples = df_labeled_samples.iloc[:,-2:]
df_labeled_samples = df_labeled_samples.dropna()
df_labeled_samples.columns = ['doc', 'labeled_category']

map_telecom_product = {'Antena': 1,  
                       'Baterias (eletrônicos em geral)': 0, 
                       'Cabo (em geral)': 0, 
                       'Cabo de rede': 1, 
                       'Câmera sem fio': 1, 
                       'Carregador de celular': 1,
                       'Carregador portátil (powerbank)': 1,
                       'Carregador sem fio': 1, 
                       'Carregadores e fontes (eletrônicos em geral)': 0, 
                       'Cartão de memória': 0,
                       'Desktop/Notebook': 0, 
                       'Drone': 1, 
                       'Extensor de vídeo VGA/HDMI': 0, 
                       'Fone de ouvido sem fio': 1, 
                       'Hub Switch': 1, 
                       'Impressora e cartuchos': 0,
                       'Microfone sem fio': 1, 
                       'Modem 3G/4G/WiFi': 1, 
                       'Outros': 0, 
                       'Reforçador de sinal de celular': 1, 
                       'Roteador/Reforçador WiFi': 1, 
                       'Smart TV': 1,
                       'Smartwatch': 1, 
                       'Tablets': 1, 
                       'Telefone celular': 1, 
                       'Transceptor de Radiação Restrita': 1, 
                       'Transceptor de RF (HF/VHF/UHF)': 1, 
                       'Transceptor/Conversor SFP': 1, 
                       'TV Box': 1}

map_rf_transmitter = {'Antena': 1,  
                      'Baterias (eletrônicos em geral)': 0, 
                      'Cabo (em geral)': 0, 
                      'Cabo de rede': 1, 
                      'Câmera sem fio': 2, 
                      'Carregador de celular': 1,
                      'Carregador portátil (powerbank)': 1,
                      'Carregador sem fio': 1, 
                      'Carregadores e fontes (eletrônicos em geral)': 0, 
                      'Cartão de memória': 0,
                      'Desktop/Notebook': 0, 
                      'Drone': 2, 
                      'Extensor de vídeo VGA/HDMI': 0, 
                      'Fone de ouvido sem fio': 2, 
                      'Hub Switch': 1, 
                      'Impressora e cartuchos': 0,
                      'Microfone sem fio': 2, 
                      'Modem 3G/4G/WiFi': 2, 
                      'Outros': 0,                        
                      'Reforçador de sinal de celular': 2, 
                      'Roteador/Reforçador WiFi': 2, 
                      'Smart TV': 2,
                      'Smartwatch': 2, 
                      'Tablets': 2, 
                      'Telefone celular': 2, 
                      'Transceptor de Radiação Restrita': 2, 
                      'Transceptor de RF (HF/VHF/UHF)': 2, 
                      'Transceptor/Conversor SFP': 1, 
                      'TV Box': 2}

df_labeled_samples['telecom_product'] = df_labeled_samples['labeled_category'].map(map_telecom_product)
df_labeled_samples['rf_transmitter'] = df_labeled_samples['labeled_category'].map(map_rf_transmitter)

map_category_to_int = {category:i for i,category in enumerate(df_labeled_samples.labeled_category.unique())}
map_category_to_name = {i:category for category,i in map_category_to_int.items()}
df_cat_sample['category'] = df_cat_sample['labeled_category'].map(map_category_to_int)

df_cat_sample['clean_doc'] = df_cat_sample.doc.map(doc_cleaner)
columns_to_keep = ['doc', 'clean_doc', 'labeled_category', 'telecom_product', 'rf_transmitter', 'category']
df_cat_sample = df_cat_sample[columns_to_keep]

df_cat_sample

NameError: name 'file_cat_sample' is not defined

# Análise

Experimento de classificação simples: uma pequena amostra de anúncios foi rotulada manual para ser submetida a algoritmos de classificação para verificação dos resultados iniciais.

## Classificação binária

### Produtos de telecomunicações x Outros

In [None]:
X = df_cat_sample['doc']
y = df_cat_sample['telecom_product']
target_names = ['Produtos comuns', 'Produtos de telecomunicações']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=724)

In [None]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenizer)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
clf_dummy = DummyClassifier()
clf_dummy.fit(X_train_tfidf,y_train)
predicted = clf_dummy.predict(X_test_tfidf)

print('Accuracy of Dummy classifier on training set: {:.2f}'
     .format(clf_dummy.score(X_train, y_train)))
print('Accuracy of Dummy classifier on test set: {:.2f}'
     .format(clf_dummy.score(X_test, y_test)))

print(classification_report(y_test, predicted, target_names=target_names))

In [None]:
clf_sgd = SGDClassifier(loss='log_loss', penalty='l2',
                        alpha=1e-3, random_state=42,
                        max_iter=5, tol=None)

clf_sgd.fit(X_train_tfidf, y_train)
predicted = clf_sgd.predict(X_test_tfidf)

print('Accuracy of SGD classifier on training set: {:.3f}'
     .format(clf_sgd.score(X_train_tfidf, y_train)))
print('Accuracy of SGD classifier on test set: {:.3f}'
     .format(clf_sgd.score(X_test_tfidf, y_test)))
print('AUC of SGD classifier on test set: {:.3f}'
     .format(roc_auc_score(y_test,clf_sgd.predict_proba(X_test_tfidf)[:,1])))
print()
print(classification_report(y_test, predicted, target_names=target_names))

$Precision = \frac{TP}{TP+FP}$ quanto maior, menor a quantidade de falsos positivos

$Recall = \frac{TP}{TP+FN}$ quanto maior, menor a quantidade de falsos negativos

In [None]:
from sklearn.linear_model import LogisticRegression

clf_lgr = LogisticRegression()

clf_lgr.fit(X_train_tfidf, y_train)
predicted = clf_lgr.predict(X_test_tfidf)

print('Accuracy of SGD classifier on training set: {:.3f}'
     .format(clf_lgr.score(X_train_tfidf, y_train)))
print('Accuracy of SGD classifier on test set: {:.3f}'
     .format(clf_lgr.score(X_test_tfidf, y_test)))
print('AUC of SGD classifier on test set: {:.3f}'
     .format(roc_auc_score(y_test,clf_lgr.predict_proba(X_test_tfidf)[:,1])))
print()
print(classification_report(y_test, predicted, target_names=target_names))

In [None]:
pipe_sgd = Pipeline([('tfidf',TfidfVectorizer(tokenizer=tokenizer,ngram_range=(1,2))),
                     ('clf', SGDClassifier(alpha=1e-5, 
                                           loss='hinge', 
                                           max_iter=100, 
                                           penalty='l2', 
                                           random_state=42, 
                                           tol=None))])
pipe_sgd.fit(X_train,y_train)
predicted = pipe_sgd.predict(X_test)

print('Accuracy of SGD classifier on training set: {:.3f}'
     .format(pipe_sgd.score(X_train, y_train)))
print('Accuracy of SGD classifier on test set: {:.3f}'
     .format(pipe_sgd.score(X_test, y_test)))
print('AUC of SGD classifier on test set: {:.3f}'
     .format(roc_auc_score(y_test,pipe_sgd.decision_function(X_test))))
print()
print(classification_report(y_test, predicted, target_names=target_names))

In [None]:
# modelo anterior aparentemente overfitting, reduzir alpha
pipe_sgd = Pipeline([('tfidf',TfidfVectorizer(tokenizer=tokenizer,ngram_range=(1,2))),
                     ('clf', SGDClassifier(alpha=1e-3, 
                                           loss='hinge', 
                                           max_iter=100, 
                                           penalty='l2', 
                                           random_state=42, 
                                           tol=None))])

pipe_sgd.fit(X_train,y_train)
predicted = pipe_sgd.predict(X_test)

print('Accuracy of SGD classifier on training set: {:.3f}'
     .format(pipe_sgd.score(X_train, y_train)))
print('Accuracy of SGD classifier on test set: {:.3f}'
     .format(pipe_sgd.score(X_test, y_test)))
print('AUC of SGD classifier on test set: {:.3f}'
     .format(roc_auc_score(y_test,pipe_sgd.decision_function(X_test))))
print()
print(classification_report(y_test, predicted, target_names=target_names))

### Produtos transmissores de RF x Outros

In [None]:
map_rf_transmitter = {0: 0, 1: 0, 2: 1}
X = df_cat_sample['doc']
y = df_cat_sample['rf_transmitter'].map(map_rf_transmitter)

target_names = ['Produtos comuns', 'Produtos transmissores de RF']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=724)

pipe_sgd_rf = Pipeline([('tfidf',TfidfVectorizer(tokenizer=tokenizer,ngram_range=(1,2))),
                        ('clf', SGDClassifier(alpha=1e-3, 
                                              loss='hinge', 
                                              max_iter=100, 
                                              penalty='l2', 
                                              random_state=42, 
                                              tol=None))])

pipe_sgd_rf.fit(X_train,y_train)
predicted = pipe_sgd_rf.predict(X_test)

print('Accuracy of SGD classifier on training set: {:.3f}'
     .format(pipe_sgd_rf.score(X_train, y_train)))
print('Accuracy of SGD classifier on test set: {:.3f}'
     .format(pipe_sgd_rf.score(X_test, y_test)))
print('AUC of SGD classifier on test set: {:.3f}'
     .format(roc_auc_score(y_test,pipe_sgd_rf.decision_function(X_test))))
print()
print(classification_report(y_test, predicted, target_names=target_names))

## Classificação em 3 classes

In [None]:
X = df_cat_sample['doc']
y = df_cat_sample['rf_transmitter']
target_names = ['Produtos comuns', 'Produtos de telecomunicações', 'Transmissores de RF']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=724)

In [None]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenizer)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
clf_dummy = DummyClassifier()
clf_dummy.fit(X_train_tfidf,y_train)
predicted = clf_dummy.predict(X_test_tfidf)

print('Accuracy of Dummy classifier on training set: {:.2f}'
     .format(clf_dummy.score(X_train_tfidf, y_train)))
print('Accuracy of Dummy classifier on test set: {:.2f}'
     .format(clf_dummy.score(X_test_tfidf, y_test)))

print(classification_report(y_test, predicted, target_names=target_names))

In [None]:
clf_sgd = SGDClassifier(loss='hinge', penalty='l2',
                        alpha=1e-3, random_state=42,
                        max_iter=5, tol=None)

clf_sgd.fit(X_train_tfidf, y_train)
predicted = clf_sgd.predict(X_test_tfidf)

print('Accuracy of SGD classifier on training set: {:.2f}'
     .format(clf_sgd.score(X_train_tfidf, y_train)))
print('Accuracy of SGD classifier on test set: {:.2f}'
     .format(clf_sgd.score(X_test_tfidf, y_test)))

print(classification_report(y_test, predicted, target_names=target_names))

In [None]:
clf_lgr = LogisticRegression()

clf_lgr.fit(X_train_tfidf, y_train)
predicted = clf_lgr.predict(X_test_tfidf)

print('Accuracy of SGD classifier on training set: {:.2f}'
     .format(clf_lgr.score(X_train_tfidf, y_train)))
print('Accuracy of SGD classifier on test set: {:.2f}'
     .format(clf_lgr.score(X_test_tfidf, y_test)))

print(classification_report(y_test, predicted, target_names=target_names))

# Conclusão

Deste experimento extrai-se a viabilidade de construção de um classificador para identificar os produtos para telecomunicações (classificação binária).