# Preparação do ambiente

## Bibliotecas

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import unidecode
import warnings 

# from joblib import load, dump
from joblib import load, dump

from nltk import download
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from string import punctuation

from wordcloud import WordCloud

## Constantes e funções auxiliares

In [2]:
# Parquet file with the labeled marketplace listings (read with pd.read_parquet below).
FILE_MARKETPLACES = '../datasets/samples_labeling/issues_fiscaliza/20240423/results/labels.parquet'
# Excel workbook with the market supervision data (read with pd.read_excel, all sheets).
FILE_SUPERVISAO_MERCADO = '../datasets/inspecao_ecommerce/supervisao_mercado.xlsx'

# Serialized model loaded with joblib; paths are relative to the notebook's working directory.
FILE_MODEL = '../models/experimento_regulatron/exp_clf31_model.joblib'

# Carga e preparação dos dados

In [3]:
df_marketplaces = pd.read_parquet(FILE_MARKETPLACES)

# Dataset: keep only rows whose label is below 2.
# NOTE(review): presumably 0/1 are the binary classes and values >= 2 mark
# undecided/other cases - confirm against the labeling scheme.
is_labeled = df_marketplaces['passivel_homologacao'] < 2
docs = df_marketplaces.loc[is_labeled, 'titulo']
targets = df_marketplaces.loc[is_labeled, 'passivel_homologacao']

# Fixed seed so the train/test partition is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    docs,
    targets,
    test_size=0.25,
    random_state=724,
)

In [4]:
# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet
# name. Only three columns (by position) are kept and renamed; 'Sim'/'Não' are
# parsed as booleans and '-' as missing.
dict_df_mercado = pd.read_excel(
    FILE_SUPERVISAO_MERCADO,
    sheet_name=None,
    usecols=[2, 8, 10],
    names=['texto_busca', 'titulo', 'passivel_homologacao'],
    true_values=['Sim', 'sim'],
    false_values=['Não', 'não'],
    na_values=['-'],
)

# Tag each sheet's rows with the sheet name (used as the marketplace key).
for sheet_name, sheet_df in dict_df_mercado.items():
    sheet_df['marketplace'] = sheet_name

# Sheet name -> display name of the marketplace.
map_marketplaces = {
    'Amazon': 'Amazon',
    'Americanas': 'Lojas Americanas',
    'CasasBahia': 'Casas Bahia',
    'Magalu': 'Magazine Luiza',
    'MercadoLivre': 'Mercado Livre',
}

# Stack all sheets (in workbook order) and discard rows with any missing value.
df_mercado = pd.concat(list(dict_df_mercado.values())).dropna()
df_mercado['passivel_homologacao'] = df_mercado['passivel_homologacao'].astype(int)
df_mercado['marketplace'] = df_mercado['marketplace'].map(map_marketplaces)

# Validation set: listing titles and their labels.
X_valid = df_mercado['titulo']
y_valid = df_mercado['passivel_homologacao']

In [5]:
# NOTE(review): the model file is unpickled by joblib - only load files from a
# trusted source.
model = load(FILE_MODEL)

# Individual steps of the loaded model, looked up by name.
clf = model['clf']
vectorizer = model['vectorizer']
transformer = model['transformer']

# Vocabulary of the fitted vectorizer and the training titles pushed through
# the vectorizer followed by the transformer.
feature_names = vectorizer.get_feature_names_out()
term_counts = vectorizer.transform(X_train)
feature_vectors = transformer.transform(term_counts)

# Análise

In [6]:
# Average effect of each feature: the learned coefficient multiplied by the
# feature's mean value over the training vectors.
average_feature_effects = clf.coef_.ravel() * np.asarray(feature_vectors.mean(axis=0)).ravel()

# Feature indexes ordered from the largest to the smallest average effect;
# keep the 10 strongest.
features_indexes = np.argsort(average_feature_effects)[::-1]
top_features_indexes = features_indexes[:10]

# Build the frame column by column. Stacking the names and the effects into a
# single array (np.vstack) forces one common dtype, which turns the numeric
# 'effect' column into a non-numeric one and breaks later sorting, rounding
# and plotting.
df_features_effects = pd.DataFrame({
    'feature': feature_names[top_features_indexes],
    'effect': average_feature_effects[top_features_indexes],
})

df_features_effects

Unnamed: 0,feature,effect
0,bluetooth,0.286337
1,fio,0.175711
2,wifi,0.118603
3,transmissor,0.0841
4,carregador,0.08368
5,smartphone,0.052767
6,wireless,0.041821
7,ram,0.035423
8,receptor,0.034706
9,4k,0.032518
