# Preparação do ambiente

## Bibliotecas

In [38]:
import configparser
import hashlib
import json
import os
import pandas as pd
import requests
import time
import urllib.parse

from pathlib import Path
from tqdm.auto import tqdm

In [2]:
pd.options.mode.chained_assignment = None  # default='warn'

## Constantes e funções auxiliares

In [3]:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'

In [4]:
def get_new_access_token(actual_refresh_token,client_id,client_secret):
    url = 'https://api.mercadolibre.com/oauth/token'
    payload = {
        'grant_type': 'refresh_token', 
        'client_id': client_id,
        'client_secret': client_secret, 
        'refresh_token': actual_refresh_token
    }
        
    headers = {
        'accept': 'application/json',
        'content-type': 'application/x-www-form-urlencoded'
    }
        
    response = requests.post(url, headers=headers, data=payload)
    content = response.content.decode(response.encoding)
    content =  json.loads(content)
        
    return content['access_token'], content['refresh_token']

In [5]:
def load_sch(sch_database_file):
    
    # load SCH database
    usecols = [0,1,11,12,13,14,15]
    dtype = {'Número de Homologação': 'str'}
    parse_dates = [0]
    date_format = '%d/%m/%Y'

    df_sch = pd.read_csv(
        sch_database_file,
        sep=';',
        usecols=usecols,
        dtype=dtype,
        parse_dates=parse_dates,
        date_format=date_format
        )

    df_sch = df_sch.sort_values(by='Data da Homologação',ascending=False)

    df_modelo = df_sch[['Número de Homologação','Modelo']].dropna()
    df_modelo = df_modelo.groupby('Número de Homologação',as_index=False)['Modelo'].apply(lambda x: ' | '.join(x))
    df_modelo = df_modelo.drop_duplicates(subset='Número de Homologação')

    df_nome_comercial = df_sch[['Número de Homologação','Nome Comercial']].dropna()
    df_nome_comercial = df_nome_comercial.groupby('Número de Homologação',as_index=False)['Nome Comercial'].apply(lambda x: ' | '.join(x))
    df_nome_comercial = df_nome_comercial.drop_duplicates(subset='Número de Homologação')

    columns_to_keep = ['Data da Homologação', 'Número de Homologação', 'Nome do Fabricante']
    df_sch_models = df_sch[columns_to_keep]
    df_sch_models = df_sch_models.drop_duplicates(subset='Número de Homologação')

    df_sch_models = df_sch_models.merge(df_modelo,how='left')
    df_sch_models = df_sch_models.merge(df_nome_comercial,how='left')
    df_sch_models = df_sch.fillna('')
    df_sch_models['Modelo Completo'] = df_sch_models[['Modelo', 'Nome Comercial']].apply(lambda row: ' | '.join(row) if len(row['Nome Comercial'])>0 else row['Modelo'],axis=1)

    columns_to_keep = ['Número de Homologação', 'Nome do Fabricante', 'Modelo Completo']
    df_sch_models = df_sch_models.drop_duplicates(subset='Número de Homologação')
    df_sch_models = df_sch_models[columns_to_keep].reset_index(drop=True)
    
    return df_sch, df_sch_models

In [20]:
def parse_results(result, return_anatel_homologation_number=False):
    keys_to_keep = ['id', 'title', 'catalog_product_id', 'permalink', 'category_id', 'domain_id', 
                    'currency_id', 'price', 'original_price', 
                    'initial_quantity', 'available_quantity', 'official_store_id', 'official_store_name']
    
    parsed_result = {}
    for key in keys_to_keep:
        parsed_result[key] = result.get(key)
    
    if result.get('seller') is not None:
        parsed_result['seller_id'] = result.get('seller').get('id')
        parsed_result['seller_nickname'] = result.get('seller').get('nickname')
    else:
        parsed_result['seller_id'] = None
        parsed_result['seller_nickname'] = None    

    brand = None
    model = None
    gtin = None
    detailed_model = None
    anatel_homologation_number = None
    cellphones_anatel_homologation_number = None
    
    for attribute in result['attributes']:
        if attribute['id'] == 'BRAND':
            brand = attribute['value_name']
        elif attribute['id'] == 'MODEL':
            model = attribute['value_name']
        elif attribute['id'] == 'DETAILED_MODEL':
            detailed_model = attribute['value_name'] 
        elif attribute['id'] == 'GTIN':
            gtin = attribute['value_name'] 
        elif attribute['id'] == 'ANATEL_HOMOLOGATION_NUMBER':
            anatel_homologation_number = attribute['value_name']
        elif attribute['id'] == 'CELLPHONES_ANATEL_HOMOLOGATION_NUMBER':
            cellphones_anatel_homologation_number = attribute['value_name']

    if return_anatel_homologation_number:
        if anatel_homologation_number is None:
            item_id = parsed_result['id']
            attr_url = f'https://api.mercadolibre.com/items/{item_id}?attributes=attributes&include_internal_attributes=true'
            headers = {'user-agent': USER_AGENT}
            attr_response = requests.get(attr_url,headers=headers)
            attr_content = attr_response.content.decode(attr_response.encoding)
            attr_content =  json.loads(attr_content)
            attr_content = attr_content['attributes']
            for attr in attr_content:
                if attr['id'] == 'ANATEL_HOMOLOGATION_NUMBER':
                    anatel_homologation_number = attr['value_name']
    
        if cellphones_anatel_homologation_number is None:
            item_id = parsed_result['id']
            attr_url = f'https://api.mercadolibre.com/items/{item_id}?attributes=attributes&include_internal_attributes=true'
            headers = {'user-agent': USER_AGENT}
            attr_response = requests.get(attr_url,headers=headers)
            attr_content = attr_response.content.decode(attr_response.encoding)
            attr_content =  json.loads(attr_content)
            attr_content = attr_content['attributes']
            for attr in attr_content:
                if attr['id'] == 'CELLPHONES_ANATEL_HOMOLOGATION_NUMBER':
                    cellphones_anatel_homologation_number = attr['value_name']
    else:
        anatel_homologation_number = '999999999999'
        cellphones_anatel_homologation_number = '999999999999'

    parsed_result['brand'] = brand
    parsed_result['model'] = model
    parsed_result['detailed_model'] = detailed_model
    parsed_result['gtin'] = gtin
    parsed_result['anatel_homologation_number'] = anatel_homologation_number
    parsed_result['cellphones_anatel_homologation_number'] = cellphones_anatel_homologation_number
    
    return parsed_result

# Carga e prepação dos dados

In [7]:
sch_database_file = '../../certificacao-homologacao/schwebsearch/datasets/sch_database/produtos_certificados.zip'
df_sch, df_sch_models = load_sch(sch_database_file)
df_sch

Unnamed: 0,Data da Homologação,Número de Homologação,Nome do Fabricante,Modelo,Nome Comercial,Categoria do Produto,Tipo do Produto
136048,2024-05-20,030572416551,Decathlon,8605113,W900,2,Transceptor de Radiação Restrita
138597,2024-05-20,004732404809,Elsys Equipamentos Eletrônicos Ltda,ESF-DE5100I,,2,Sistemas de Identificação por Radiofrequências
137797,2024-05-20,061542414894,"Shenzhen Baseus Technology Co., Ltd.",PPAP2-10A,,1,Acessório p/ Telefone Móvel Celular do tipo Ba...
140046,2024-05-20,050542403757,Lear Corporation,KOBJXF23A,,2,Sistemas Operando nas Faixas de RF Ultra Larga
137799,2024-05-20,062092408867,"Fortinet, Inc.",FG-120G,,3,Equipamento de Rede de Dados
...,...,...,...,...,...,...,...
378,2001-09-21,020050101504,Nortel Networks Inc.,Metrocell / 800 MFRM,Metrocell / 800 MFRM,3,Transceptor para Estação Rádio Base
332,2001-09-21,020060101504,Nortel Networks Inc.,Minicell / 800 MFRM,Minicell / 800 MFRM,3,Transceptor para Estação Rádio Base
345,2001-09-19,020020100563,Nokia do Brasil Tecnologia Ltda.,3320,3320,1,Telefone Móvel Celular
409,2001-09-19,020030100563,Nokia do Brasil Tecnologia Ltda.,Freedom TD1000,Freedom TD1000,1,Telefone Móvel Celular


In [8]:
df_sch_models

Unnamed: 0,Número de Homologação,Nome do Fabricante,Modelo Completo
0,030572416551,Decathlon,8605113 | W900
1,004732404809,Elsys Equipamentos Eletrônicos Ltda,ESF-DE5100I
2,061542414894,"Shenzhen Baseus Technology Co., Ltd.",PPAP2-10A
3,050542403757,Lear Corporation,KOBJXF23A
4,062092408867,"Fortinet, Inc.",FG-120G
...,...,...,...
69916,020050101504,Nortel Networks Inc.,Metrocell / 800 MFRM | Metrocell / 800 MFRM
69917,020060101504,Nortel Networks Inc.,Minicell / 800 MFRM | Minicell / 800 MFRM
69918,020020100563,Nokia do Brasil Tecnologia Ltda.,3320 | 3320
69919,020030100563,Nokia do Brasil Tecnologia Ltda.,Freedom TD1000 | Freedom TD1000


In [9]:
file_ean_celulares = '../datasets/lista_celulares_homologados_ean.xlsx'

dtype = {'Número de Homologação': 'str', 'Código EAN': 'str'}

df_ean_celulares = pd.read_excel(file_ean_celulares,dtype=dtype)
df_ean_celulares
df_ean_celulares = df_ean_celulares[df_ean_celulares['Código EAN'].str.len()>=13]
df_ean_celulares[df_ean_celulares['Código EAN'].str.len()>=13]
df_ean_celulares['Código EAN'] = df_ean_celulares['Código EAN'].apply(lambda x:  x.strip())

columns_to_keep = ['Código EAN', 'Número de Homologação']
df_ean_sch = df_ean_celulares[columns_to_keep].drop_duplicates()
# df_ean_sch = df_ean_sch.dropna().reset_index(drop=True)
# df_ean_sch = df_ean_sch.astype('int64')

df_ean_sch.columns = ['ean_sch', 'sch_sch']
df_ean_sch.head()

Unnamed: 0,ean_sch,sch_sch
0,7892597349623,22972000330
1,7892597349630,22972000330
2,7892597350971,13692100330
3,7892597350988,13692100330
4,7892597351749,13692100330


In [10]:
creds_file = Path(os.environ['USERPROFILE'],'creds.ini')
creds = configparser.ConfigParser()
creds.read(creds_file)  

client_id = creds['MERCADO_LIVRE']['client_id']
client_secret = creds['MERCADO_LIVRE']['client_secret']
actual_refresh_token = creds['MERCADO_LIVRE']['actual_refresh_token']
actual_access_token = creds['MERCADO_LIVRE']['actual_access_token']

In [11]:
def update_access_token():

    global actual_access_token
    global actual_refresh_token
    
    actual_access_token, actual_refresh_token = get_new_access_token(actual_refresh_token,client_id,client_secret)
    
    creds['MERCADO_LIVRE']['actual_access_token'] = actual_access_token
    creds['MERCADO_LIVRE']['actual_refresh_token'] = actual_refresh_token
    
    with open(creds_file, 'w') as file:
        creds.write(file)

# Análise

## Pesquisa categoria *"Celulares e Smartphones"*

In [12]:
update_access_token()

In [46]:
# Get first page of results
url = 'https://api.mercadolibre.com/sites/MLB/search?category=MLB1055'
params = {'offset': 0}
headers = {
  'Authorization': f'Bearer {actual_access_token}'
}

response = requests.request("GET", url, headers=headers, params=params)
content = response.content.decode(response.encoding)
content =  json.loads(content)
category_results = content['results']

total_items = content['paging']['total']
next_offsets = list(range(50,total_items+1,50))

print(f'Total itens found: {total_items}')

Total itens found: 63812


In [47]:
for offset in tqdm(next_offsets):
    
    params = {'offset': offset}
    response = requests.request("GET", url, headers=headers, params=params)   
    
    if response.status_code == 200:
        content = response.content.decode(response.encoding)
        content =  json.loads(content)
        category_results.extend(content['results'])
    else:
        break

  0%|          | 0/1276 [00:00<?, ?it/s]

In [48]:
category_parsed_results = [parse_results(result) for result in category_results]
df_cellphones = pd.DataFrame(category_parsed_results)
df_brand_models = df_cellphones[['brand', 'model']].drop_duplicates()
models_to_seach = df_brand_models.apply(lambda row: ' '.join(row),axis=1).to_list()
print('Total models to search:', len(models_to_seach))

Total models to search: 1853


## Pesquisa por modelos

In [None]:
brand_model_results = []

for model in tqdm(models_to_seach):

    query = urllib.parse.quote_plus(model)
    
    url = 'https://api.mercadolibre.com/sites/MLB/search'
    params={'q': query, 'offset': 0}
    headers = {'user-agent': USER_AGENT}
    
    response = requests.get(url,params=params,headers=headers)
    content = response.content.decode(response.encoding)
    content =  json.loads(content)
    brand_model_results.extend(content['results'])
    
    total_items = content['paging']['total']
    next_offsets = list(range(50,total_items+1,50))

    # Max offset for public API is 1000
    for offset in tqdm(next_offsets[:20],leave=False):
        params={'q': query, 'offset': offset}
        response = requests.get(url,params=params,headers=headers) 
        
        if response.status_code == 200:
            content = response.content.decode(response.encoding)
            content =  json.loads(content)
            brand_model_results.extend(content['results'])
        else:
            break


  0%|          | 0/1853 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
with open('brand_model_results.json', 'w') as f:
    json.dump(brand_model_results,f,indent=2)

In [None]:
brand_model_parsed_results = [parse_results(result,True) for result in tqdm(brand_model_results)]
df_brand_model = pd.DataFrame(brand_model_parsed_results)
df_brand_model.shape

## Consolidação e análise

In [22]:
df_cellphones = pd.DataFrame(parsed_results)
df_cellphones.to_parquet('cellphone.parquet')

columns_to_keep = ['id', 'permalink', 'title','seller_nickname', 'brand', 'model', 'gtin', 'cellphones_anatel_homologation_number']
df_analise = df_cellphones[columns_to_keep]

columns_to_keep = ['id', 'permalink', 'title', 'seller_nickname', 'brand', 'model', 'ean_anuncio', 'sch_anuncio']
df_analise.columns = columns_to_keep

df_analise['ean_anuncio'] = df_analise['ean_anuncio'].str.split(',')
df_analise = df_analise.explode('ean_anuncio')

# df_analise['ean_presente'] = df_analise['ean_mp'].apply(lambda x:  0 if x is None else 1)
df_analise['sch_presente'] = df_analise['sch_anuncio'].apply(lambda x:  0 if x is None else 1)

df_analise['sch_anuncio'] = df_analise['sch_anuncio'].fillna('0').str.zfill(12)
df_analise['ean_anuncio'] = df_analise['ean_anuncio'].fillna('0').str.zfill(13)

df_analise = df_analise.merge(df_ean_sch,left_on='ean_anuncio', right_on='ean_sch', how='left')
df_analise['ean_ok'] = df_analise['ean_anuncio']==df_analise['ean_sch']
df_analise['sch_ok'] = df_analise['sch_anuncio']==df_analise['sch_sch']
df_analise['ean_sch_ok'] = df_analise[['ean_ok', 'sch_ok']].apply(lambda row: all(row),axis=1)
columns_to_keep = ['id', 'permalink', 'title', 'seller_nickname', 'brand', 'model', 'ean_anuncio', 'sch_anuncio', 'sch_presente', 'ean_sch', 'sch_sch', 'ean_sch_ok']
df_analise = df_analise[columns_to_keep]


df_analise = df_analise.merge(df_sch_models, left_on='sch_anuncio', right_on='Número de Homologação',how='left')
df_analise['sch_valido'] = df_analise['Número de Homologação'].apply(lambda x:  0 if pd.isna(x) else 1)
columns_to_keep = ['id', 'permalink', 'title', 'seller_nickname', 'brand', 'model', 'ean_anuncio', 'sch_anuncio', 'sch_presente', 'ean_sch', 'sch_sch', 'ean_sch_ok', 'sch_valido', 'Nome do Fabricante', 'Modelo Completo']
df_analise = df_analise[columns_to_keep]

df_analise

Unnamed: 0,id,permalink,title,seller_nickname,brand,model,ean_anuncio,sch_anuncio,sch_presente,ean_sch,sch_sch,ean_sch_ok,sch_valido,Nome do Fabricante,Modelo Completo
0,MLB4587064722,https://www.mercadolivre.com.br/motorola-moto-...,Motorola Moto G24 Power Dual Sim 128gb Azul 4g...,MOTOROLA OFICIAL,Motorola,G24 Power Dual SIM,7892597354290,999999999999,1,7892597354290,029772400330,False,0,,
1,MLB3589269609,https://www.mercadolivre.com.br/samsung-galaxy...,Samsung Galaxy A15 4g Dual Sim 128 Gb Azul Esc...,MERCADOLIVRE ELETRONICOS,Samsung,A15 4G,7892509134262,999999999999,1,7892509134262,198252300953,False,0,,
2,MLB3609791441,https://www.mercadolivre.com.br/samsung-galaxy...,Samsung Galaxy A15 5g Dual Sim 128gb Azul-escu...,MERCADOLIVRE ELETRONICOS,Samsung,A15 5G Dual Sim,7892509134705,999999999999,1,7892509134705,198242300953,False,0,,
3,MLB3683512465,https://www.mercadolivre.com.br/motorola-moto-...,Motorola Moto G04s 128gb Cinza 4gb Ram,MERCADOLIVRE ELETRONICOS,Motorola,G04S,7892597354160,999999999999,1,7892597354160,032192400330,False,0,,
4,MLB4481510718,https://www.mercadolivre.com.br/smartphone-mot...,Smartphone Motorola Moto G04 128gb 8gb Ram Boo...,MOTOROLA OFICIAL,Motorola,Moto G04,7892597353699,999999999999,1,7892597353699,205312300330,False,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3914,MLB4716382930,https://produto.mercadolivre.com.br/MLB-471638...,Smartphone Samsung Galaxy A02 32gb 2gb Ram Gar...,BPSTORE,Samsung,A02 Dual SIM,0000000000000,999999999999,1,,,False,0,,
3915,MLB3755770999,https://produto.mercadolivre.com.br/MLB-375577...,Xiaomi 13 Dual Sim 256 Gb White 12 Gb Ram Global,4RUN BRASIL,Xiaomi,13,0000000000000,999999999999,1,,,False,0,,
3916,MLB4836132910,https://produto.mercadolivre.com.br/MLB-483613...,Celular Nokia 110 4g Dual Chip Radio Fm Blueto...,MAGAZINECRISTINA,Nokia,110 4G,0000000000000,999999999999,1,,,False,0,,
3917,MLB4665752520,https://produto.mercadolivre.com.br/MLB-466575...,Smartphone Redimi 13c 256/8gb Lacrado +nf E Fo...,CELLTECH STORE,Redmi,13C,0000000000000,999999999999,1,,,False,0,,


In [None]:
df_analise[df_analise['ean_anuncio']=='0000000000000'].shape

In [None]:
columns_to_group = ['id', 'sch_presente', 'sch_valido', 'ean_sch_ok']

agg_func = {'id': 'count', 'sch_presente': 'sum', 'sch_valido': 'sum', 'ean_sch_ok': 'sum'}
df_resumo = pd.DataFrame(df_analise[columns_to_group].agg(agg_func)).T

df_resumo['pe_sch_presente'] = df_resumo['sch_presente']/df_resumo['id']*100
df_resumo['pe_sch_valido'] = df_resumo['sch_valido']/df_resumo['id']*100
df_resumo['pe_ean_sch_ok'] = df_resumo['ean_sch_ok']/df_resumo['id']*100

columns_to_keep = ['Total de anúncios', 'Anúncios com código SCH Informado', 'Anúncios com código SCH Válido', 'Anúncios com par EAN/SCH Válido',
                   '% Anúncios com código SCH Informado', '% Anúncios com código SCH Válido', '% Anúncios com par EAN/SCH Válido']

df_resumo.columns = columns_to_keep
df_resumo.index.name = 'Mercado Livre'

df_resumo

In [None]:
columns_to_group = ['seller_nickname', 'id', 'sch_presente', 'sch_valido', 'ean_sch_ok']
agg_func = {'id': 'count', 'sch_presente': 'sum', 'sch_valido': 'sum', 'ean_sch_ok': 'sum'}
df_seller = df_analise[columns_to_group].groupby(columns_to_group[0]).agg(agg_func)

df_seller['pe_sch_presente'] = df_seller['sch_presente']/df_seller['id']*100
df_seller['pe_sch_valido'] = df_seller['sch_valido']/df_seller['id']*100
df_seller['pe_ean_sch_ok'] = df_seller['ean_sch_ok']/df_seller['id']*100

columns_to_keep = ['Total de anúncios', 'Anúncios com EAN Informado', 'Anúncios com código SCH Informado', 'Anúncios com código SCH Válido', 
                   '% Anúncios com EAN Informado', '% Anúncios com código SCH Informado', '% Anúncios com código SCH Válido']

df_seller.columns = columns_to_keep
df_seller.index.name = 'Vendedor'

df_seller.sort_values(by='Total de anúncios',ascending=False).head(10)

In [None]:
columns_to_group = ['brand', 'id', 'sch_presente', 'sch_valido', 'ean_sch_ok']
agg_func = {'id': 'count', 'sch_presente': 'sum', 'sch_valido': 'sum', 'ean_sch_ok': 'sum'}
df_brand = df_analise[columns_to_group].groupby(columns_to_group[0]).agg(agg_func)

df_brand['pe_sch_presente'] = df_brand['sch_presente']/df_brand['id']*100
df_brand['pe_sch_valido'] = df_brand['sch_valido']/df_brand['id']*100
df_brand['pe_ean_sch_ok'] = df_brand['ean_sch_ok']/df_brand['id']*100

columns_to_keep = ['Total de anúncios', 'Anúncios com EAN Informado', 'Anúncios com código SCH Informado', 'Anúncios com código SCH Válido', 
                   '% Anúncios com EAN Informado', '% Anúncios com código SCH Informado', '% Anúncios com código SCH Válido']

df_brand.columns = columns_to_keep
df_brand.index.name = 'Fabricante'

df_brand.sort_values(by='Total de anúncios',ascending=False).head(10)