# Preparação do ambiente

## Bibliotecas

In [1]:
import configparser
import hashlib
import json
import os
import pandas as pd
import requests
import time
import urllib.parse
import uuid

from datetime import datetime
from pathlib import Path
from tqdm.auto import tqdm

In [2]:
# Turn off pandas' chained-assignment check for this notebook (pandas' default is 'warn').
pd.set_option('mode.chained_assignment', None)

## Constantes e funções auxiliares

In [15]:
# Root folder of the dataset used by this notebook.
DATASETS_ROOT_PATH = Path('../datasets/mercadolivre/20240710')

# Raw search-result pages and per-item detail files live in sibling subfolders.
search_results_folder = DATASETS_ROOT_PATH.joinpath('search_results')
item_details_folder = DATASETS_ROOT_PATH.joinpath('item_details')

In [4]:
# Browser-like User-Agent string (not referenced by the visible cells of this notebook).
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/126.0.0.0 Safari/537.36'
)

In [5]:
def get_new_access_token(actual_refresh_token, client_id, client_secret):
    """Exchange a refresh token for a new access/refresh token pair.

    Sends a ``refresh_token`` grant to the Mercado Libre OAuth endpoint.

    Args:
        actual_refresh_token: Refresh token currently held by the caller.
        client_id: Application client id.
        client_secret: Application client secret.

    Returns:
        A ``(access_token, refresh_token)`` tuple read from the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        KeyError: If the response body lacks one of the two tokens.
    """
    url = 'https://api.mercadolibre.com/oauth/token'
    payload = {
        'grant_type': 'refresh_token',
        'client_id': client_id,
        'client_secret': client_secret,
        'refresh_token': actual_refresh_token,
    }
    headers = {
        'accept': 'application/json',
        'content-type': 'application/x-www-form-urlencoded',
    }

    response = requests.post(url, headers=headers, data=payload)
    # Surface the HTTP failure itself instead of a later KeyError on the
    # missing 'access_token' key of an error payload.
    response.raise_for_status()
    # response.json() picks the encoding itself; decoding manually with
    # response.encoding fails when requests reports no encoding (None).
    content = response.json()

    return content['access_token'], content['refresh_token']

In [6]:
def save_content_to_file(content, search_results_folder='./search_results'):
    """Write ``content`` as indented JSON to a new, randomly named file.

    The file is called ``<uuid4>.json`` and is placed in
    ``search_results_folder`` (a ``str`` or ``Path``), which is created
    together with its parents when it does not exist yet.
    """
    if isinstance(search_results_folder, str):
        target_dir = Path(search_results_folder)
    else:
        target_dir = search_results_folder

    if not target_dir.exists():
        target_dir.mkdir(parents=True)

    # A random UUID gives every saved result page its own file.
    target_file = target_dir / f'{uuid.uuid4()}.json'

    with open(target_file, 'w') as handle:
        json.dump(content, handle, indent=2)

In [7]:
def save_item_details_to_file(content, item_details_folder='./search_results/items'):
    """Write one item's details as indented JSON to ``<item id>.json``.

    ``item_details_folder`` may be a ``str`` or a ``Path``; it is created
    together with its parents when missing. The file name comes from
    ``content['id']``, so saving the same item again overwrites its file.
    """
    if isinstance(item_details_folder, str):
        target_dir = Path(item_details_folder)
    else:
        target_dir = item_details_folder

    if not target_dir.exists():
        target_dir.mkdir(parents=True)

    target_file = target_dir / f"{content['id']}.json"

    with open(target_file, 'w') as handle:
        json.dump(content, handle, indent=2)

In [8]:
def search_cellphones(headers=None, params=None, search_results_folder='./search_results'):
    """Page through the cellphones category search and save each result page.

    The first page is requested with ``params`` as given; the following pages
    add an ``offset`` in steps of 50, up to the smaller of the reported total
    and 4000. Every page with at least one result is stored through
    ``save_content_to_file``.

    Args:
        headers: Optional HTTP headers (e.g. the Authorization header).
        params: Optional extra query parameters. The mapping is copied, so
            the caller's dict is never modified.
        search_results_folder: Folder where result pages are written.

    Returns:
        Always ``None``. The function returns early, saving nothing, when the
        first request does not answer with status 200.
    """
    # endpoint for search for cellphones category
    url = 'https://api.mercadolibre.com/sites/MLB/search?category=MLB1055'
    page_size = 50
    max_reachable_offset = 4000

    # Private copies: the original mutable defaults were shared between calls
    # and the paging loop wrote 'offset' into the caller's dict, so a later
    # call could start from a stale offset instead of the first page.
    headers = dict(headers) if headers else {}
    params = dict(params) if params else {}

    # first page of results
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        return None

    # response.json() picks the encoding itself; decoding manually with
    # response.encoding fails when requests reports no encoding (None).
    content = response.json()
    total_results = content['paging']['total']
    if content['results']:
        save_content_to_file(content, search_results_folder)

    # calc next offsets
    max_offset = min(total_results, max_reachable_offset)

    # iterate through the next results pages
    for actual_offset in tqdm(range(page_size, max_offset, page_size), leave=False):
        page_params = {**params, 'offset': actual_offset}
        response = requests.get(url, headers=headers, params=page_params)

        # Best effort: a failed page is skipped and the crawl continues.
        if response.status_code != 200:
            continue

        content = response.json()
        if content['results']:
            save_content_to_file(content, search_results_folder)

In [9]:
def search_item_details(item_id, item_details_folder='./search_results/items', headers=None):
    """Download the details of one item and save them as ``<item id>.json``.

    Args:
        item_id: Identifier of the listing to fetch.
        item_details_folder: Folder where the details file is written.
        headers: Optional HTTP headers for the request (e.g. Authorization).
            Defaults to ``None``, which sends no extra headers as before.

    Returns:
        ``None``. Nothing is saved when the response status is not 200.
    """
    url = f'https://api.mercadolibre.com/items/{item_id}'
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return

    # response.json() picks the encoding itself; decoding manually with
    # response.encoding fails when requests reports no encoding (None).
    content = response.json()
    save_item_details_to_file(content, item_details_folder=item_details_folder)

In [10]:
def parse_search_results_file(search_result_file):
    """Flatten one saved search-results page into a list of row dicts.

    Each entry of the file's ``results`` list becomes one dict holding a fixed
    set of top-level fields plus ``seller_id``, ``seller_nickname``, ``brand``
    and ``model``. A ``None`` ``official_store_id`` and a missing seller are
    both encoded as ``-1``; brand and model come from the ``BRAND`` and
    ``MODEL`` attributes and stay ``None`` when absent.
    """
    copied_fields = (
        'id',
        'title',
        'condition',
        'catalog_product_id',
        'permalink',
        'category_id',
        'price',
        'available_quantity',
        'order_backend',
        'official_store_id',
    )

    with open(search_result_file) as handle:
        listings = json.load(handle)['results']

    rows = []
    for listing in listings:
        row = {field: listing[field] for field in copied_fields}

        if row['official_store_id'] is None:
            row['official_store_id'] = -1

        if 'seller' in listing:
            seller = listing['seller']
            row['seller_id'] = seller['id']
            row['seller_nickname'] = seller['nickname']
        else:
            row['seller_id'] = -1
            row['seller_nickname'] = None

        row['brand'] = None
        row['model'] = None

        # When an attribute id repeats, the last occurrence wins.
        for attribute in listing['attributes']:
            attribute_id = attribute['id']
            if attribute_id == 'BRAND':
                row['brand'] = attribute['value_name']
            elif attribute_id == 'MODEL':
                row['model'] = attribute['value_name']

        rows.append(row)

    return rows

In [27]:
def parse_item_details_file(item_details_file):
    """Parse one saved item-details JSON file into a flat dict.

    The result holds the selected top-level keys, the selected attributes
    (under their lower-cased ids) and ``warranty_type``. Anything missing in
    the file is reported as ``None``. Both homologation-number columns are
    left-padded with zeros to 12 characters.

    Args:
        item_details_file: Path of the JSON file to read.

    Returns:
        The parsed dict, or ``None`` when the item is under review or the
        file has no ``attributes`` entry.
    """
    keys_to_keep = [
        'id',
        'title',
        'seller_id',
        'category_id',
        'official_store_id',
        'price',
        'currency_id',
        'initial_quantity',
        'condition',
        'permalink',
        'warranty',
        'catalog_product_id',
        'date_created',
        'last_updated',
        'status',
    ]

    attributes_to_keep = [
        'ITEM_CONDITION',
        'BRAND',
        'MODEL',
        'DETAILED_MODEL',
        'ANATEL_HOMOLOGATION_NUMBER',
        'CELLPHONES_ANATEL_HOMOLOGATION_NUMBER',
        'GTIN',
        'EMPTY_GTIN_REASON',
    ]

    columns_to_keep = [col.lower() for col in keys_to_keep + attributes_to_keep]

    with open(item_details_file) as file:
        item_details = json.load(file)

    # NOTE(review): the API documentation appears to spell this status with an
    # underscore ('under_review'); the original literal is accepted as well.
    # Confirm against real payloads.
    if item_details.get('status') in ('under review', 'under_review'):
        return None

    if 'attributes' not in item_details:
        return None

    parsed_item = {key: None for key in columns_to_keep}
    for key in keys_to_keep:
        # Missing keys stay None (replaces the former bare ``except``).
        parsed_item[key] = item_details.get(key)

    for item_attribute in item_details['attributes']:
        if item_attribute['id'] in attributes_to_keep:
            parsed_item[item_attribute['id'].lower()] = item_attribute['value_name']

    parsed_item['warranty_type'] = None
    for sale_term in item_details.get('sale_terms', []):
        if sale_term['id'] == 'WARRANTY_TYPE':
            parsed_item['warranty_type'] = sale_term['value_name']

    # Pad to 12 characters so numbers that lost leading zeros line up again.
    for column in ('anatel_homologation_number', 'cellphones_anatel_homologation_number'):
        if parsed_item[column] is not None:
            parsed_item[column] = parsed_item[column].zfill(12)

    return parsed_item

In [12]:
def parse_item_details_folder(item_details_folder):
    """Parse every ``*.json`` file of a folder into one DataFrame.

    ``item_details_folder`` may be a ``str`` or a ``Path``. Files for which
    ``parse_item_details_file`` returns ``None`` are left out.
    """
    if isinstance(item_details_folder, str):
        folder = Path(item_details_folder)
    else:
        folder = item_details_folder

    json_files = list(folder.glob('*.json'))

    rows = []
    for json_file in tqdm(json_files):
        parsed = parse_item_details_file(json_file)
        if parsed is not None:
            rows.append(parsed)

    return pd.DataFrame(rows)

# Carga e preparação dos dados

## Autenticação

https://developers.mercadolivre.com.br/pt_br/autenticacao-e-autorizacao 

In [13]:
# Credentials are kept in creds.ini in the user's home folder. USERPROFILE only
# exists on Windows, so fall back to Path.home() on other systems.
creds_file = Path(os.environ.get('USERPROFILE') or Path.home(), 'creds.ini')
creds = configparser.ConfigParser()
# ConfigParser.read() silently skips missing files; stop here with a clear
# message instead of a KeyError on the section lookup below.
if not creds.read(creds_file):
    raise FileNotFoundError(f'Credentials file not found: {creds_file}')

client_id = creds['MERCADO_LIVRE']['client_id']
client_secret = creds['MERCADO_LIVRE']['client_secret']
actual_refresh_token = creds['MERCADO_LIVRE']['actual_refresh_token']

# The stored access token is not read: a fresh pair is always requested.
actual_access_token, actual_refresh_token = get_new_access_token(actual_refresh_token,client_id,client_secret)
creds['MERCADO_LIVRE']['actual_access_token'] = actual_access_token
creds['MERCADO_LIVRE']['actual_refresh_token'] = actual_refresh_token

# Write the new tokens back so the next run starts from the latest refresh token.
with open(creds_file, 'w') as file:
    creds.write(file)

headers = {
  'Authorization': f'Bearer {actual_access_token}'
}

# Pesquisa

## Pesquisa categoria *"Celulares e Smartphones"*

Pesquisa todos os produtos da categoria *"Celulares e Smartphones"*. A API pública do Mercado Livre limita a busca a 4000 itens. A partir dos resultados dessa pesquisa serão obtidos os modelos dos celulares encontrados (Marca+Modelo) para realizar novas buscas, uma para cada modelo.

A busca por modelo também é limitada, permite buscar apenas 1000 itens, mas, com várias buscas, poderão ser encontrados mais itens do que os 4000 iniciais da busca por categoria.

In [None]:
# Crawl the whole cellphones category without any extra filter.
search_cellphones(
    headers=headers,
    search_results_folder=search_results_folder,
)

In [None]:
# Crawl the cellphones category again, restricted to official-store listings.
params = dict(official_store='all')
search_cellphones(
    headers=headers,
    params=params,
    search_results_folder=search_results_folder,
)

Lê os arquivos resultantes da pesquisa por categoria e consolida em um único dataframe para obter uma relação de fabricantes e modelos para ampliar a pesquisa

In [None]:
# Collect every saved search page and flatten the parsed listings into one list.
search_results_files = list(search_results_folder.glob('*.json'))
category_search_results = []
for file in tqdm(search_results_files):
    category_search_results += parse_search_results_file(file)

# Keep only the last row for each listing id.
df_cellphones = pd.DataFrame(category_search_results).drop_duplicates(subset='id', keep='last')
df_cellphones

Pesquisa por fabricante e modelo. Somente produtos novos.

In [None]:
# search for keyword (brand)
# Listings without a BRAND attribute are parsed with brand=None. Drop them:
# requests omits None-valued query parameters, so such a search would just
# repeat the unfiltered category crawl.
brands = df_cellphones['brand'].dropna().unique()
for brand in tqdm(brands):
    params = {'q': brand, 'condition': 'new'}
    search_cellphones(headers=headers,params=params,search_results_folder=search_results_folder)

In [None]:
# search for keyword (model)
# Listings without a MODEL attribute are parsed with model=None. Drop them:
# requests omits None-valued query parameters, so such a search would just
# repeat the unfiltered category crawl.
models = df_cellphones['model'].dropna().unique()
for model in tqdm(models):
    params = {'q': model, 'condition': 'new'}
    search_cellphones(headers=headers,params=params,search_results_folder=search_results_folder)

## Pesquisa detalhes dos itens

Consolida os resultados das pesquisas anteriores e, na sequência, seleciona os identificadores dos anúncios para pesquisar os detalhes de cada um.

In [16]:
# Collect every saved search page and flatten the parsed listings into one list.
search_results_files = list(search_results_folder.glob('*.json'))
category_search_results = []
for file in tqdm(search_results_files):
    category_search_results += parse_search_results_file(file)

# Keep only the last row for each listing id.
df_cellphones = pd.DataFrame(category_search_results).drop_duplicates(subset='id', keep='last')
df_cellphones

  0%|          | 0/14698 [00:00<?, ?it/s]

Unnamed: 0,id,title,condition,catalog_product_id,permalink,category_id,price,available_quantity,order_backend,official_store_id,seller_id,seller_nickname,brand,model
672,MLB3617524633,Moto G3 - Lote Leia,used,MLB22753154,https://produto.mercadolivre.com.br/MLB-361752...,MLB1055,90.00,1,3,-1,357462626,NEWFRONTVARIEDADES,Motorola,Moto G3 4G
673,MLB4784421852,Moto E7 Tela 6.5'' 32gb 2gb Ram Motorola Cor C...,new,MLB16522381,https://produto.mercadolivre.com.br/MLB-478442...,MLB1055,488.00,1,4,-1,581283787,GPCELLTEC,Motorola,E7 Dual SIM
675,MLB3888107716,Celular Motorola Moto G Xt1039 8gb,used,MLB6005481,https://produto.mercadolivre.com.br/MLB-388810...,MLB1055,199.99,1,6,-1,246461445,LOJAATHUS,Motorola,G (1st Gen.)
676,MLB4623370702,Moto G 5g Dual Sim 128 Gb Prata-prisma 6 Gb Ram,used,MLB16232473,https://produto.mercadolivre.com.br/MLB-462337...,MLB1055,600.00,1,7,-1,1085096675,TORRIROBERTO20220306171008,Motorola,G 5G Dual SIM
678,MLB4218109494,"Celular Dual Chip Moto G8 Plus, Completo, Est...",used,MLB15273215,https://produto.mercadolivre.com.br/MLB-421810...,MLB1055,599.00,1,9,-1,90809615,J.Z.BERGER,Motorola,G8 Plus Dual SIM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607195,MLB3397806755,Vovô&vovófone 4g 32gb/1gb Ram Tela 5 Icones Gr...,new,MLB18562394,https://produto.mercadolivre.com.br/MLB-339780...,MLB1055,513.48,1,46,-1,170439607,GIGA_CELL,Positivo,S509
607196,MLB2604415443,Smartphone Para Idosos Vovôfone 16gb Redes Soc...,new,MLB19126228,https://produto.mercadolivre.com.br/MLB-260441...,MLB1055,389.00,1,47,-1,170439607,GIGA_CELL,Positivo,4 Mini S432
607197,MLB3264493750,Smartphone Positivo Swist 5 64gb Tela 6.26 Dua...,new,MLB22332640,https://produto.mercadolivre.com.br/MLB-326449...,MLB1055,449.00,1,48,-1,1160546124,MAGAZINEMETA,Positivo,TWIST 5 S620
607198,MLB3317050524,Smartphone Positivo Twist 4 Mini S432 16gb Preto,new,MLB19126228,https://produto.mercadolivre.com.br/MLB-331705...,MLB1055,339.00,1,49,-1,1160546124,MAGAZINEMETA,Positivo,S432


In [17]:
# Ids that already have a details file (the file stem is used as the id).
item_details = [path.stem for path in item_details_folder.glob('*.json')]

df_available_items = pd.DataFrame(item_details, columns=['id'])
df_available_items['details_avilable'] = 1

# Left join on the shared 'id' column: rows that keep a NaN flag have no details file yet.
df_items_to_search = df_cellphones[['id', 'catalog_product_id']].merge(df_available_items, how='left')
df_items_to_search = df_items_to_search.loc[df_items_to_search['details_avilable'].isna()]

items_to_search = df_items_to_search['id'].values

In [None]:
# Download the details of every listing that does not have a details file yet.
for item_id in tqdm(items_to_search):
    search_item_details(item_id, item_details_folder=item_details_folder)

## Consolidação e análise

In [33]:
%%time
# Parse every downloaded item-details file into a single DataFrame.
df_cellphones = parse_item_details_folder(item_details_folder)

# Disabled post-processing (date parsing, year columns), kept for reference:
# df_cellphones['date_created'] = pd.to_datetime(df_cellphones['date_created'])
# df_cellphones['last_updated'] = pd.to_datetime(df_cellphones['last_updated'])
# df_cellphones['year_created'] = df_cellphones['date_created'].dt.year
# df_cellphones['year_updated'] = df_cellphones['last_updated'].dt.year

# Disabled flag marking rows that carry a GTIN:
# df_cellphones['has_gtin'] = ~df_cellphones.gtin.isna()

# Save the consolidated table as parquet under the dataset root, then display it.
cellphones_file = DATASETS_ROOT_PATH / 'cellphones.parquet'
df_cellphones.to_parquet(cellphones_file)
df_cellphones

  0%|          | 0/22892 [00:00<?, ?it/s]

CPU times: total: 1.88 s
Wall time: 37.7 s


Unnamed: 0,id,title,seller_id,category_id,official_store_id,price,currency_id,initial_quantity,condition,permalink,...,status,item_condition,brand,model,detailed_model,anatel_homologation_number,cellphones_anatel_homologation_number,gtin,empty_gtin_reason,warranty_type
0,MLB1013554239,Nokia 208 3.5 G Caixa Novo.,65299965,MLB1055,,399.0,BRL,6,used,https://produto.mercadolivre.com.br/MLB-101355...,...,active,Usado,Nokia,208,,,031351301547,,,
1,MLB1015206425,Motorola I418 Nextel Tecnologia Iden Ptt Radio...,95803984,MLB1055,,150.0,BRL,1,used,https://produto.mercadolivre.com.br/MLB-101520...,...,active,Usado,Motorola,i418,,7892597986057,023391200502,,,Sem garantia
2,MLB1021084484,Nokia C2 01 3g Semi Novos.,65299965,MLB1055,,650.0,BRL,1,used,https://produto.mercadolivre.com.br/MLB-102108...,...,active,Usado,Nokia,C2-01,,,029441001547,,,
3,MLB1022268007,"Celular Multilaser Up Dual Chip, C/ Camera, Mp...",219342792,MLB1055,,109.9,BRL,1,new,https://produto.mercadolivre.com.br/MLB-102226...,...,active,Novo,Multilaser,Up Dual,P3293,,014921203111,7898506472918,,
4,MLB1022545382,Celular Nokia 7370 Desbloqueado,72260221,MLB1055,,499.0,BRL,1,used,https://produto.mercadolivre.com.br/MLB-102254...,...,active,Usado,Nokia,7370,,023654789456,004110602388,,,Garantia do vendedor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22751,MLB962599671,Celular Nokia 7373 Desbloqueado,72260221,MLB1055,,499.0,BRL,2,used,https://produto.mercadolivre.com.br/MLB-962599...,...,active,Usado,Nokia,7373,,,016030602388,,,Garantia do vendedor
22752,MLB964944285,Celular De Coleção Desbloqueado Nokia 3650,72260221,MLB1055,,699.0,BRL,1,used,https://produto.mercadolivre.com.br/MLB-964944...,...,active,Usado,Nokia,3650,848958030317,,003160301919,,,Garantia do vendedor
22753,MLB983811525,Celular Flip Nokia C2-05 C2 05,72260221,MLB1055,,399.0,BRL,1,used,https://produto.mercadolivre.com.br/MLB-983811...,...,active,Usado,Nokia,C2-05,,,022911101547,,,Garantia do vendedor
22754,MLB987427445,"iPhone X 256 Gigas Aproveite Super Desconto, ...",101072492,MLB1055,,4990.0,BRL,2,new,https://produto.mercadolivre.com.br/MLB-987427...,...,active,Novo,Apple,iPhone X,,,051471701993,,,


In [None]:
# df_cellphones = pd.DataFrame(parsed_results)
# df_cellphones.to_parquet('cellphone.parquet')

# columns_to_keep = ['id', 'permalink', 'title','seller_nickname', 'brand', 'model', 'gtin', 'cellphones_anatel_homologation_number']
# df_analise = df_cellphones[columns_to_keep]

# columns_to_keep = ['id', 'permalink', 'title', 'seller_nickname', 'brand', 'model', 'ean_anuncio', 'sch_anuncio']
# df_analise.columns = columns_to_keep

# df_analise['ean_anuncio'] = df_analise['ean_anuncio'].str.split(',')
# df_analise = df_analise.explode('ean_anuncio')

# # df_analise['ean_presente'] = df_analise['ean_mp'].apply(lambda x:  0 if x is None else 1)
# df_analise['sch_presente'] = df_analise['sch_anuncio'].apply(lambda x:  0 if x is None else 1)

# df_analise['sch_anuncio'] = df_analise['sch_anuncio'].fillna('0').str.zfill(12)
# df_analise['ean_anuncio'] = df_analise['ean_anuncio'].fillna('0').str.zfill(13)

# df_analise = df_analise.merge(df_ean_sch,left_on='ean_anuncio', right_on='ean_sch', how='left')
# df_analise['ean_ok'] = df_analise['ean_anuncio']==df_analise['ean_sch']
# df_analise['sch_ok'] = df_analise['sch_anuncio']==df_analise['sch_sch']
# df_analise['ean_sch_ok'] = df_analise[['ean_ok', 'sch_ok']].apply(lambda row: all(row),axis=1)
# columns_to_keep = ['id', 'permalink', 'title', 'seller_nickname', 'brand', 'model', 'ean_anuncio', 'sch_anuncio', 'sch_presente', 'ean_sch', 'sch_sch', 'ean_sch_ok']
# df_analise = df_analise[columns_to_keep]


# df_analise = df_analise.merge(df_sch_models, left_on='sch_anuncio', right_on='Número de Homologação',how='left')
# df_analise['sch_valido'] = df_analise['Número de Homologação'].apply(lambda x:  0 if pd.isna(x) else 1)
# columns_to_keep = ['id', 'permalink', 'title', 'seller_nickname', 'brand', 'model', 'ean_anuncio', 'sch_anuncio', 'sch_presente', 'ean_sch', 'sch_sch', 'ean_sch_ok', 'sch_valido', 'Nome do Fabricante', 'Modelo Completo']
# df_analise = df_analise[columns_to_keep]

# df_analise