In [202]:
import configparser
import os.path as osp
import pandas as pd
import re
import requests
import string

from collections import Counter
from tqdm.auto import tqdm

In [203]:
def clean_models(models: list) -> set:
    """Normalize product descriptions into a set of short model-name queries.

    Each description is lower-cased, stripped of memory sizes (``4gb``,
    ``64gb``), parenthesized codes, decimal-comma numbers (``6,53``) and
    stray punctuation, and then has its stop words (colors and generic
    product terms) removed.

    Args:
        models: Iterable of description strings (a list or a pandas Series
            both work). Non-string entries such as missing values are skipped.

    Returns:
        The distinct cleaned names. Descriptions that reduce to an empty
        string are left out, since an empty search query is useless.
    """
    # A set gives O(1) membership tests during token filtering.
    stop_words = {'+', '120hz', '2', '4g', '5g', 'amarelo', 'azul',
                  'basic', 'bluetooth', 'br', 'branco', 'cinza', 'de',
                  'earbuds', 'earphone', 'earphones', 'escuro', 'fone',
                  'gradiente', 'inteligente', 'laranja', 'mi', 'prata',
                  'preto', 'pulseira', 'rosa', 'roxo', 'tela', 'true',
                  'verde', 'wireless'}

    # Raw strings avoid the invalid-escape warnings of the plain-string form.
    pattern = re.compile('|'.join([
        r'(?:\d+gb)?\d+gb',   # memory sizes such as "4gb" or "128gb"
        r'\([\w-]+\)',        # parenthesized codes such as "(abc-123)"
        r'\d+,\d+',           # decimal-comma numbers such as "6,53"
        r'''[+"/',’]''',      # stray punctuation
    ]))

    cleaned = set()
    for model in models:
        if not isinstance(model, str):
            # Missing descriptions (NaN/None) carry nothing to search for.
            continue
        text = pattern.sub('', model.lower())
        name = ' '.join(token for token in text.split() if token not in stop_words)
        if name:
            cleaned.add(name)
    return cleaned

In [205]:
# Spreadsheet with the Xiaomi catalogue (reference code, description, EAN).
ean_xiaomi_file = 'datasets/ean_xiaomi.xlsx'

# Read the EAN column as text instead of letting pandas infer a numeric type.
dtype = dict(EAN='str')

df_ean_xiaomi = pd.read_excel(io=ean_xiaomi_file, dtype=dtype)
df_ean_xiaomi.head(5)

Unnamed: 0,Código. Referencia,Descrição,EAN
0,CX298VRD,"Smartphone Xiaomi Redmi 9A Tela 6,53"" 2GB/32GB...",7898567779018
1,CX298AZU,"Smartphone Xiaomi Redmi 9A Tela 6,53"" 2GB/32GB...",7898567778998
2,CX298CIN,"Smartphone Xiaomi Redmi 9A Tela 6,53"" 2GB/32GB...",7898567779001
3,CX297VRD,"Smartphone Xiaomi Redmi 9 Tela 6,53"" 4GB/64GB ...",7898567778875
4,CX297ROX,"Smartphone Xiaomi Redmi 9 Tela 6,53"" 4GB/64GB ...",7898567778868


In [212]:
# Cache of earlier search results; when it exists, the queries it already
# covers are collected so they can be skipped later.
google_search_results_file = 'datasets/google_search_results.parquet'

if not osp.isfile(google_search_results_file):
    # Nothing cached yet: every item still has to be queried.
    has_previous_searched_items = False
else:
    df_google_search_results = pd.read_parquet(google_search_results_file)
    previous_searched_items = df_google_search_results['originalQuery'].unique()
    has_previous_searched_items = True
    print('Loaded {} previous searched items'.format(len(previous_searched_items)))

Loaded 51 previous searched items


In [206]:
# Google search settings are read from a local INI file rather than being
# written into the notebook.
creds_file = 'creds.ini'
creds = configparser.ConfigParser()
creds.read(creds_file)

# Look the section up once and pull the three settings from it.
google_search_section = creds['GOOGLE_SEARCH']
google_search_api_key = google_search_section['google_search_api_key']
google_search_engine_id = google_search_section['google_search_engine_id']
google_search_endpoint = google_search_section['google_search_endpoint']

# Accumulator for the rows produced by the query loop.
results = list()

In [213]:
# Candidate queries, in order: distinct EANs, every reference code, and the
# cleaned model names derived from the descriptions.
items_to_query = [
    *df_ean_xiaomi['EAN'].unique(),
    *df_ean_xiaomi['Código. Referencia'],
    *clean_models(df_ean_xiaomi['Descrição']),
]

# Leave out whatever the cached results already cover.
if has_previous_searched_items:
    items_to_query = [
        candidate
        for candidate in items_to_query
        if candidate not in previous_searched_items
    ]

print('{} items left to query'.format(len(items_to_query)))

453 items left to query


In [225]:

# Send the next batch of pending items to the search endpoint and append one
# row per hit (or a single all-None row when a query has no hits) to `results`.
# `ean` holds whatever item is being queried: an EAN, a reference code or a
# cleaned model name.
for ean in tqdm(items_to_query[:10]):
    params = {'q': ean,
              'key': google_search_api_key,
              'cx': google_search_engine_id,
              # NOTE(review): the Custom Search JSON API documents `num`
              # (1-10) for page size; `count` does not appear to be a
              # documented parameter -- confirm before relying on it.
              'count': 50,
              'cr': 'countryBR',
              'lr': 'lang_pt'}

    try:
        # The request itself is inside the try block so that a network
        # failure skips one item instead of aborting the whole run; the
        # timeout keeps a stalled connection from hanging the cell.
        response = requests.get(google_search_endpoint, params=params, timeout=30)
        response.raise_for_status()
        search_results = response.json()
        originalQuery = search_results['queries']['request'][0]['searchTerms']
        totalResults = int(search_results['searchInformation']['totalResults'])
    except requests.exceptions.HTTPError as ex:
        # The exception text embeds the request URL, which carries the API
        # key as a query parameter; never print it verbatim.
        error_message = str(ex).replace(google_search_api_key, '<redacted>')
        if response.status_code == 429:
            print('Daily quota exceeded:')
            print(error_message)
            break
        print('Skipping {!r}: {}'.format(ean, error_message))
        continue
    except (requests.exceptions.RequestException,
            KeyError, IndexError, TypeError, ValueError) as ex:
        # Connection problems or an unexpected payload shape: report the
        # failure instead of dropping it silently, then move on.
        error_message = str(ex).replace(google_search_api_key, '<redacted>')
        print('Skipping {!r}: {}: {}'.format(ean, type(ex).__name__, error_message))
        continue

    # A response may report hits without carrying an `items` list; treat
    # that the same as "no results" rather than raising.
    items = search_results.get('items', []) if totalResults > 0 else []

    if items:
        for item in items:
            results.append({'originalQuery': originalQuery,
                            'name': item['title'],
                            'url': item['link'],
                            'snippet': item.get('snippet')})
    else:
        # Record the miss so the query is not repeated on the next run.
        results.append({'originalQuery': originalQuery,
                        'name': None,
                        'url': None,
                        'snippet': None})

  0%|          | 0/10 [00:00<?, ?it/s]

Daily quota exceeded:
429 Client Error: Too Many Requests for url: https://www.googleapis.com/customsearch/v1?q=7908426300403&key=<REDACTED>&cx=44647ca1e026c4237&count=50&cr=countryBR&lr=lang_pt


NameError: name 'ex' is not defined

In [226]:
# Merge this run's rows into the cached results and write them back to disk.
# Explicit columns keep the schema stable even when `results` is empty, so a
# later read of the file still finds the `originalQuery` column.
result_columns = ['originalQuery', 'name', 'url', 'snippet']
df_new_results = pd.DataFrame(results, columns=result_columns)

if not has_previous_searched_items:
    df_google_search_results = df_new_results
elif not df_new_results.empty:
    df_google_search_results = pd.concat(
        [df_google_search_results, df_new_results], ignore_index=True)

# De-duplicate on (query, url) rather than url alone. With url alone, every
# no-result row (url is None) collapses into a single row, and a URL found by
# a second query is dropped; either way the query disappears from
# `originalQuery` and would be sent again on the next run.
df_google_search_results = (
    df_google_search_results
    .drop_duplicates(subset=['originalQuery', 'url'])
    .reset_index(drop=True)
)
df_google_search_results.to_parquet(google_search_results_file)
df_google_search_results

Unnamed: 0,originalQuery,name,url,snippet
0,7908426307501,Celular Xiaomi Redmi 12C 3GB de RAM / 64GB / T...,https://www.claraeletro.com.br/produto/celular...,SKU 7908426307501. Marca: Xiaomi. Avalie. 1. C...
1,7908426307501,Smartphone Xiaomi Redmi 12C 64GB Azul Tela 6.7...,https://www.bemol.com.br/smartphone-xiaomi-red...,Dimensões do Produto (AxLxP) 0.6 x 10 x 1.8 cm...
2,6941812706145,Smartphone Xiaomi Mi 13 Lite 8gb ram 256gb azu...,https://www.workfastbrasil.com.br/smartphones/...,L9S. EAN ?6941812706145. Dimensões do produto ...
3,6941812706145,Promoção! Smartphone Xiaomi 13 Lite Dual SIM d...,https://www.oiplace.com.br/smartphone-xiaomi-1...,6941812706145. Cor. Azul. EAN. 6941812706145. ...
4,7908426306986,Smartphone Xiaomi Redmi Note 12 5G 128GB Verde...,https://www.bemol.com.br/smartphone-xiaomi-red...,EAN:7908426306986. Aviso:Imagens meramente ilu...
...,...,...,...,...
123,7908426305200,XIAOMI SMARTPHONE REDMI 10A 4GB RAM 128GB BLUE...,https://cosmos.bluesoft.com.br/produtos/790842...,GTIN/EAN: 7908426305200. 7908426305200 - XIAOM...
124,7908426305200,Smartphone Xiaomi Readmi 10 + Inface CX342 128...,https://www.americanas.com.br/produto/58582158...,"7908426305200. Com tela dobrada secundária, Nã..."
125,7908426305200,Smartphone Xiaomi Redmi 10A 128GB Azul Tela 6....,https://www.bemol.com.br/smartphone-xiaomi-red...,EAN:7908426305200. Aviso:Imagens meramente ilu...
126,7908426305200,Celular Xiaomi Redmi 10A 128GB Dual - CX342AZU...,https://www.aquiquetem.com.br/celular-xiaomi-r...,REF: 7908426305200. Celular Xiaomi Redmi 10A 1...
