In [None]:
import configparser
import os.path as osp
import pandas as pd
import re
import requests
import string

from collections import Counter
from tqdm.auto import tqdm

In [None]:
def clean_models(models: list) -> set:
    """Normalise product descriptions into a set of bare model names.

    Each description is lower-cased, stripped of storage sizes (``64gb``,
    ``4gb64gb``), parenthesised codes, decimal-comma numbers (``6,67``) and
    stray punctuation, and then has its stop words (colours, generic product
    terms) removed.

    Args:
        models: Iterable of description strings; a list or a pandas Series
            both work. Entries that are not strings (e.g. NaN) are skipped.

    Returns:
        Set of cleaned, de-duplicated model names. It may contain ``''`` when
        a description consists only of removed text.
    """
    # A set gives O(1) membership checks in the token filter below.
    stop_words = {'+', '120hz', '2', '4g', '5g', 'amarelo', 'azul',
                  'basic', 'bluetooth', 'br', 'branco', 'cinza', 'de',
                  'earbuds', 'earphone', 'earphones', 'escuro', 'fone',
                  'gradiente', 'inteligente', 'laranja', 'mi', 'prata',
                  'preto', 'pulseira', 'rosa', 'roxo', 'tela', 'true',
                  'verde', 'wireless'}

    # Raw string so the regex escapes are not parsed as string escapes.
    # Alternatives, in order: storage sizes | "(code)" | "6,67" | punctuation.
    pattern = re.compile(r'(([\d]+gb)?[\d]+gb)|(\([\d\w-]+\))|(\d+,\d+)|([+"\/\',’])')

    cleaned = set()
    for model in models:
        if not isinstance(model, str):
            # Missing cells arrive as NaN/None; there is nothing to clean.
            continue
        stripped = pattern.sub('', model.lower())
        tokens = [token for token in stripped.split() if token not in stop_words]
        cleaned.add(' '.join(tokens))
    return cleaned

In [None]:
# Read the 'EAN' column as text instead of letting pandas infer a numeric
# type (presumably to keep the codes intact -- confirm against the sheet).
dtype = {'EAN': 'str'}
ean_xiaomi_file = 'datasets/ean_xiaomi.xlsx'

df_ean_xiaomi = pd.read_excel(io=ean_xiaomi_file, dtype=dtype)
# Preview the first rows as the cell output.
df_ean_xiaomi.head()

In [None]:
# Results of earlier runs are cached in this parquet file; when it exists, the
# queries it already contains are collected so later cells can skip them.
google_search_results_file = 'datasets/google_search_results.parquet'

if not osp.isfile(google_search_results_file):
    # First run: nothing cached yet.
    has_previous_searched_items = False
else:
    df_google_search_results = pd.read_parquet(google_search_results_file)
    previous_searched_items = df_google_search_results['originalQuery'].unique()
    has_previous_searched_items = True
    print('Loaded {} previous searched items'.format(len(previous_searched_items)))

In [None]:
# Search API credentials are kept out of the notebook, in an INI file with a
# [GOOGLE_SEARCH] section.
creds_file = 'creds.ini'
creds = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse; fail loudly here instead of with an opaque
# KeyError: 'GOOGLE_SEARCH' on the lookups below.
if not creds.read(creds_file):
    raise FileNotFoundError('Could not read credentials file: {}'.format(creds_file))
google_search_api_key = creds['GOOGLE_SEARCH']['google_search_api_key']
google_search_engine_id = creds['GOOGLE_SEARCH']['google_search_engine_id']
google_search_endpoint = creds['GOOGLE_SEARCH']['google_search_endpoint']

# Accumulator for the rows produced by the search loop.
results = []

In [None]:
# Candidate queries come from three columns, in this order: EAN codes,
# reference codes and cleaned model names. Missing cells are dropped so NaN is
# never sent as a query, and codes are converted to text so they compare equal
# to the query strings stored by earlier runs.
ean_codes = df_ean_xiaomi['EAN'].dropna().astype(str)
reference_codes = df_ean_xiaomi['Código. Referencia'].dropna().astype(str)
# clean_models returns a set; sorting makes the query order (and therefore
# which items a sliced run picks up) reproducible across kernel restarts.
model_names = sorted(clean_models(df_ean_xiaomi['Descrição'].dropna()))

# dict.fromkeys de-duplicates while keeping first-seen order; empty strings
# (descriptions that were cleaned down to nothing) are not worth querying.
items_to_query = list(dict.fromkeys(
    item for item in [*ean_codes, *reference_codes, *model_names] if item
))

if has_previous_searched_items:
    # Set membership instead of scanning the numpy array once per item.
    already_searched = set(previous_searched_items)
    items_to_query = [item for item in items_to_query if item not in already_searched]

print('{} items left to query'.format(len(items_to_query)))

In [None]:

# Send the first 10 pending items to the search API. Every hit becomes one row
# in `results`; a query without hits becomes a single row with None fields so
# it is still recorded as searched.
for query in tqdm(items_to_query[:10]):
    params = {'q': query,
              'key': google_search_api_key,
              'cx': google_search_engine_id,
              # The Custom Search JSON API names its page-size parameter `num`
              # (1-10); `count` is not a request parameter. Assumes
              # google_search_endpoint points at that API -- confirm.
              'num': 10,
              'cr': 'countryBR',
              'lr': 'lang_pt'}

    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(google_search_endpoint, params=params, timeout=30)
        response.raise_for_status()
    except requests.HTTPError as ex:
        if ex.response is not None and ex.response.status_code == 429:
            print('Daily quota exceeded:')
            print(ex)
            break
        print('Skipping {!r}: {}'.format(query, ex))
        continue
    except requests.RequestException as ex:
        # Connection errors, timeouts, ...: report and move on to the next item.
        print('Request failed for {!r}: {}'.format(query, ex))
        continue

    try:
        search_results = response.json()
        originalQuery = search_results['queries']['request'][0]['searchTerms']
        totalResults = int(search_results['searchInformation']['totalResults'])

        if totalResults > 0:
            rows = [{'originalQuery': originalQuery,
                     'name': item['title'],
                     'url': item['link'],
                     'snippet': item.get('snippet')}
                    for item in search_results['items']]
        else:
            rows = [{'originalQuery': originalQuery,
                     'name': None,
                     'url': None,
                     'snippet': None}]
    except (KeyError, IndexError, TypeError, ValueError) as ex:
        # Malformed or unexpected payload: report it instead of dropping it
        # silently, and keep going with the remaining items.
        print('Unexpected response for {!r}: {!r}'.format(query, ex))
        continue

    # Rows are added only once the whole response parsed, so a failure halfway
    # through never leaves partial results for a query.
    results.extend(rows)

In [None]:
# Merge this run's rows into the cached results and persist them.
# Passing explicit columns keeps the schema stable even when `results` is
# empty, so the saved file always has an 'originalQuery' column to read back.
result_columns = ['originalQuery', 'name', 'url', 'snippet']
df_new_results = pd.DataFrame(results, columns=result_columns)

if has_previous_searched_items:
    frames = [df_google_search_results]
    if not df_new_results.empty:
        frames.append(df_new_results)
    df_combined = pd.concat(frames, ignore_index=True)

    # Rows with a URL stay unique per URL. Rows without one mark queries that
    # returned no hits; de-duplicating those on 'url' would collapse them all
    # into a single row and the other queries would be searched again on the
    # next run, so they are kept unique per query instead.
    has_url = df_combined['url'].notna()
    duplicate_hit = has_url & df_combined['url'].duplicated()
    duplicate_no_hit = ~has_url & df_combined['originalQuery'].where(~has_url).duplicated()
    df_google_search_results = (
        df_combined[~(duplicate_hit | duplicate_no_hit)].reset_index(drop=True)
    )
else:
    df_google_search_results = df_new_results

df_google_search_results.to_parquet(google_search_results_file)
df_google_search_results