In [None]:
import configparser
import logging
import re
import string
from collections import Counter

import pandas as pd
import requests
from tqdm.auto import tqdm

In [None]:
# Load the EAN spreadsheet; later cells read its 'EAN', 'Descrição' and
# 'Código. Referencia' columns to build the search queries.
ean_file = 'datasets/ean_xiaomi.xlsx'
df_ean = pd.read_excel(io=ean_file)
df_ean

In [None]:
def clean_models(models: list) -> set:
    """Normalize product descriptions into a set of bare model names.

    Each description is lower-cased, stripped of storage sizes (e.g. ``64gb``
    or ``4gb64gb``), parenthesized codes, comma-decimal numbers (e.g. ``6,53``)
    and stray punctuation, and then has its stop words (colours, connectivity
    and accessory terms) removed.

    Args:
        models: Iterable of product descriptions (a list or a pandas Series).
            Non-string entries such as NaN are skipped.

    Returns:
        The set of distinct cleaned model names.
    """
    # A set gives constant-time membership checks and removes the duplicated
    # entry the original list carried.
    stop_words = {'+', '120hz', '2', '4g', '5g', 'amarelo', 'azul',
                  'basic', 'bluetooth', 'br', 'branco', 'cinza', 'de',
                  'earbuds', 'earphone', 'earphones', 'escuro', 'fone',
                  'gradiente', 'inteligente', 'laranja', 'mi', 'prata',
                  'preto', 'pulseira', 'rosa', 'roxo', 'tela', 'true',
                  'verde', 'wireless'}

    # Raw string so the regex escapes reach the regex engine untouched instead
    # of being treated as (invalid) Python string escapes.
    pattern = re.compile(r'(([\d]+gb)?[\d]+gb)|(\([\d\w-]+\))|(\d+,\d+)|([+"\/\',’])')

    cleaned = set()
    for model in models:
        # Missing descriptions come through as NaN (a float); nothing to clean.
        if not isinstance(model, str):
            continue
        stripped = pattern.sub('', model.lower())
        kept_tokens = (token for token in stripped.split() if token not in stop_words)
        cleaned.add(' '.join(kept_tokens))
    return cleaned

In [None]:
# Bing credentials are read from an INI file instead of being hardcoded here.
creds_file = 'creds.ini'
creds = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing.
if not creds.read(creds_file):
    raise FileNotFoundError('Credentials file not found or unreadable: {!r}'.format(creds_file))
bing_search_api_key = creds['BING_SEARCH']['bing_search_api_key']
bing_search_endpoint = creds['BING_SEARCH']['bing_search_endpoint']

In [None]:
def bing_search(search_term, timeout=10):
    """Query the Bing Web Search endpoint and return the web-page hits.

    Args:
        search_term: Value sent as the ``q`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``originalQuery``, ``name``, ``url`` and
        ``snippet``, one per web-page hit. When the response carries no web
        pages, a single placeholder dict whose ``name``/``url``/``snippet`` are
        ``None`` is returned so the query is still recorded. ``None`` is
        returned (and a warning is logged) when the request fails or the
        response does not have the expected structure.
    """
    headers = {'Ocp-Apim-Subscription-Key': bing_search_api_key}
    params = {'q': search_term,
              # A 2-character country code of the country where the results come from.
              'cc': 'BR',
              # The number of search results to return in the response.
              # The default is 10 and the maximum value is 50.
              # The actual number delivered may be less than requested.
              'count': 50,
              # The market where the results come from.
              'mkt': 'pt-BR',
              # A comma-delimited list of answers to include in the response.
              'responseFilter': 'Webpages',
              }
    try:
        # The request itself is inside the try so connection errors and
        # timeouts are handled like HTTP errors instead of aborting the caller.
        response = requests.get(bing_search_endpoint, headers=headers,
                                params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()

        original_query = payload['queryContext']['originalQuery']

        if payload['rankingResponse'] and 'webPages' in payload:
            return [{'originalQuery': original_query,
                     'name': hit.get('name'),
                     'url': hit.get('url'),
                     'snippet': hit.get('snippet')}
                    for hit in payload['webPages']['value']]

        # No web pages for this query: keep a placeholder row for traceability.
        return [{'originalQuery': original_query,
                 'name': None,
                 'url': None,
                 'snippet': None}]
    except (requests.RequestException, ValueError, KeyError, TypeError) as ex:
        # Best-effort search: report the failure and let the caller move on.
        logging.getLogger(__name__).warning('Bing search failed for %r: %s', search_term, ex)
        return None
        

In [None]:
# Start a fresh result list and query Bing once per distinct EAN.
# Searches that return nothing usable (None or an empty list) are skipped.
search_results = []
for ean in tqdm(df_ean['EAN'].unique()):
    result = bing_search(ean)
    if not result:
        continue
    search_results += result

In [None]:
# Inspect the raw hits accumulated in search_results so far.
search_results

In [None]:
# Second pass: search by the normalized model names derived from the product
# descriptions, appending the hits to the existing search_results list.
cleaned_models = clean_models(df_ean['Descrição'])
for model in tqdm(cleaned_models):
    result = bing_search(model)
    if not result:
        continue
    search_results += result

In [None]:
# Third pass: search by reference code. Missing codes are dropped and each
# distinct code is queried only once, so repeated rows in the spreadsheet do
# not trigger redundant API calls or duplicate result rows.
for codigo_referencia in tqdm(df_ean['Código. Referencia'].dropna().unique()):
    result = bing_search(codigo_referencia)
    if result:
        search_results.extend(result)

In [None]:
# Raw strings keep the regex escapes intact, and compiling once avoids
# re-parsing the pattern for every row.
_SITE_PATTERN = re.compile(r'\w+(?=\.com)')             # word right before ".com"
_COUNTRY_PATTERN = re.compile(r'(?<=\.)\w{2}(?=\/)')    # 2-char suffix between "." and "/"


def _first_match(url, pattern):
    """Return the first match of ``pattern`` in ``url``; None if url is missing or nothing matches."""
    if pd.isna(url):
        return None
    match = pattern.search(url)
    return match.group() if match else None


# Naming the columns keeps the frame well-formed even when no search returned
# anything, so the 'url' lookups below do not raise KeyError on an empty list.
df_search_results = pd.DataFrame(search_results,
                                 columns=['originalQuery', 'name', 'url', 'snippet'])
df_search_results['site'] = df_search_results['url'].apply(_first_match, args=(_SITE_PATTERN,))
df_search_results['country'] = df_search_results['url'].apply(_first_match, args=(_COUNTRY_PATTERN,))
df_search_results.to_csv('datasets/bing_search_results.csv',index=False,sep='|')
# pandas infers zip compression from the ".zip" extension of the target path.
df_search_results.to_csv('datasets/bing_search_results.csv.zip',index=False,sep='|')
df_search_results

In [None]:
# Keep only the first row seen for each url. Rows whose url is missing (the
# placeholder rows of searches without web pages) share the same missing key
# and are therefore collapsed as well.
df_search_results = df_search_results.drop_duplicates(subset=['url'], keep='first')
df_search_results

In [None]:
# Display the current contents of df_search_results.
df_search_results