In [310]:
import configparser
import pandas as pd
import re
import requests
import string

from collections import Counter
from tqdm.auto import tqdm

In [372]:
ean_file = 'datasets/ean_xiaomi.xlsx'
df_ean = pd.read_excel(ean_file)
df_ean

Unnamed: 0,Código. Referencia,Descrição,EAN
0,CX298VRD,"Smartphone Xiaomi Redmi 9A Tela 6,53"" 2GB/32GB...",7898567779018
1,CX298AZU,"Smartphone Xiaomi Redmi 9A Tela 6,53"" 2GB/32GB...",7898567778998
2,CX298CIN,"Smartphone Xiaomi Redmi 9A Tela 6,53"" 2GB/32GB...",7898567779001
3,CX297VRD,"Smartphone Xiaomi Redmi 9 Tela 6,53"" 4GB/64GB ...",7898567778875
4,CX297ROX,"Smartphone Xiaomi Redmi 9 Tela 6,53"" 4GB/64GB ...",7898567778868
...,...,...,...
230,CMB370CIN,Smartphone Xiaomi Redmi Note 12 4GB+128GB Cinz...,7908426308294
231,CMB370VRD,Smartphone Xiaomi Redmi Note 12 4GB+128GB Verd...,7908426308300
232,CMB347AMA,Smartphone POCO M4 5G 6GB+128GB Amarelo + Fone...,7908426308942
233,CMB347AZU,Smartphone POCO M4 5G 6GB+128GB Azul + Fone Bl...,7908426308959


In [366]:
def clean_models(models: list) -> set:
    models = [model.lower() for model in df_ean['Descrição'].unique()]
    stop_words = ['+',  '120hz', '2', '4g', '5g', 'amarelo', 'azul',
                  'basic', 'bluetooth', 'br', 'branco', 'cinza', 'de',
                  'earbuds', 'earphone', 'earphones', 'escuro', 'fone',
                  'gradiente', 'inteligente', 'laranja', 'mi', 'prata',
                  'preto', 'pulseira', 'rosa', 'roxo', 'tela', 'true',
                  'verde', 'wireless', 'wireless']

    pattern = '(([\d]+gb)?[\d]+gb)|(\([\d\w-]+\))|(\d+,\d+)|([+"\/\',’])'
    models = [re.sub(pattern,'',model) for model in models]
    models = set([' '.join([token for token in model.split() if token not in stop_words]) for model in models])
    return models

In [17]:
creds_file = 'creds.ini'
creds = configparser.ConfigParser()
creds.read(creds_file)
bing_search_api_key = creds['BING_SEARCH']['bing_search_api_key']
bing_search_endpoint = creds['BING_SEARCH']['bing_search_endpoint']

In [255]:
def bing_search(search_term):
    headers = {'Ocp-Apim-Subscription-Key': bing_search_api_key}
    params = {'q': search_term,
              # A 2-character country code of the country where the results come from.
              'cc': 'BR',
              # The number of search results to return in the response. 
              # The default is 10 and the maximum value is 50. 
              # he actual number delivered may be less than requested.
              'count': 50,
              # The market where the results come from.
              'mkt': 'pt-BR',
              # A comma-delimited list of answers to include in the response.
              'responseFilter': 'Webpages',
              }
    response = requests.get(bing_search_endpoint, headers=headers, params=params)
    response.raise_for_status()
    search_results = response.json()
    originalQuery = search_results['queryContext']['originalQuery']

    if search_results['rankingResponse'] and 'webPages' in search_results.keys():
        results = []
        for result in search_results['webPages']['value']:
            results.append({'originalQuery': originalQuery,
                            'name': result['name'],
                            'url': result['url'],
                            'snippet': result['snippet']})
        return results

    else:
        return [{'originalQuery': originalQuery,
                 'name': None,
                 'url': None,
                 'snippet': None}]
        

In [368]:
search_results = []
for ean in tqdm(df_ean['EAN'].unique()):
    result = bing_search(ean)
    if result: 
        search_results.extend(result)    

  0%|          | 0/235 [00:00<?, ?it/s]

In [369]:
cleaned_models = clean_models(df_ean['Descrição'])
for model in tqdm(cleaned_models):
    result = bing_search(model)
    if result: 
        search_results.extend(result) 

  0%|          | 0/34 [00:00<?, ?it/s]

In [376]:
for codigo_referencia in tqdm(df_ean['Código. Referencia']):
    result = bing_search(codigo_referencia)
    if result: 
        search_results.extend(result)

HTTPError: 403 Client Error: Quota Exceeded for url: https://api.bing.microsoft.com/v7.0/search?q=CX355PIN&cc=BR&count=50&mkt=pt-BR&responseFilter=Webpages

In [377]:
df_search_results = pd.DataFrame(search_results)
df_search_results['site'] = df_search_results['url'].apply(lambda url: re.search('\w+(?=\.com)',url).group() if not pd.isna(url) and re.search('\w+(?=\.com)',url) else None)
df_search_results['country'] = df_search_results['url'].apply(lambda url: re.search('(?<=\.)\w{2}(?=\/)',url).group() if not pd.isna(url) and re.search('(?<=\.)\w{2}(?=\/)',url) else None)
df_search_results.to_csv('datasets/bing_search_results.csv',index=False,sep='|')
df_search_results

Unnamed: 0,originalQuery,name,url,snippet,site,country
0,7898567779018,Smartphone Xiaomi Redmi 9A Tela 6.53 32GB 13MP...,https://www.amazon.com.br/Smartphone-Xiaomi-Re...,"‎7898567779018 : Dimensões do produto ‎17,3 x ...",amazon,br
1,7898567779018,Smartphone Xiaomi Redmi 9A 32GB 4G Wi-Fi Tela ...,https://www.hipersat.com.br/item/smartphone-xi...,ficha técnica Código 4702070741 Código de barr...,hipersat,br
2,7898567779018,Smartphone Xiaomi Redmi 9A 32GB Verde Tela 6.5...,https://www.bemol.com.br/smartphone-xiaomi-red...,EAN 7898567779018; Aviso Imagens meramente ilu...,bemol,br
3,7898567779018,M&E Store atacado e dropshipping - Smartphone ...,https://www.mestoreatacado.com.br/smartphone-x...,GTIN/EAN: 7898567779018. Smartphone Xiaomi Red...,mestoreatacado,br
4,7898567779018,"Smartphone Xiaomi Redmi 9A, Verde, Tela 6.53"",...",https://www.martinsatacado.com.br/produto/smar...,Tente novamente mais tarde ou entre em contato...,martinsatacado,br
...,...,...,...,...,...,...
5318,CX352VRD,Technical Guidance | HPE Aruba Networking,https://www.arubanetworks.com/resources/techni...,IDC MarketScape again recognizes HPE Aruba Net...,arubanetworks,
5319,CX352VRD,Home | Validated Solution Guide - Aruba,https://www.arubanetworks.com/techdocs/VSG/,VSGs are cross-portfolio solution guides that ...,arubanetworks,
5320,CX352VRD,CX252 (CPA252) Cathay Pacific Flight Tracking ...,https://www.flightaware.com/live/flight/CPA252,arriving at Terminal 1 Hong Kong Int'l - HKG. ...,flightaware,
5321,CX352VRD,Modelo Recursos Interpostos,https://cetesb.sp.gov.br/biogas/wp-content/upl...,Tipo Número Situação Empreendimento Endereço C...,,br


In [378]:
df_search_results = df_search_results.drop_duplicates(subset='url')
df_search_results

Unnamed: 0,originalQuery,name,url,snippet,site,country
0,7898567779018,Smartphone Xiaomi Redmi 9A Tela 6.53 32GB 13MP...,https://www.amazon.com.br/Smartphone-Xiaomi-Re...,"‎7898567779018 : Dimensões do produto ‎17,3 x ...",amazon,br
1,7898567779018,Smartphone Xiaomi Redmi 9A 32GB 4G Wi-Fi Tela ...,https://www.hipersat.com.br/item/smartphone-xi...,ficha técnica Código 4702070741 Código de barr...,hipersat,br
2,7898567779018,Smartphone Xiaomi Redmi 9A 32GB Verde Tela 6.5...,https://www.bemol.com.br/smartphone-xiaomi-red...,EAN 7898567779018; Aviso Imagens meramente ilu...,bemol,br
3,7898567779018,M&E Store atacado e dropshipping - Smartphone ...,https://www.mestoreatacado.com.br/smartphone-x...,GTIN/EAN: 7898567779018. Smartphone Xiaomi Red...,mestoreatacado,br
4,7898567779018,"Smartphone Xiaomi Redmi 9A, Verde, Tela 6.53"",...",https://www.martinsatacado.com.br/produto/smar...,Tente novamente mais tarde ou entre em contato...,martinsatacado,br
...,...,...,...,...,...,...
5306,CX352VRD,Cremalheira mwm 2.8 - Mercado Livre,https://lista.mercadolivre.com.br/cremalheira-...,Volante Cremalheira Bimassa S10 Blazer 2.8 200...,mercadolivre,br
5312,CX352VRD,English subbed episodes collection - GameCente...,https://archive.org/details/game-center-cx-eng...,Game Center CX - English subbed episodes colle...,,
5315,CX352VRD,F-102 S.A. 1/72 - Eduard Store,https://www.eduard.com/Eduard/Photo-etched-par...,"F-102 S. A. 1/72. 14,95 US$. DISCONTINUED. Gal...",eduard,
5317,CX352VRD,352 Hydraulic Excavator | Cat | Caterpillar,https://www.cat.com/en_US/products/new/equipme...,Check out current offers for the 352. View Mor...,cat,


In [381]:
df_search_results.groupby('site').count().sort_values(by='url',ascending=False).head(20)

Unnamed: 0_level_0,originalQuery,name,url,snippet,country
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
magazineluiza,264,264,264,264,264
bemol,245,245,245,245,245
americanas,236,236,236,236,236
mercadolivre,199,199,199,199,199
amazon,168,168,168,168,166
armazemautomotivo,123,123,123,123,0
casasbahia,102,102,102,102,102
oiplace,92,92,92,92,92
shoptime,80,80,80,80,80
pontofrio,79,79,79,79,79
