# Bibliotecas e preparação do ambiente

In [67]:
import configparser
import json
import os
import platform
import re
import string
import uuid
from collections import Counter
from datetime import datetime
from pathlib import Path
from unicodedata import normalize
from urllib.parse import urlparse

import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [74]:
# Credentials file location. Defaults to creds.ini in the current user's home
# directory; set the SEARCH_CREDS_FILE environment variable to point elsewhere.
CREDS_FILE = os.environ.get('SEARCH_CREDS_FILE', str(Path.home() / 'creds.ini'))
# time format used in the names of the json result files
RESULT_TS_FORMAT = '%Y%m%d%H%M%S%f'
# time format written to the 'DataHora' field of the wordcloud records
OUTPUT_TS_FORMAT = '%d/%m/%Y %H:%M:%S'
# sites whose results are ignored
BLACK_LISTED_SITES = ['gov.br', 'fccid.io', 'wer-hat-angerufen.info']
# output folders
SEARCH_RESULTS_FOLDER = 'datasets/searchresults'
ANNOTATIONS_FOLDER = 'datasets/annotations'
# misspelled name kept as an alias so existing references keep working
ANNOTATITIONS_FOLDER = ANNOTATIONS_FOLDER

In [3]:
def parse_site(url):
    """Return the last three dot-separated labels of the host part of ``url``.

    Parameters
    ----------
    url : str
        A full URL ('https://www.example.com/page') or a bare host / host+path
        without a scheme ('www.example.com').

    Returns
    -------
    str
        The host reduced to at most its last three labels, or '' when no host
        can be found.
    """
    parsed = urlparse(url)
    netloc = parsed.netloc
    if not netloc and not parsed.scheme:
        # urlparse only recognises a network location after '//'; without it a
        # bare host such as 'www.example.com' is parsed as a path.
        netloc = urlparse('//' + url).netloc
    return '.'.join(netloc.split('.')[-3:])

In [4]:
def is_black_listed_site(url,black_listed_sites=BLACK_LISTED_SITES):
    """Tell whether ``url`` contains any of the black-listed site strings.

    The check is a plain substring test of each entry of
    ``black_listed_sites`` against ``url``.

    Returns
    -------
    bool
        True as soon as one entry is found inside ``url``, False otherwise.
    """
    return any(blocked in url for blocked in black_listed_sites)

In [49]:
def _strip_accents(text):
    """Fold ``text`` to plain ASCII, dropping accents and non-ASCII characters."""
    return normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')


def tokenizer(doc,normalize_words=False):
    """Split ``doc`` into word tokens, discarding stop words and pure numbers.

    Parameters
    ----------
    doc : str
        Text to tokenize.
    normalize_words : bool, default False
        When True the text is lower-cased and stripped of accents before
        tokenizing; the stop words are folded the same way so that accented
        stop words still match.

    Returns
    -------
    list of str
        Tokens of two or more word characters, in order of appearance, that
        are neither stop words (compared case-insensitively) nor made up
        only of digits.
    """
    # a set gives constant-time membership tests in the filter below
    stop_words = set(stopwords.words('portuguese'))
    stop_words.update(stopwords.words('english'))
    stop_words.update(string.punctuation)

    # domain-specific stop words
    stop_words.update(['cm', 'feature', 'features', 'informações', 'itens', 'leve', 'list', 'nulo', 'package', 
                       'pacote', 'pacotes', 'recurso', 'tamanho', 'ver', 'anatel', 'laranja', '.', '...',
                       'complementares', 'peça'])

    if normalize_words:
        doc = _strip_accents(doc.lower())
        # the text no longer has accents, so the stop words must lose theirs
        # too, otherwise entries such as 'informações' would never match
        stop_words = {_strip_accents(word) for word in stop_words}

    # CountVectorizer token pattern
    pattern = r'\b\w\w+\b'
    # fullmatch drops number-only tokens but keeps mixed ones such as '4g'
    return [token for token in re.findall(pattern,doc)
            if token.lower() not in stop_words and not re.fullmatch(r'\d+',token)]


In [15]:
def _extract_result_lines(results, search_engine):
    """Collect the title and snippet texts of the results that are not black-listed."""
    if search_engine == 'GOOGLE':
        # responses without an 'items' key are empty
        items = results['items'] if 'items' in results else []
        title_key, site_key = 'title', 'displayLink'
    elif search_engine == 'BING':
        # responses without a 'webPages' key are empty; only the first 10 hits are used
        items = results['webPages']['value'][:10] if 'webPages' in results else []
        title_key, site_key = 'name', 'url'
    else:
        return []

    lines = []
    for item in items:
        if is_black_listed_site(item[site_key]):
            continue
        lines.append(item[title_key])
        if 'snippet' in item:
            lines.append(item['snippet'])
    return lines


# file should be a Path object
def parse_result_file(file, nWords=25):
    """Build the word-cloud record for one saved search result file.

    The file name is split on '_' and '.' and must yield exactly four parts:
    timestamp, search engine, search term and extension.

    Parameters
    ----------
    file : pathlib.Path
        JSON file holding the raw response of one search.
    nWords : int, default 25
        Number of most frequent tokens kept in the word cloud.

    Returns
    -------
    dict
        One record with the keys 'ID', 'DataHora', 'Computador', 'Usuário',
        'Homologação', 'Atributo', 'Valor' and 'Buscadora'. 'Valor' is a JSON
        string whose 'cloudOfWords' entry is itself a JSON string of
        token -> count, or '' when no usable result text was found.

    Raises
    ------
    ValueError
        If the file name does not split into four parts or the timestamp does
        not match RESULT_TS_FORMAT.
    """
    search_date, search_engine, search_term, _ = re.split('[_.]',file.name)

    # read the file and close it right away; the processing below needs only the data
    with open(file, encoding='utf-8') as f:
        results = json.load(f)

    lines = _extract_result_lines(results, search_engine)

    if lines:
        word_counts = Counter(tokenizer(' '.join(lines)))
        wordcloud_json = json.dumps(dict(word_counts.most_common(nWords)), ensure_ascii=False)
    else:
        wordcloud_json = ''

    wordcloud_info = {
        'metaData': {
            'Version': 1,
            'Source': search_engine,
            'Mode': 'API',
            'Fields': ['Name', 'Snippet'],
            'nWords': nWords
        },
        'searchedWord': search_term,
        'cloudOfWords': wordcloud_json
    }

    return {'ID': str(uuid.uuid4()),
            'DataHora': datetime.strptime(search_date,RESULT_TS_FORMAT).strftime(OUTPUT_TS_FORMAT),
            # COMPUTERNAME is only set on Windows; fall back to the host name elsewhere
            'Computador': os.environ.get('COMPUTERNAME') or platform.node(),
            'Usuário': 'E!', 
            # first 5, next 2 and last 5 characters of the term, joined by '-'
            'Homologação': f'{search_term[:5]}-{search_term[5:7]}-{search_term[-5:]}', 
            'Atributo': 'WordCloud',
            'Valor': json.dumps(wordcloud_info, ensure_ascii=False),
            'Buscadora': search_engine}

In [98]:
def load_creds(creds_file=CREDS_FILE):
    """Read the search API credentials from an INI file.

    The file must have a [BING_SEARCH] section with the options
    bing_search_api_key and bing_search_endpoint, and a [GOOGLE_SEARCH]
    section with google_search_api_key, google_search_engine_id and
    google_search_endpoint.

    Parameters
    ----------
    creds_file : str or path-like
        Location of the INI file.

    Returns
    -------
    dict
        {'BING': {...}, 'GOOGLE': {...}} holding the options listed above.

    Raises
    ------
    FileNotFoundError
        If the file cannot be read.
    KeyError
        If a section or option is missing.
    """
    parser = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did read; fail here instead of with an obscure KeyError
    if not parser.read(creds_file):
        raise FileNotFoundError(f'Credentials file not found or unreadable: {creds_file}')

    bing_section = parser['BING_SEARCH']
    google_section = parser['GOOGLE_SEARCH']

    return {'BING': {'bing_search_api_key': bing_section['bing_search_api_key'],
                     'bing_search_endpoint': bing_section['bing_search_endpoint']},
            'GOOGLE': {'google_search_api_key': google_section['google_search_api_key'],
                       'google_search_engine_id': google_section['google_search_engine_id'],
                       'google_search_endpoint': google_section['google_search_endpoint']}}

In [107]:
def save_results_file(search_results,search_engine,search_term=None):
    """Write the raw response of one search to SEARCH_RESULTS_FOLDER as JSON.

    The file is named '{timestamp}_{search_engine}_{search_term}.json', with
    the timestamp taken at call time in RESULT_TS_FORMAT.

    Parameters
    ----------
    search_results : JSON-serialisable object
        Raw response to store.
    search_engine : str
        Engine label used in the file name (e.g. 'GOOGLE' or 'BING').
    search_term : str, optional
        Term used in the file name. When omitted, the notebook-level variable
        ``search_term`` is used, as earlier versions of this function did.

    Raises
    ------
    ValueError
        If no search term is given and no notebook-level one exists.
    """
    if search_term is None:
        # backward-compatible fallback to the notebook-level variable
        search_term = globals().get('search_term')
        if search_term is None:
            raise ValueError('search_term was not given and no global search_term is defined')

    result_ts = datetime.now().strftime(RESULT_TS_FORMAT)
    results_filename = f'{result_ts}_{search_engine}_{search_term}.json'
    output_folder = Path(SEARCH_RESULTS_FOLDER)
    # make sure the output folder exists so the first run does not fail
    output_folder.mkdir(parents=True, exist_ok=True)
    with open(output_folder / results_filename, 'w', encoding='utf-8') as f:
        json.dump(search_results,f, indent=2)

In [95]:
def google_search(search_term,creds=None,timeout=30):
    """Run one Google search and save the raw JSON response to disk.

    Parameters
    ----------
    search_term : str
        Text to search for.
    creds : dict, optional
        Credentials as returned by load_creds(); loaded on demand when
        omitted, so defining this function does not need a global ``creds``.
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    int
        HTTP status code of the response.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts or a non-success HTTP status.
    """
    if creds is None:
        creds = load_creds()
    google_creds = creds['GOOGLE']

    # search params
    # NOTE(review): 'count' does not look like a Google Custom Search
    # parameter (the page size there is 'num') - confirm against the API docs.
    params = {'q': search_term,
              'key': google_creds['google_search_api_key'], 
              'cx': google_creds['google_search_engine_id'], 
              'count': 50,
              'cr': 'countryBR', 
              'lr': 'lang_pt'}

    # execute query; errors propagate to the caller unchanged
    response = requests.get(google_creds['google_search_endpoint'], params=params, timeout=timeout)
    response.raise_for_status()
    # NOTE(review): the search term is not passed on here; save_results_file
    # takes the term for the file name from a notebook-level `search_term`
    # variable, so make sure it matches the term being queried.
    save_results_file(response.json(),'GOOGLE')

    return response.status_code

In [108]:
def bing_search(search_term,creds=None,timeout=30):
    """Run one Bing search and save the raw JSON response to disk.

    Parameters
    ----------
    search_term : str
        Text to search for.
    creds : dict, optional
        Credentials as returned by load_creds(); loaded on demand when
        omitted, so defining this function does not need a global ``creds``.
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    int
        HTTP status code of the response.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts or a non-success HTTP status.
    """
    if creds is None:
        creds = load_creds()
    bing_creds = creds['BING']

    # search params
    headers = {'Ocp-Apim-Subscription-Key': bing_creds['bing_search_api_key']}
    params = {'q': search_term,
              # A 2-character country code of the country where the results come from.
              'cc': 'BR',
              # The number of search results to return in the response. 
              # The default is 10 and the maximum value is 50. 
              # The actual number delivered may be less than requested.
              'count': 50,
              # The market where the results come from.
              'mkt': 'pt-BR',
              # A comma-delimited list of answers to include in the response.
              'responseFilter': 'Webpages',
             }

    # execute query; errors propagate to the caller unchanged
    response = requests.get(bing_creds['bing_search_endpoint'], headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    # NOTE(review): the search term is not passed on here; save_results_file
    # takes the term for the file name from a notebook-level `search_term`
    # variable, so make sure it matches the term being queried.
    save_results_file(response.json(),'BING')

    return response.status_code

# Pesquisa

Nessa etapa serão feitas as pesquisas no Bing e no Google. Os resultados de cada consulta serão armazenados em um arquivo .json contendo a resposta bruta da consulta. 

Os arquivos serão armazenados na pasta *datasets/searchresults* com o seguinte formato de nome: `{data/hora da consulta}_{mecanismo de busca}_{nº do certificado de homologação}.json`, onde:
- data/hora da consulta: data e hora em que a consulta foi realizada, no formato `%Y%m%d%H%M%S%f`
- mecanismo de busca: `GOOGLE` ou `BING`
- nº do certificado de homologação: número do certificado de homologação (apenas números)

In [109]:
# Run one Bing query; the value displayed by the cell is the HTTP status code that bing_search returns.
bing_search('037242214637')

200

# Tratamento dos dados

Nessa etapa, cada arquivo será lido e dele serão extraídos o nome (ou título) e o resumo (*snippet*) de cada resposta, que serão consolidados para criar a wordcloud.

In [102]:
# Parse every saved result file into one row of the consolidated table.
results_folder = Path(SEARCH_RESULTS_FOLDER)
# glob() gives no ordering guarantee; sorting by name keeps the row order reproducible
results_files = sorted(results_folder.glob('*.json'))
df = pd.DataFrame([parse_result_file(file) for file in results_files])
# the spreadsheet leaves out 'Buscadora'; it is dropped by name rather than
# by position, and errors='ignore' covers the no-files case (empty frame)
df.drop(columns='Buscadora', errors='ignore').to_excel('wordcloud.xlsx',index=False)
df

Unnamed: 0,ID,DataHora,Computador,Usuário,Homologação,Atributo,Valor,Buscadora
0,c5e83e4e-243b-4325-8dfe-fc40c1c07dd7,23/02/2024 15:01:29,ES6927559DTL,E!,03724-22-14637,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
1,5f43bb0c-62ad-4f40-a55f-0c2b761c0a41,23/02/2024 15:01:30,ES6927559DTL,E!,02035-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
2,44311a1b-99c7-4465-8490-cc0756d2dc43,23/02/2024 15:01:31,ES6927559DTL,E!,02018-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
3,e59c98ab-bd35-434e-86cd-a18a2604aabf,23/02/2024 15:01:31,ES6927559DTL,E!,06618-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
4,606f4af2-49d8-483b-afae-12c4b8d9456f,23/02/2024 15:01:32,ES6927559DTL,E!,12303-20-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
...,...,...,...,...,...,...,...,...
2175,df77d3a9-3dab-4866-8c24-613f0b8e5c9b,01/03/2024 12:17:56,ES6927559DTL,E!,00108-15-01699,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""BING"", ...",BING
2176,69cd29d3-310c-4d04-b9e7-5f72e9c59f93,01/03/2024 12:17:57,ES6927559DTL,E!,04886-16-01086,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""BING"", ...",BING
2177,92877972-d703-46d1-86ec-d94819d9c2a1,01/03/2024 12:17:58,ES6927559DTL,E!,05311-16-01138,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""BING"", ...",BING
2178,f948be1a-8d37-4816-844b-5f6be1542caf,22/05/2024 10:27:11,ES6927559DTL,E!,03724-22-14637,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
