# Bibliotecas e preparação do ambiente

In [12]:
import json
import os
import platform
import re
import string
import uuid
from collections import Counter
from datetime import datetime
from pathlib import Path
from unicodedata import normalize
from urllib.parse import urlparse

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# strptime pattern of the timestamp embedded in the json result file names
RESULT_TS_FORMAT = '%Y%m%d%H%M%S%f'
# strftime pattern used for the timestamp written to the wordcloud records
OUTPUT_TS_FORMAT = '%d/%m/%Y %H:%M:%S'
# results coming from any of these sites are discarded
BLACK_LISTED_SITES = [
    'gov.br',
    'fccid.io',
    'wer-hat-angerufen.info',
]

In [3]:
def parse_site(url):
    """Return the last (up to) three dot-separated labels of the host in ``url``.

    Accepts both full URLs (``https://www.example.com.br/page``) and bare
    hosts without a scheme (``www.example.com.br``). ``urlparse`` only
    fills ``netloc`` when the input contains ``//``, so scheme-less input
    is re-parsed as a network location instead of yielding an empty string.

    Parameters
    ----------
    url : str
        URL or bare host name.

    Returns
    -------
    str
        At most the three right-most labels of the network location joined
        by dots, or ``''`` when no network location can be found.
    """
    parsed = urlparse(url)
    netloc = parsed.netloc
    if not netloc and not parsed.scheme:
        # bare host such as 'www.example.com/path': force netloc parsing
        netloc = urlparse('//' + url).netloc
    return '.'.join(netloc.split('.')[-3:])

In [4]:
def is_black_listed_site(url,black_listed_sites=BLACK_LISTED_SITES):
    """Tell whether ``url`` contains any of the black-listed site names.

    The check is a plain substring test of each entry of
    ``black_listed_sites`` against ``url``.

    Returns ``True`` on the first match and ``False`` when nothing matches.
    """
    return any(blocked in url for blocked in black_listed_sites)

In [49]:
def tokenizer(doc,normalize_words=False):
    """Split ``doc`` into word tokens, dropping stop words and pure numbers.

    Parameters
    ----------
    doc : str
        Text to tokenize.
    normalize_words : bool, default False
        When True the text is lower-cased and folded to ASCII (accents
        removed) before tokenizing. The stop words are folded the same way
        so that accented stop words still match the folded text.

    Returns
    -------
    list of str
        Tokens of two or more word characters, in order of appearance,
        that are neither stop words (compared lower-cased) nor made of
        digits only.
    """
    # a set gives constant-time membership tests in the filter below
    stop_words = set(stopwords.words('portuguese'))
    stop_words.update(stopwords.words('english'))
    stop_words.update(string.punctuation)

    # domain-specific stop words
    stop_words.update(['cm', 'feature', 'features', 'informações', 'itens', 'leve', 'list', 'nulo', 'package',
                       'pacote', 'pacotes', 'recurso', 'tamanho', 'ver', 'anatel', 'laranja', '.', '...',
                       'complementares', 'peça'])

    if normalize_words:
        def fold(text):
            # strip accents by decomposing and discarding non-ASCII code points
            return normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')

        doc = fold(doc.lower())
        # keep the original entries and add their ASCII-folded forms
        stop_words.update([fold(word) for word in stop_words])

    # CountVectorizer token pattern: words with at least two characters
    pattern = r'\b\w\w+\b'
    tokens = [
        token
        for token in re.findall(pattern, doc)
        if token.lower() not in stop_words
        # fullmatch drops number-only tokens but keeps mixed ones like '5GHz'
        and not re.fullmatch(r'\d+', token)
    ]

    return tokens


In [15]:
def _collect_result_lines(items, url_key, title_key):
    """Return the title and snippet texts of the items whose site is not black-listed."""
    lines = []
    for item in items:
        if is_black_listed_site(item[url_key]):
            continue
        lines.append(item[title_key])
        if 'snippet' in item:
            lines.append(item['snippet'])
    return lines


def parse_result_file(file, nWords=25):
    """Build the word-cloud record for one raw search result file.

    Parameters
    ----------
    file : pathlib.Path
        Result file whose name splits on ``_`` and ``.`` into exactly four
        parts: timestamp, search engine, search term and extension.
    nWords : int, default 25
        Maximum number of most frequent words kept in the word cloud.

    Returns
    -------
    dict
        Record with the keys ``ID``, ``DataHora``, ``Computador``,
        ``Usuário``, ``Homologação``, ``Atributo``, ``Valor`` and
        ``Buscadora``. ``Valor`` is a JSON string whose ``cloudOfWords``
        entry is itself a JSON string mapping words to counts, or ``''``
        when no usable result text was found.

    Raises
    ------
    ValueError
        If the file name does not split into four parts or its timestamp
        does not match ``RESULT_TS_FORMAT``.
    """
    search_date, search_engine, search_term, _ = re.split('[_.]',file.name)

    # NOTE(review): the file is read with the platform default encoding;
    # confirm how the result files were written before forcing UTF-8.
    with open(file) as f:
        results = json.load(f)

    lines = []
    if search_engine == 'GOOGLE':
        # results without 'items' are empty
        if 'items' in results:
            lines = _collect_result_lines(results['items'], 'displayLink', 'title')
    elif search_engine == 'BING':
        # results without 'webPages' are empty; only the first 10 are used
        if 'webPages' in results:
            lines = _collect_result_lines(results['webPages']['value'][:10], 'url', 'name')

    if lines:
        word_counts = Counter(tokenizer(' '.join(lines)))
        word_cloud_json = json.dumps(dict(word_counts.most_common(nWords)), ensure_ascii=False)
    else:
        word_cloud_json = ''

    word_cloud_info = {
        'metaData': {
            'Version': 1,
            'Source': search_engine,
            'Mode': 'API',
            'Fields': ['Name', 'Snippet'],
            'nWords': nWords
        },
        'searchedWord': search_term,
        'cloudOfWords': word_cloud_json
    }

    return {'ID': str(uuid.uuid4()),
            'DataHora': datetime.strptime(search_date,RESULT_TS_FORMAT).strftime(OUTPUT_TS_FORMAT),
            # COMPUTERNAME only exists on Windows; fall back to the host name elsewhere
            'Computador': os.environ.get('COMPUTERNAME') or platform.node(),
            'Usuário': 'E!',
            'Homologação': f'{search_term[:5]}-{search_term[5:7]}-{search_term[-5:]}',
            'Atributo': 'WordCloud',
            'Valor': json.dumps(word_cloud_info, ensure_ascii=False),
            'Buscadora': search_engine}

# Pesquisa

Nessa etapa serão feitas as pesquisas no Bing e no Google. Os resultados de cada consulta serão armazenados em um arquivo .json contendo a resposta bruta da consulta. 

Os arquivos serão armazenados na pasta *datasets/searchresults* com o seguinte formato de nome: `{data/hora da consulta}_{mecanismo de busca}_{nº do certificado de homologação}.json`, onde:
- data/hora da consulta: data e hora que a consulta foi realizada, no formato `%Y%m%d%H%M%S%f`
- mecanismo de busca: `GOOGLE` ou `BING`
- nº do certificado de homologação: número do certificado de homologação (apenas números)

# Tratamento dos dados

Nessa etapa, cada arquivo será lido e dele serão extraídos o nome (ou título) e o resumo (*snippet*) de cada resposta, que são consolidados para criar a wordcloud.

In [50]:
results_folder = Path('datasets/searchresults')
# sorted so that the [:10] sample below is the same on every run and platform;
# the order in which glob yields files is not guaranteed
results_files = sorted(results_folder.glob('*.json'))
# parse only the first 10 result files into one record per row
df = pd.DataFrame([parse_result_file(file) for file in results_files[:10]])
df

Unnamed: 0,ID,DataHora,Computador,Usuário,Homologação,Atributo,Valor,Buscadora
0,28f39a79-14a5-41f2-a96b-4879027ea5d6,23/02/2024 15:01:29,ES6927559DTL,E!,03724-22-14637,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
1,e737eebd-d1d8-46ee-b8af-c2635000c15b,23/02/2024 15:01:30,ES6927559DTL,E!,02035-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
2,e6915a1b-aba8-405c-a660-5af517bf36dd,23/02/2024 15:01:31,ES6927559DTL,E!,02018-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
3,1fc341fc-14c5-410b-b4ff-c85e50f6d990,23/02/2024 15:01:31,ES6927559DTL,E!,06618-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
4,45001f65-eefe-4bca-a2ec-7c53282cb8dd,23/02/2024 15:01:32,ES6927559DTL,E!,12303-20-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
5,5c01a800-74a8-4899-b5ba-f75aed6d6a11,23/02/2024 15:01:32,ES6927559DTL,E!,03744-21-13015,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
6,cf50dec7-971e-4695-9c97-d20b9e752945,23/02/2024 15:01:33,ES6927559DTL,E!,10746-20-11685,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
7,df5a6fa3-de27-4e07-b535-5160b660dafe,23/02/2024 15:01:33,ES6927559DTL,E!,13263-20-11685,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
8,194a8ec9-60d0-4ddd-9d16-4184eac589cd,23/02/2024 15:01:34,ES6927559DTL,E!,06776-22-14103,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
9,43790c04-ee01-4d1f-9916-60566f22c328,23/02/2024 15:01:34,ES6927559DTL,E!,13637-21-14103,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",GOOGLE
