# Desenvolvimento

In [1]:
import json
from pathlib import Path
import flatdict
from collections import Counter
import re
from datetime import datetime
import pandas as pd

In [2]:
# number of words to extract for the word cloud
N_WORDS = 25
# pattern to match words with at least two characters
TOKEN_PATTERN = re.compile(r"\b\w\w+\b")
# version of the word cloud generator
WORD_CLOUD_VERSION = 1
# mode for the word cloud generator
WORD_CLOUD_MODE = "API"
# time format for wordcloud annotation
ANNOTATION_TS_FORMAT = "%d/%m/%Y %H:%M:%S"
# time format for json result files
RESULT_TS_FORMAT = '%Y%m%d%H%M%S%f'

In [3]:
def extract_text(items, fields_of_interest):
    flat_content = flatdict.FlatterDict(items, delimiter="__")
    target_keys = [
        key
        for key in flat_content.keys()
        for word in fields_of_interest
        if word in key
    ]
    text = " ".join([flat_content[key] for key in target_keys])
    return text

In [4]:
def extract_word_counts(text=None, n_words=N_WORDS):
    # Check if the text is empty or None
    if not text:
        return ""
    # Remove punctuation and convert to lowercase
    tokens = [
        token for token in TOKEN_PATTERN.findall(text.lower()) if token.isalpha()
    ]

    # Split the text into words and count occurrences
    words_counter = Counter(tokens)

    # Get the n most common words
    common_words = words_counter.most_common(n_words)
    wordcloud = {key: value for key, value in common_words}
    wordcloud = json.dumps(wordcloud, ensure_ascii=False)

    return wordcloud

In [5]:
def request_wordcloud_from_file(file):
    file = Path(file).absolute()
    filename = file.stem
    ts, source, query, _id = filename.split('_')
    
    with open(file, 'r') as f:
        content = json.load(f)
    
    if source == 'GOOGLE':
        fields_of_interest = ["title", "snippet", "og:title", "og:description"]
        if items := content.get('items'):
            text = extract_text(items, fields_of_interest)
        else: 
            text = ''
    elif source == 'BING':
        fields_of_interest = ["name", "snippet"]
        if webpages := content.get('webPages'):
            if items := webpages.get('value'):
                text = extract_text(items, fields_of_interest)
            else:
                text = ''
        else:
            text = ''
    
    word_counts = extract_word_counts(text)
    if word_counts == "":
        situacao = -1
    else:
        situacao = 1
    
    wordcloud_info = {
        "metaData": {
            "Version": WORD_CLOUD_VERSION,
            "Source": source,
            "Mode": WORD_CLOUD_MODE,
            "Fields": '',
            "n_words": N_WORDS,
        },
        "searchedWord": query,
        "cloudOfWords": word_counts,
    }
    wordcloud_info = json.dumps(wordcloud_info, ensure_ascii=False)
    
    wordcloud_id = _id
    wordcloud_datahora = datetime.strptime(ts,RESULT_TS_FORMAT).strftime(ANNOTATION_TS_FORMAT)
    wordcloud_computername = ""
    wordcloud_username = ""
    wordcloud_homologacao = f"{query[:5]}-{query[5:7]}-{query[7:]}"
    
    wordcloud = {
        "ID": wordcloud_id,
        "DataHora": wordcloud_datahora,
        "Computador": wordcloud_computername,
        "Usuário": wordcloud_username,
        "Homologação": wordcloud_homologacao,
        "Atributo": "WordCloud",
        "Valor": wordcloud_info,
        "Situação": situacao,
    }
    
    return wordcloud

In [8]:
wc = [request_wordcloud_from_file(file) for file in Path('history').glob('*.json')]
df = pd.DataFrame(wc)
df

Unnamed: 0,ID,DataHora,Computador,Usuário,Homologação,Atributo,Valor,Situação
0,08b73c58-277c-4508-a901-4762e15da37b,23/02/2024 15:01:31,,,02018-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
1,8c115561-3aec-4d87-b945-af4017e706bc,23/02/2024 15:01:31,,,06618-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
2,91574630-05f4-4ed5-a595-422ef8b9a6d8,23/02/2024 15:01:32,,,12303-20-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
3,4c268f8d-4e7b-4c80-8e88-17cf06b015f2,23/02/2024 15:01:32,,,03744-21-13015,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",1
4,1dd58bdc-320b-42a8-aaaf-d117110c5777,23/02/2024 15:01:33,,,10746-20-11685,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
5,72c05548-662e-4a65-803a-cf8608997a26,23/02/2024 15:01:33,,,13263-20-11685,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
6,cf5cc799-636e-47e7-9d21-56eee2ed9a4a,23/02/2024 15:01:34,,,06776-22-14103,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
7,1c575af4-a2ac-4158-94e4-7847846678b2,23/02/2024 15:01:34,,,13637-21-14103,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",1
8,ac1e036d-0907-4857-8435-702832362805,01/03/2024 12:01:43,,,02035-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""BING"", ...",-1
9,7b1bcb78-b578-4e3e-9305-06a02e18ff6e,01/03/2024 12:01:44,,,02018-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""BING"", ...",-1


In [7]:
for w in df['Valor']:
    print(w)

{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "020181901516", "cloudOfWords": ""}
{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "066181901516", "cloudOfWords": ""}
{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "123032001516", "cloudOfWords": ""}
{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "037442113015", "cloudOfWords": "{\"smartwatch\": 29, \"relógio\": 27, \"ultra\": 23, \"de\": 14, \"pro\": 12, \"puls\": 10, \"na\": 10, \"gps\": 9, \"nfc\": 9, \"com\": 9, \"max\": 9, \"microwear\": 9, \"americanas\": 9, \"inteligente\": 9, \"watch\": 8, \"anatel\": 8, \"em\": 8, \"película\": 7, \"tela\": 6, \"homologação\": 6, \"ficha\": 6, \"técnica\": 6, \"da\": 6, \"empresas\": 6, \"ocean\": 5}"}
{"metaData": {"Version": 1, "Source": "GOOGLE", "M

# Antigos

In [7]:
# file = Path('history/20240223150132752139_GOOGLE_037442113015_4c268f8d-4e7b-4c80-8e88-17cf06b015f2.json').absolute()
# filename = file.stem
# ts, source, query, _id = filename.split('_')

# with open(file, 'r') as f:
#     content = json.load(f)

# if source == 'GOOGLE':
#     items = content['items']
#     text = extract_google_text(items)

# word_counts = extract_word_counts(text)
# if word_counts == "":
#     situacao = -1
# else:
#     situacao = 1

# wordcloud_info = {
#     "metaData": {
#         "Version": WORD_CLOUD_VERSION,
#         "Source": source,
#         "Mode": WORD_CLOUD_MODE,
#         "Fields": '',
#         "n_words": N_WORDS,
#     },
#     "searchedWord": query,
#     "cloudOfWords": word_counts,
# }
# wordcloud_info = json.dumps(wordcloud_info, ensure_ascii=False)

# wordcloud_id = _id
# wordcloud_datahora = datetime.strptime(ts,RESULT_TS_FORMAT).strftime(ANNOTATION_TS_FORMAT)
# wordcloud_computername = ""
# wordcloud_username = ""
# wordcloud_homologacao = f"{query[:5]}-{query[5:7]}-{query[7:]}"

# wordcloud = {
#     "ID": wordcloud_id,
#     "DataHora": wordcloud_datahora,
#     "Computador": wordcloud_computername,
#     "Usuário": wordcloud_username,
#     "Homologação": wordcloud_homologacao,
#     "Atributo": "WordCloud",
#     "Valor": wordcloud_info,
#     "Situação": situacao,
# }

# wordcloud