# Desenvolvimento

In [1]:
import json
from pathlib import Path
import flatdict
from collections import Counter
import re
from datetime import datetime
import pandas as pd

In [2]:
# --- Word cloud content -------------------------------------------------
N_WORDS = 25  # how many of the most frequent words go into a word cloud
TOKEN_PATTERN = re.compile(r"\b\w\w+\b")  # a token is a run of 2+ word characters

# --- Word cloud metadata ------------------------------------------------
WORD_CLOUD_VERSION = 1  # version of the word cloud generator
WORD_CLOUD_MODE = "API"  # mode of the word cloud generator

# --- Timestamp formats --------------------------------------------------
ANNOTATION_TS_FORMAT = "%d/%m/%Y %H:%M:%S"  # timestamps written to the annotation
RESULT_TS_FORMAT = "%Y%m%d%H%M%S%f"  # timestamps used by the JSON result files

In [3]:
def extract_text(items, fields_of_interest):
    """Collect the text stored under the fields of interest of a search result.

    ``items`` is flattened with ``flatdict.FlatterDict`` using ``"__"`` as the
    key delimiter. A flattened key is selected when it contains at least one
    of ``fields_of_interest`` as a substring (case-sensitive).

    Args:
        items: Nested structure holding the search results.
        fields_of_interest: Iterable of substrings to look for in the
            flattened keys.

    Returns:
        str: The selected string values joined by single spaces, in the order
        the flattened keys are returned; ``""`` when nothing is selected.
    """
    flat_content = flatdict.FlatterDict(items, delimiter="__")
    selected_values = []
    for key in flat_content.keys():
        # Select each key at most once. Testing every field in a nested loop
        # would add the key once per matching field: a key containing
        # "og:title" matches both "title" and "og:title", so its text would be
        # duplicated and its words counted twice.
        if not any(field in key for field in fields_of_interest):
            continue
        value = flat_content[key]
        # Only strings can be joined; skip any other kind of value instead of
        # failing with a TypeError.
        if isinstance(value, str):
            selected_values.append(value)
    return " ".join(selected_values)

In [4]:
def extract_word_counts(text=None, n_words=N_WORDS):
    """Count the most frequent words of a text and serialise them as JSON.

    The text is lower-cased and split with ``TOKEN_PATTERN``; only purely
    alphabetic tokens are counted.

    Args:
        text: Text to analyse. ``None`` or an empty string yields ``""``.
        n_words: Maximum number of words to keep (the most frequent ones).

    Returns:
        str: A JSON object mapping each kept word to its number of
        occurrences, most frequent first, or ``""`` when there is nothing to
        count.
    """
    # Nothing to analyse
    if not text:
        return ""

    # Tokenise the lower-cased text and drop tokens with digits or underscores
    tokens = [
        token for token in TOKEN_PATTERN.findall(text.lower()) if token.isalpha()
    ]

    # Keep the n most frequent words
    common_words = Counter(tokens).most_common(n_words)

    # Text without any countable word (e.g. only numbers) must give the same
    # empty result as empty text; otherwise "{}" would be returned and callers
    # comparing against "" would treat an empty cloud as a valid one.
    if not common_words:
        return ""

    return json.dumps(dict(common_words), ensure_ascii=False)

In [5]:
def request_wordcloud_from_file(file):
    """Build the word cloud annotation record for one search result file.

    The file stem is expected to be ``<timestamp>_<SOURCE>_<query>_<id>``,
    where ``<timestamp>`` follows ``RESULT_TS_FORMAT`` and ``<SOURCE>`` is
    ``GOOGLE`` or ``BING``.

    Args:
        file: Path (``str`` or ``Path``) of the JSON search result file.

    Returns:
        dict: Record with the keys ``ID``, ``DataHora``, ``Computador``,
        ``Usuário``, ``Homologação``, ``Atributo``, ``Valor`` (JSON string
        with the metadata, the searched word and the word counts) and
        ``Situação`` (``1`` when word counts were extracted, ``-1`` otherwise).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file stem does not have the expected parts, the
            file is not valid JSON, the timestamp does not match
            ``RESULT_TS_FORMAT`` or the source is not supported.
    """
    file = Path(file).absolute()
    # Split from both ends so that a query containing "_" is kept whole: the
    # timestamp and the source are the first two parts, the id is the last.
    ts, source, remainder = file.stem.split('_', 2)
    query, _id = remainder.rsplit('_', 1)

    # NOTE(review): the file is read with the platform default encoding, as
    # before; confirm how the result files are written before pinning one.
    with open(file, 'r') as f:
        content = json.load(f)

    if source == 'GOOGLE':
        fields_of_interest = ["title", "snippet", "og:title", "og:description"]
        items = content.get('items')
    elif source == 'BING':
        fields_of_interest = ["name", "snippet"]
        # Results are nested under webPages -> value; either level may be absent
        items = (content.get('webPages') or {}).get('value')
    else:
        # An unknown source used to fall through and fail further down with an
        # UnboundLocalError on `text`; fail here with an explicit message.
        raise ValueError(
            f"Unsupported search source {source!r} in file name: {file.name}"
        )

    # A response without results produces an empty text
    text = extract_text(items, fields_of_interest) if items else ''

    word_counts = extract_word_counts(text)
    # -1 flags a record without word counts, 1 a record with them
    situacao = -1 if word_counts == "" else 1

    wordcloud_info = {
        "metaData": {
            "Version": WORD_CLOUD_VERSION,
            "Source": source,
            "Mode": WORD_CLOUD_MODE,
            "Fields": '',
            "n_words": N_WORDS,
        },
        "searchedWord": query,
        "cloudOfWords": word_counts,
    }
    wordcloud_info = json.dumps(wordcloud_info, ensure_ascii=False)

    # Convert the file name timestamp to the annotation timestamp format
    wordcloud_datahora = datetime.strptime(ts, RESULT_TS_FORMAT).strftime(ANNOTATION_TS_FORMAT)
    # The query is rendered as <first 5 chars>-<next 2 chars>-<remaining chars>
    wordcloud_homologacao = f"{query[:5]}-{query[5:7]}-{query[7:]}"

    wordcloud = {
        "ID": _id,
        "DataHora": wordcloud_datahora,
        "Computador": "",
        "Usuário": "",
        "Homologação": wordcloud_homologacao,
        "Atributo": "WordCloud",
        "Valor": wordcloud_info,
        "Situação": situacao,
    }

    return wordcloud

In [8]:
# Folder with the JSON search result files
search_history_folder = Path(r'E:\datasets\sch\search_history')
# glob() yields the files in arbitrary order; sort them so that the rows of
# `df` come out in the same order on every run
wc = [
    request_wordcloud_from_file(file)
    for file in sorted(search_history_folder.glob('*.json'))
]
df = pd.DataFrame(wc)
df

Unnamed: 0,ID,DataHora,Computador,Usuário,Homologação,Atributo,Valor,Situação
0,08b73c58-277c-4508-a901-4762e15da37b,23/02/2024 15:01:31,,,02018-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
1,8c115561-3aec-4d87-b945-af4017e706bc,23/02/2024 15:01:31,,,06618-19-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
2,91574630-05f4-4ed5-a595-422ef8b9a6d8,23/02/2024 15:01:32,,,12303-20-01516,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
3,4c268f8d-4e7b-4c80-8e88-17cf06b015f2,23/02/2024 15:01:32,,,03744-21-13015,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",1
4,1dd58bdc-320b-42a8-aaaf-d117110c5777,23/02/2024 15:01:33,,,10746-20-11685,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
...,...,...,...,...,...,...,...,...
45140,75fc028d-8f87-4d9c-8fd1-006e83e2ddb9,08/04/2025 12:00:19,,,01109-19-03257,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
45141,4b29aa63-022b-4100-9586-6e718a6a1898,08/04/2025 13:00:11,,,02913-13-06206,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
45142,6e0eff40-40ba-4a70-b3d9-f06db5d8c26a,08/04/2025 13:00:12,,,02909-13-06206,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1
45143,bfa1ceaf-f83c-4731-80eb-0e60fd6a46df,08/04/2025 14:00:12,,,02611-15-04477,WordCloud,"{""metaData"": {""Version"": 1, ""Source"": ""GOOGLE""...",-1


In [12]:
file_null_annotation = Path(r'C:\Users\maxwelfreitas\AppData\Local\schwordcloud\datasets\annotation\NullAnnotation.xlsx')
# Build a new frame with .assign() instead of adding a column to a filtered
# slice of `df`, which triggers pandas' SettingWithCopyWarning
df_null = df.loc[df['Situação'] == -1].assign(**{'Scarab Post Order': -1})
df_null.to_excel(file_null_annotation, index=False)

Situação
-1    33891
 1    11254
Name: count, dtype: int64

In [10]:
# Spot-check the "Valor" payload of a few records flagged with Situação == -1
null_values = df.loc[df['Situação'] == -1, 'Valor']
# Cap the sample size so the cell still runs when fewer than 10 records match,
# and fix the seed so the same records are printed on every run
for w in null_values.sample(min(10, len(null_values)), random_state=0):
    print(w)

{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "131352110520", "cloudOfWords": ""}
{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "070902113927", "cloudOfWords": ""}
{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "003810903817", "cloudOfWords": ""}
{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "023852012853", "cloudOfWords": ""}
{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "007820703864", "cloudOfWords": ""}
{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "006191300160", "cloudOfWords": ""}
{"metaData": {"Version": 1, "Source": "GOOGLE", "Mode": "API", "Fields": "", "n_words": 25}, "searchedWord": "031081002362", "clou

# Antigos

In [7]:
# file = Path('history/20240223150132752139_GOOGLE_037442113015_4c268f8d-4e7b-4c80-8e88-17cf06b015f2.json').absolute()
# filename = file.stem
# ts, source, query, _id = filename.split('_')

# with open(file, 'r') as f:
#     content = json.load(f)

# if source == 'GOOGLE':
#     items = content['items']
#     text = extract_google_text(items)

# word_counts = extract_word_counts(text)
# if word_counts == "":
#     situacao = -1
# else:
#     situacao = 1

# wordcloud_info = {
#     "metaData": {
#         "Version": WORD_CLOUD_VERSION,
#         "Source": source,
#         "Mode": WORD_CLOUD_MODE,
#         "Fields": '',
#         "n_words": N_WORDS,
#     },
#     "searchedWord": query,
#     "cloudOfWords": word_counts,
# }
# wordcloud_info = json.dumps(wordcloud_info, ensure_ascii=False)

# wordcloud_id = _id
# wordcloud_datahora = datetime.strptime(ts,RESULT_TS_FORMAT).strftime(ANNOTATION_TS_FORMAT)
# wordcloud_computername = ""
# wordcloud_username = ""
# wordcloud_homologacao = f"{query[:5]}-{query[5:7]}-{query[7:]}"

# wordcloud = {
#     "ID": wordcloud_id,
#     "DataHora": wordcloud_datahora,
#     "Computador": wordcloud_computername,
#     "Usuário": wordcloud_username,
#     "Homologação": wordcloud_homologacao,
#     "Atributo": "WordCloud",
#     "Valor": wordcloud_info,
#     "Situação": situacao,
# }

# wordcloud