# Funções úteis

In [5]:
import json
import pandas as pd
import os

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.pt.stop_words import STOP_WORDS
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from re import sub
from nltk import download
download('punkt')
download('stopwords')

import openai

from re import compile, findall, escape

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def create_json(json_path, content):
    """Append ``content`` to the JSON list stored at ``json_path``.

    When the file does not exist yet it is created holding ``[content]``.
    Otherwise the existing top-level value is loaded, ``content`` is appended
    to it and the file is rewritten in place.

    Args:
        json_path: Path of the JSON file to create or extend.
        content: JSON-serialisable value to append.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
        AttributeError: If the existing top-level value has no ``append``.
    """
    if os.path.isfile(json_path):
        with open(json_path, 'r+', encoding='utf-8') as f:
            data = json.load(f)
            data.append(content)
            f.seek(0)
            json.dump(data, f, indent=4)
            # The new serialisation can be shorter than the old file contents
            # (e.g. the file was written with a wider indent); cut the stale
            # tail so the file stays valid JSON.
            f.truncate()
    else:
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump([content], f, indent=4)

def load_json(file):
    """Return the parsed contents of the UTF-8 encoded JSON file ``file``."""
    with open(file, 'r', encoding='utf-8') as handle:
        return json.loads(handle.read())

def load_df(file):
    """Read the Excel file ``file`` with ``pd.read_excel`` and return the result."""
    return pd.read_excel(file)



def remove_punct(text):
    """Replace ASCII punctuation with spaces and collapse whitespace runs.

    Every run of punctuation characters becomes a single space, then any run
    of whitespace is squeezed to one space. Leading/trailing spaces are not
    stripped.

    Args:
        text: Input string.

    Returns:
        The string without ASCII punctuation.
    """
    # Full ASCII punctuation set. The previous class left out the double
    # quote, the backslash and the closing bracket, so those characters
    # survived the clean-up; the hyphen and brackets are escaped explicitly
    # instead of relying on an accidental ",-." range.
    text = sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+""", ' ', text)
    text = sub(r'\s+', ' ', text)
    return text

def extract_keywords(text):
    """Lower-case ``text`` and drop Portuguese stop words.

    The text is tokenised with ``word_tokenize``; a token is kept only when
    it appears in neither the NLTK Portuguese stop-word list nor spaCy's
    ``STOP_WORDS``.

    Args:
        text: Input string.

    Returns:
        The surviving lower-cased tokens joined by single spaces.
    """
    # Read the NLTK list once per call instead of once per token, and use a
    # set for constant-time membership tests.
    nltk_stopwords = set(stopwords.words('portuguese'))
    keywords = []
    for token in word_tokenize(text):
        word = token.lower()
        # A stop word in EITHER list must be discarded. The previous `or`
        # kept every word that was missing from at least one of the lists,
        # i.e. it only removed words present in both.
        if word not in nltk_stopwords and word not in STOP_WORDS:
            keywords.append(word)
    return ' '.join(keywords)

def preprocess_stem(text):
    """Stem every token of ``text`` with the Portuguese Snowball stemmer.

    Returns the stems joined by single spaces.
    """
    snowball = SnowballStemmer("portuguese")
    stemmed = [snowball.stem(piece) for piece in word_tokenize(text)]
    return str(' '.join(stemmed))

def remove_accent(text):
    """Replace lower-case accented vowels with the plain vowel.

    Only the lower-case characters listed in the patterns below are mapped;
    everything else is left untouched. Whitespace runs are then collapsed to
    a single space.
    """
    replacements = (
        ('[úùûü]', 'u'),
        ('[óòõôö]', 'o'),
        ('[íìîï]', 'i'),
        ('[éèêë]', 'e'),
        ('[áàãâä]', 'a'),
    )
    for accented, plain in replacements:
        text = sub(accented, plain, text)
    return sub(r'\s+', ' ', text)

def preprocess(text):
    """Run the full text-normalisation pipeline on ``text``.

    Steps, in order: punctuation removal, stop-word removal (which also
    lower-cases), stemming, accent removal.
    """
    pipeline = (remove_punct, extract_keywords, preprocess_stem, remove_accent)
    for step in pipeline:
        text = step(text)
    return text



def chatgpt(txt):
    """Send ``txt`` as a single user message and return the model's reply.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.

    Args:
        txt: Prompt text sent with role ``"user"``.

    Returns:
        The text content of the first completion choice.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
    """
    # SECURITY: the key used to be hard-coded in this function. Secrets must
    # never live in source control; the key that was committed here has to be
    # treated as compromised and revoked.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable is not set; "
            "export it before calling chatgpt()."
        )
    openai.api_key = api_key
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": txt}
        ]
    )
    return completion.choices[0].message.content


def find_words(text, find_tokens):
    """Locate whole-word occurrences of each token inside ``text``.

    Args:
        text: String to search.
        find_tokens: Iterable of literal tokens (may contain several words).

    Returns:
        A list with one dict per match, containing:
        ``text`` (the token), ``start_index`` / ``end_index`` (inclusive
        character offsets) and ``start_position`` / ``end_position``
        (inclusive word offsets, counting ``\\w+`` words).
    """
    result = []
    for token in find_tokens:
        # An empty or blank token would compile to r'\b\b' (or match bare
        # whitespace) and "match" at every word boundary, producing
        # zero-length or meaningless spans.
        if not token or not token.strip():
            continue
        pattern = compile(r'\b{}\b'.format(escape(token)))
        for match in pattern.finditer(text):
            # Number of words preceding the match == word index of its start.
            start_position = len(findall(r'\b\w+\b', text[:match.start()]))
            result.append({
                "text": token,
                "start_index": match.start(),
                "end_index": match.end() - 1,
                "start_position": start_position,
                "end_position": start_position + len(token.split()) - 1,
            })
    return result

# Rotina de Treino

### Rotulagem de dados para treino:

In [3]:
import time
import random
import re

In [13]:
def data_generation(df, column_label, column_keyword, qtd_samples, qtd_values_per_req, num_prompt, path):
    """Generate synthetic texts per row of ``df`` via ``chatgpt`` and store them.

    For every row, the keyword cell is split on ``.`` and each keyword is
    sent to the model until at least ``qtd_samples`` texts were collected;
    surplus texts are randomly sampled down to ``qtd_samples``. One record
    ``{'classe': <label>, 'texts': [...]}`` per row is appended to ``path``
    through ``create_json``.

    Args:
        df: DataFrame providing the label and keyword columns.
        column_label: Column whose value is stored under ``'classe'``.
        column_keyword: Column holding the ``.``-separated keywords (strings).
        qtd_samples: Number of texts to keep per row.
        qtd_values_per_req: Number of examples requested per API call.
        num_prompt: Prompt selector: 0 = technical text, 1 = customer
            problem report.
        path: JSON file the records are appended to.
    """
    # Local import: keeps this function self-contained.
    import ast

    for _, row in df.iterrows():
        keywords = row[column_keyword].split('.')
        new_texts = []
        while len(new_texts) < qtd_samples:
            for word in keywords:

                prompt_text_tec = f"faça {qtd_values_per_req} exemplos de textos tecnicos de engenharia/industria sobre o seguinte tema: {word}. retorne no seguinte formato: lista em python, exemplo: ['exemplo1','exemplo2'...]. ps(retorne somente o corpo da lista somente com as strings dos exemplos), não envie textos complementares!"
                prompt_problem = f"simule ser um cliente de uma empresa de manutenção industrial. faça {qtd_values_per_req} exemplos de textos que descrevem/relatam o seguinte problema: {word}. retorne no seguinte formato: lista em python, exemplo: ['exemplo1','exemplo2'...]. ps(retorne somente o corpo da lista somente com as strings dos exemplos), não envie textos complementares!"
                prompt = [prompt_text_tec, prompt_problem][num_prompt]

                # Keep asking until the reply parses as a list/tuple literal.
                while True:
                    try:
                        response = chatgpt(prompt)
                    except Exception:
                        # Back off once and retry; a second failure
                        # propagates. `except Exception` (not a bare except)
                        # lets Ctrl-C interrupt the run.
                        time.sleep(60)
                        response = chatgpt(prompt)
                    try:
                        # SECURITY: the reply is untrusted model output, so
                        # it is parsed as a literal and never eval()'d.
                        parsed = ast.literal_eval(response)
                    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                        continue
                    # A bare string would be spliced in character by
                    # character; only accept real sequences of examples.
                    if isinstance(parsed, (list, tuple)):
                        new_texts += parsed
                        break
        if len(new_texts) > qtd_samples:
            new_texts = random.sample(new_texts, qtd_samples)
        dict_ = {'classe': row[column_label], 'texts': new_texts}
        create_json(path, dict_)


def check_intervals(lst):
    """Drop spans that overlap a span kept earlier in the list.

    Spans are processed in order; the first one always wins. Spans that
    merely touch (one ends where the next starts) are both kept.

    Args:
        lst: Iterable of ``(start, end, label)`` triples.

    Returns:
        A list of ``[start, end, label]`` lists without overlaps.
    """
    spans = []
    for start, end, label in lst:
        overlap = any(
            kept_start <= start < kept_end
            or kept_start < end <= kept_end
            # The two endpoint checks above miss a candidate that completely
            # encloses an already-kept span (neither endpoint falls inside).
            or (start <= kept_start and kept_end <= end)
            for kept_start, kept_end, _ in spans
        )
        if not overlap:
            spans.append((start, end, label))
    return [[start, end, label] for start, end, label in spans]

def preparing_data(df, column_label, column_keyword, path, labels_with_texts):
    """Build NER training examples and append them to the JSON file ``path``.

    Row ``i`` of ``df`` supplies the keywords searched in the texts of
    ``labels_with_texts[i]``; the two inputs are paired purely by position.
    Every keyword occurrence becomes an entity ``[start, end, label]`` with
    an exclusive ``end``.

    Args:
        df: DataFrame whose ``column_keyword`` cells hold keywords separated
            by ``,``, ``;`` or ``.``.
        column_label: Key read from each entry of ``labels_with_texts`` to
            obtain the entity label.
        column_keyword: Name of the keyword column in ``df``.
        path: JSON file the result is appended to via ``create_json``.
        labels_with_texts: Sequence of dicts with a ``"texts"`` list and the
            ``column_label`` key.
    """
    words = []
    for _, row in df.iterrows():
        # Character class, not r',|;|.': the unescaped dot matched ANY
        # character, so the split consumed the whole cell and every keyword
        # came out empty.
        raw_keywords = re.split(r'[,;.]', row[column_keyword])
        cleaned = [preprocess(keyword).strip() for keyword in raw_keywords]
        # Separators at the edges (or keywords made only of stop words)
        # yield empty strings, which must not be searched for.
        words.append([keyword for keyword in cleaned if keyword])

    dict_train = {}
    for i, labelled in enumerate(labels_with_texts):
        label = str(labelled[column_label])
        examples = []
        for text in labelled["texts"]:
            text = preprocess(text).strip()
            # find_words reports an inclusive end index; the stored span end
            # is exclusive, hence the +1.
            entities = [
                [found['start_index'], found['end_index'] + 1, label]
                for found in find_words(text, words[i])
            ]
            # Overlaps are removed once, as each example is built, instead
            # of re-filtering every stored label on every iteration.
            examples.append((text, {"entities": check_intervals(entities)}))
        dict_train[label] = examples

        # NOTE(review): kept from the original -- a snapshot of the
        # cumulative dict is appended after EVERY label, so the file ends up
        # with one growing dict per label. Confirm whether a single write
        # after the loop was intended.
        create_json(path, dict_train)


### Treinar o modelo

In [None]:
!python -m spacy download "pt_core_news_sm"

In [None]:
from spacy.util import minibatch, compounding
from spacy import blank, training, load
from pathlib import Path
# Load the "pt_core_news_sm" pipeline into a notebook-level variable.
# train_model() and classifier() below each bind their own local `nlp`,
# so neither of them reads this global.
nlp = load("pt_core_news_sm")

In [None]:
# Exemplo de dado de entrada
# data_dict = {
#     "BOMBA": [
#     ("Bomba centrífuga", {"entities": [(0, 5, "BOMBA")]}),
#     ]}

In [None]:
def save_model(model, path_model):
    """Write ``model`` to the directory ``path_model``, creating it if needed.

    Args:
        model: Object exposing ``to_disk(path)``.
        path_model: Target directory (``str`` or ``Path``).
    """
    path_model = Path(path_model)
    # parents=True lets nested output paths work; exist_ok=True replaces the
    # separate exists() check and removes the check-then-create race.
    path_model.mkdir(parents=True, exist_ok=True)
    model.to_disk(path_model)
    print("Model saved to:", path_model)

def train_model(data_dict, iterations, path_model):
    """Train a blank Portuguese NER pipeline and save it to ``path_model``.

    Args:
        data_dict: Mapping ``label -> [(text, annotations), ...]``; the keys
            are registered as entity labels and all examples are pooled.
        iterations: Number of passes over the shuffled training data.
        path_model: Directory handed to ``save_model``.
    """
    nlp = blank("pt")
    nlp.add_pipe("ner", name="ner", last=True)

    # Register one entity label per key of the input mapping.
    for entity_label in data_dict:
        nlp.get_pipe("ner").add_label(entity_label)

    # Pool the examples of every label into one flat list.
    train_data = [
        (text, annotations)
        for examples in data_dict.values()
        for text, annotations in examples
    ]
    nlp.begin_training()

    for epoch in range(1, iterations + 1):
        random.shuffle(train_data)
        losses = {}
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for batch in minibatch(train_data, size=batch_sizes):
            example_batch = [
                training.example.Example.from_dict(nlp.make_doc(text), annotation)
                for text, annotation in batch
            ]
            nlp.update(example_batch, losses=losses)
        print("Iteration:", epoch, "Loss:", losses)

    save_model(nlp, path_model)

# Rotina de teste

### Rotulagem de dados para teste

In [16]:
# Read the troubleshooting spreadsheet and keep one row per distinct
# ('problem', 'equipament') pair.
df = load_df(r'..\portugues\pt_troubleshooting.xlsx')
df = df[['problem','equipament']].drop_duplicates()
# Apply the text-normalisation pipeline to every cell of both columns.
prepared_df = df.applymap(preprocess)
# Generate 10 texts per row (5 requested per API call) using prompt index 1,
# the "customer reports a problem" prompt. 'equipament' provides the label and
# 'problem' the keywords; one record per row is appended to teste.json.
data_generation(prepared_df,'equipament','problem',10,5,1,'teste.json')

# Testar o modelo

In [1]:
from spacy import load, displacy

In [20]:
def classifier(text, path):
    """Run the pipeline stored at ``path`` on ``text`` and render its entities.

    Returns whatever ``displacy.render(..., style="ent")`` returns for the
    processed document. To obtain plain ``(text, label)`` pairs instead,
    iterate over the document's ``ents``.
    """
    pipeline = load(path)
    return displacy.render(pipeline(text), style="ent")

# Manual check: normalise a sample paragraph with the same preprocess()
# pipeline used for the training texts, then run it through the pipeline
# loaded from "NER_equipament".
text = preprocess("1. As valvulas de gaveta sao ideais para aplicações onde ha a necessidade de um fluxo totalmente obstruido. Possuem tambem um alto desempenho quanto a vedaçao. 1. bomba e Os rolamentos são componentes essenciais em maquinas rotativas, garantindo a redução do atrito e o suporte de cargas. Eles podem ser classificados em diversos tipos, como o rolamento de esferas, o rolamento de rolos, o rolamento autocompensador de esferas, entre outros.  Os acionamentos por corrente em v sadadsa de filtro de óleo para garantir o funcionamento de motores elétricos")
a = classifier(text, r"NER_equipament")
# Bare expression so the notebook displays the cell's result.
a