In [1]:
import json
import os
from re import compile, findall, escape
import re
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from spacy.lang.pt.stop_words import STOP_WORDS
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from re import sub
from nltk import download
# Fetch the NLTK resources used below: 'punkt' backs word_tokenize and
# 'stopwords' backs stopwords.words('portuguese').
# NOTE(review): wordnet-based helpers in this notebook presumably need
# additional corpora that are not downloaded here — confirm.
download('punkt')
download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [96]:
def remove_punct(text):
    """Replace ASCII punctuation with spaces and collapse whitespace.

    Every run of ASCII punctuation characters is replaced by one space,
    then every run of whitespace is collapsed to a single space. Leading
    and trailing spaces are not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Full ASCII punctuation set. The previous character class left out the
    # double quote, the backslash and the closing bracket, so "[x]" lost
    # only its opening bracket and quotes survived into the token stream.
    text = sub(r'''[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+''', ' ', text)
    text = sub(r'\s+', ' ', text)
    return text

def extract_keywords(text):
    """Lower-case the tokens of ``text`` and drop Portuguese stop words.

    A token is dropped when it appears in either the NLTK Portuguese
    stop-word list or spaCy's Portuguese ``STOP_WORDS``.

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        The surviving lower-cased tokens joined by single spaces.
    """
    # Build the combined stop-word set once per call. The previous code
    # re-read the NLTK list for every token and joined the two membership
    # tests with `or`, which kept any word missing from just one list.
    stop_words = set(stopwords.words('portuguese')) | set(STOP_WORDS)
    keywords = [
        word
        for word in (token.lower() for token in word_tokenize(text))
        if word not in stop_words
    ]
    return ' '.join(keywords)

def get_synonyms(text):
    """Collect the Portuguese WordNet lemma names for every token of ``text``.

    Each token from ``word_tokenize`` is looked up with
    ``wordnet.synsets(..., lang="por")`` and the names of all Portuguese
    lemmas of every returned synset are gathered in encounter order.
    Duplicates are kept.

    NOTE(review): presumably needs WordNet corpora that the visible
    ``download`` calls do not fetch — confirm.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lemma names (possibly empty).
    """
    return [
        lemma.name()
        for token in word_tokenize(text)
        for synset in wordnet.synsets(token, lang="por")
        for lemma in synset.lemmas(lang="por")
    ]

def remove_accent(text):
    """Replace lower-case accented vowels with their plain ASCII vowel.

    Only the lower-case characters listed in the table below are mapped;
    upper-case accented letters and other characters such as the cedilla
    are left untouched. Whitespace runs are then collapsed to one space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    vowel_table = (
        ('[úùûü]', 'u'),
        ('[óòõôö]', 'o'),
        ('[íìîï]', 'i'),
        ('[éèêë]', 'e'),
        ('[áàãâä]', 'a'),
    )
    for accented, plain in vowel_table:
        text = sub(accented, plain, text)
    return sub(r'\s+', ' ', text)

def preprocess_lemma(text):
    """Lemmatize each token of ``text`` and join the results with spaces.

    A fresh ``WordNetLemmatizer`` is created on every call and applied to
    each token produced by ``word_tokenize``.

    NOTE(review): NLTK's WordNetLemmatizer is presumably backed by the
    English WordNet, so it may leave Portuguese tokens mostly unchanged —
    confirm this is intended.

    Args:
        text: The string to tokenize and lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    return ' '.join(wnl.lemmatize(piece) for piece in word_tokenize(text))

def preprocess(text, tipo=None):
    """Run the text-cleaning pipeline on ``text``.

    Steps, in order: ``remove_punct`` -> ``extract_keywords`` ->
    (optionally) ``preprocess_lemma`` -> ``remove_accent``.

    Args:
        text: The raw string.
        tipo: Optional extra step. Only ``'lemma'`` has an effect;
            ``'stem'`` and any other value are accepted but currently
            apply no additional processing.

    Returns:
        The cleaned string.
    """
    cleaned = extract_keywords(remove_punct(text))
    if tipo == 'lemma':
        cleaned = preprocess_lemma(cleaned)
    return remove_accent(cleaned)



def merge_dataframes(df, json_list):
    """Build a new one-column frame from ``df['samples']`` plus JSON texts.

    The values of ``df['samples']`` come first, followed by the elements of
    ``obj['texts']`` for every object in ``json_list`` that has a
    ``'texts'`` key, in order. ``df`` itself is not modified.

    Args:
        df: DataFrame with a ``'samples'`` column.
        json_list: Iterable of dict-like objects; those without a
            ``'texts'`` key are ignored.

    Returns:
        A new DataFrame with a single ``'samples'`` column.
    """
    combined = df['samples'].tolist()
    combined.extend(
        sample
        for entry in json_list
        if 'texts' in entry
        for sample in entry['texts']
    )
    return pd.DataFrame({'samples': combined})



def check_intervals(lst):
    """Drop the shorter span of every overlapping pair of spans.

    Each item of ``lst`` is a sequence whose first two elements are the
    span's start and end; any further elements (e.g. a label) are carried
    through untouched. Bounds are compared inclusively, so spans that
    merely touch count as overlapping.

    For every overlapping pair ``(i, j)`` with ``i < j`` the shorter span
    is discarded; when both have the same length the later one (``j``) is
    discarded.

    Args:
        lst: List of spans. It is not modified.

    Returns:
        A new list with the surviving spans in their original order.
    """
    spans = lst[:]
    remove_indices = set()  # set: O(1) membership test in the final filter
    for i in range(len(spans)):
        start_i, end_i = spans[i][0], spans[i][1]
        for j in range(i + 1, len(spans)):
            start_j, end_j = spans[j][0], spans[j][1]
            # Symmetric overlap test. Checking only whether i's endpoints
            # fall inside j misses the case where i strictly contains j,
            # which left nested spans in the output.
            if start_i <= end_j and start_j <= end_i:
                if (end_i - start_i) < (end_j - start_j):
                    remove_indices.add(i)
                else:
                    remove_indices.add(j)
    return [
        span for index, span in enumerate(spans) if index not in remove_indices
    ]


def find_words(text, find_tokens):
    """Locate whole-word occurrences of each token inside ``text``.

    Every token is escaped and wrapped in ``\\b`` word boundaries, then all
    of its matches in ``text`` are reported. Results are grouped by token
    in the order the tokens are given, and by position within each token.

    Args:
        text: The string to search.
        find_tokens: Iterable of literal strings to look for. Empty or
            whitespace-only tokens are ignored.

    Returns:
        A list of dicts with keys ``"text"`` (the token), ``"start_index"``
        and ``"end_index"`` (the match's start and end offsets in ``text``).
    """
    result = []
    for token in find_tokens:
        # An empty token would compile to r'\b\b' and "match" with zero
        # width at every word boundary, producing bogus empty spans.
        if not token or token.isspace():
            continue
        pattern = compile(r'\b{}\b'.format(escape(token)))
        result.extend(
            {
                "text": token,
                "start_index": match.start(),
                "end_index": match.end(),
            }
            for match in pattern.finditer(text)
        )
    return result

def create_json(json_path, content):
    """Append ``content`` to the JSON array stored at ``json_path``.

    If the file exists and is non-empty it must contain a JSON array; the
    array is loaded, ``content`` is appended and the file is rewritten.
    If the file does not exist (or is zero bytes long) it is created with
    a one-element array.

    Args:
        json_path: Path of the JSON file.
        content: JSON-serialisable value to append.

    Raises:
        json.JSONDecodeError: If an existing non-empty file is not valid JSON.
        TypeError: If ``content`` is not JSON-serialisable (the file is left
            untouched in that case).
    """
    data = []
    if os.path.isfile(json_path) and os.path.getsize(json_path) > 0:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    data.append(content)

    # Serialise before opening for writing so a failure cannot leave a
    # half-written file behind.
    payload = json.dumps(data)

    # Mode 'w' truncates. The previous 'r+' / seek(0) rewrite never
    # truncated, so a payload shorter than the old file contents (e.g. a
    # pretty-printed file) left trailing bytes and produced invalid JSON.
    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(payload)

In [97]:
def preparing_data(df, json_data, path):
    """Annotate each sample with keyword spans and append it to a JSON file.

    For every row of ``df`` the ``'samples'`` value is preprocessed, every
    keyword of every entry in ``json_data`` is searched for with
    ``find_words``, overlapping spans are resolved with ``check_intervals``
    and, if any span survives, ``(text, spans)`` is appended to the file at
    ``path`` via ``create_json``. Each span is ``[start, end, LABEL]``.

    Args:
        df: DataFrame with a ``'samples'`` column of strings.
        json_data: Iterable of dicts with a ``"label"`` string and a
            ``"keywords"`` list of strings.
        path: Output JSON file path passed to ``create_json``.
    """
    # Labels and keywords do not depend on the row, so preprocess them once
    # instead of once per sample.
    prepared = []
    for item in json_data:
        label = preprocess(item["label"], 'lemma').upper()
        cleaned = (
            preprocess(keyword, 'lemma')
            for keyword in item["keywords"]
            if keyword != ''
        )
        # dict.fromkeys de-duplicates while keeping first-seen order, so
        # results are reproducible across runs (set order is not). Keywords
        # that preprocessing reduced to '' (e.g. pure stop words) are
        # dropped: searching for '' yields zero-width spans.
        keywords = [keyword for keyword in dict.fromkeys(cleaned) if keyword]
        prepared.append((label, keywords))

    for _, row in df.iterrows():
        text = preprocess(row['samples']).strip()
        list_words_found = [
            [found['start_index'], found['end_index'], label]
            for label, keywords in prepared
            for found in find_words(text, keywords)
        ]
        list_check = check_intervals(list_words_found)
        if list_check:
            create_json(path, (text, list_check))


In [98]:
# Raw samples to annotate; preparing_data reads the 'samples' column.
df = pd.read_excel('samples_NER.xlsx')

# Label/keyword definitions used to tag the samples.
with open(r'keywords_equipaments.json', mode='r', encoding="utf-8") as f:
    json_data = json.load(f)

# NOTE: create_json appends to an existing file, so re-running this cell
# adds the same records to 'json_train_NER.json' again.
preparing_data(df, json_data, 'json_train_NER.json')