## Terminology - Project
Authors: Cécile MACAIRE & Ludivine ROBERT 

## Libraries

In [1]:
import pandas as pd
import spacy
spacy_nlp = spacy.load('en_core_web_sm')
from termcolor import colored

## Code 

In [7]:
# Read data from lexicon and store in dataframe
def read_data(file):
    """Load a tab-separated file into a pandas DataFrame.

    ``file`` is handed to ``pandas.read_csv`` unchanged, with a tab
    character as the field separator.
    """
    lexicon = pd.read_csv(file, sep='\t')
    return lexicon

def lemma_lexicon(dataframe):
    """Lemmatize every entry of the 'pilot' column with spaCy.

    Each term is run through ``spacy_nlp`` and the lemmas of its tokens are
    joined with single spaces. The same ``dataframe`` object that was passed
    in is returned.

    NOTE(review): the result of ``Series.replace`` at the end is neither
    assigned nor applied in place, so the computed lemmas do not appear to be
    stored anywhere and the 'lemma' column looks unchanged on return.
    Confirm the intent before "fixing" this: ``rules`` splits the lexicon
    lemmas on '-', which presumably relies on the current column format.
    """
    terms = dataframe['pilot']
    lemma = []
    for el in terms:
        doc = spacy_nlp(el)
        # One lemma per spaCy token, re-joined with single spaces.
        tmp = [token.lemma_ for token in doc]
        lemma.append(' '.join(tmp))
    # NOTE(review): return value discarded (no assignment, no inplace=True);
    # calling replace() with a list and no `value` may also be rejected by
    # recent pandas versions -- TODO confirm.
    dataframe['lemma'].replace(lemma)
    return dataframe
  
def select_data(dataframe):
    """Return only the 'pattern', 'pilot' and 'lemma' columns, in that order."""
    kept_columns = ['pattern', 'pilot', 'lemma']
    return dataframe[kept_columns]

In [8]:
def read_file(file):
    """Return the whole content of the text file at ``file`` as one string."""
    with open(file, 'r') as handle:
        content = handle.read()
    return content
        
def lemma_posttag(file):
    """Tokenize, lemmatize and POS-tag a text file with spaCy.

    The file is read with ``read_file`` and processed by ``spacy_nlp``.

    Returns a DataFrame with one row per spaCy token and four columns:

    * ``tokens``  -- the token text,
    * ``lemma``   -- the token lemma,
    * ``pos``     -- spaCy's coarse part-of-speech tag (``token.pos_``),
    * ``pattern`` -- a one-letter code derived from ``pos``: 'N' (NOUN),
      'V' (VERB), 'A' (ADJ), 'C' (CCONJ/SCONJ), 'P' (PART/ADP), and the
      empty string for every other tag.
    """
    # spaCy coarse tag -> one-letter code stored in the 'pattern' column.
    pattern_codes = {
        'NOUN': 'N',
        'VERB': 'V',
        'ADJ': 'A',
        'CCONJ': 'C',
        'SCONJ': 'C',
        'PART': 'P',
        'ADP': 'P',
    }
    doc = spacy_nlp(read_file(file))
    texts = [token.text for token in doc]
    lemmas = [token.lemma_ for token in doc]
    pos_tags = [token.pos_ for token in doc]
    # Tags without an entry in the table map to the empty string.
    patterns = [pattern_codes.get(tag, '') for tag in pos_tags]
    return pd.DataFrame({
        'tokens': texts,
        'lemma': lemmas,
        'pos': pos_tags,
        'pattern': patterns,
    })

In [9]:
# Link terms with text
def annotate(terms_dataframe, text_dataframe):
    """Bracket every occurrence of a lexicon term in the text tokens.

    ``rules`` is applied first (hyphenated-term handling). Then every term of
    ``terms_dataframe['lemma']`` is split on single spaces and compared with
    the run of text lemmas starting at each position; on a match, '[' is
    prepended to the first token of the run and ']' is appended to the last
    one (a one-word term becomes '[token]').

    Parameters:
        terms_dataframe: frame with a 'lemma' column of space-separated terms.
        text_dataframe: frame with 'tokens' and 'lemma' columns (and whatever
            ``rules`` needs), one row per token. Its 'tokens' column is
            overwritten with the annotated tokens.

    Returns:
        The same ``text_dataframe`` object, with the annotated 'tokens'.

    Overlapping or nested matches each add their own brackets; positions are
    not skipped after a match.
    """
    rules(terms_dataframe, text_dataframe)

    # Read the columns after rules() has run so its brackets are kept.
    lemmas = text_dataframe['lemma'].tolist()
    tokens = text_dataframe['tokens'].tolist()
    terms = [term.split(' ') for term in terms_dataframe['lemma']]

    for start in range(len(lemmas)):
        for words in terms:
            end = start + len(words)
            # Slice comparison handles any term length and cannot run past
            # the end of the text: a truncated slice is simply not equal.
            if lemmas[start:end] == words:
                tokens[start] = '[' + tokens[start]
                tokens[end - 1] = tokens[end - 1] + ']'

    # Write the column back in one step instead of chained item assignment,
    # which does not update the frame under pandas Copy-on-Write.
    text_dataframe['tokens'] = tokens
    return text_dataframe

In [10]:
def rules(terms_dataframe, text_dataframe):
    """Bracket hyphenated lexicon terms in the text, in place.

    Every term of ``terms_dataframe['lemma']`` is split on '-' and ' ' into
    words ``t``. At each text position ``i`` whose lemma equals ``t[0]`` and
    is followed by a '-' lemma, two rules are tried in order:

    1. "X-to-Y" (three-word terms only): the following lemmas are
       'to', '-', ``t[2]``. The span ends at ``i+4``, or at ``i+5`` when the
       token after it has pattern 'N'.
    2. Hyphenated compound: the lemma at ``i+2`` equals ``t[1]`` and each of
       the ``len(t) - 2`` following tokens (at most three) has pattern 'N'.
       The span ends on the last of those tokens.

    On a match, '[' is prepended to the token at ``i`` and ']' is appended to
    the token that ends the span. Spans that would run past the end of the
    text are treated as non-matches.

    Parameters:
        terms_dataframe: frame with a 'lemma' column of terms.
        text_dataframe: frame with 'tokens', 'lemma' and 'pattern' columns,
            one row per token. Its 'tokens' column is overwritten.

    Returns:
        None; ``text_dataframe`` is modified in place.
    """
    terms = [term.replace('-', ' ').split(' ') for term in terms_dataframe['lemma']]
    lemmas = text_dataframe['lemma'].tolist()
    patterns = text_dataframe['pattern'].tolist()
    tokens = text_dataframe['tokens'].tolist()
    size = len(lemmas)

    def is_noun(position):
        # Out-of-range positions count as "not a noun" instead of raising.
        return position < size and patterns[position] == 'N'

    for i, token in enumerate(lemmas):
        for t in terms:
            # Both rules need "<t[0]> -" at position i.
            if len(t) < 2 or token != t[0] or lemmas[i + 1:i + 2] != ['-']:
                continue

            end = None
            if len(t) == 3 and lemmas[i + 2:i + 5] == ['to', '-', t[2]]:
                # Rule 1: X - to - Y, optionally extended by a trailing noun.
                end = i + 5 if is_noun(i + 5) else i + 4
            elif lemmas[i + 2:i + 3] == [t[1]]:
                # Rule 2: X - Y followed by len(t) - 2 nouns. Only terms of
                # up to five words are handled, as in the original rules.
                trailing = len(t) - 2
                if trailing <= 3 and all(is_noun(i + 3 + k) for k in range(trailing)):
                    end = i + 2 + trailing

            if end is not None:
                tokens[i] = '[' + tokens[i]
                tokens[end] = tokens[end] + ']'

    # Write the column back in one step instead of chained item assignment,
    # which does not update the frame under pandas Copy-on-Write.
    text_dataframe['tokens'] = tokens

In [11]:
if __name__ == "__main__":
    # Lexicon side: load the TSV, run the lemmatization step, keep 3 columns.
    init_data = read_data('tts-lexicon2.tsv')
    change_lemma = lemma_lexicon(init_data)
    data = select_data(change_lemma)

    # Text side: one row per token with lemma, POS tag and pattern code.
    text_dataframe = lemma_posttag('test.txt')

    # Show the annotated frame, then the annotated tokens as a single line.
    annotated = annotate(data, text_dataframe)
    print(annotated)
    annotated_tokens = text_dataframe['tokens'].to_list()
    print(' '.join(annotated_tokens))

       tokens       lemma    pos pattern
0       [[hmm         hmm   INTJ        
1           -           -  PUNCT        
2       based        base   VERB       V
3  generation  generation   NOUN       N
4  synthesis]   synthesis   NOUN       N
5   approach]    approach   NOUN       N
6          \n          \n  SPACE        
[[hmm - based generation synthesis] approach] 

