In [17]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler 
from spacy.lang.en import English
from spacy import displacy

# Load the cleaned term list, keep only the `pattern` and `pilot` columns,
# and discard rows that are exact duplicates.
data = (
    pd.read_csv("terms_cleaned.csv", sep=",")
    .loc[:, ['pattern', 'pilot']]
    .drop_duplicates()
)

In [15]:
# Empty frame with the same two columns as `data`.
new_data = pd.DataFrame({"pattern": [], "pilot": []})

# Distinct values of the `pattern` column.
data["pattern"].unique()

array(['N', 'N N', 'A N', 'A N N', 'N N N', 'A C A N', 'A N P N',
       'N C N P N', 'A A N', 'A', 'A N N N', 'N P N', 'N P A N',
       'N P N N', 'R', 'N A N', 'N C N N', 'A A N N', 'N V A', 'A N A N',
       'N C A N', 'N N N N', 'A N P N N', 'N N P N', 'N P N P N',
       'A A A N', 'R A N', 'A C A N N'], dtype=object)

In [16]:
# Rows whose pattern is 'N V A'. This value is not displayed: a cell only
# shows its last expression.
data.loc[data['pattern'] == 'N V A']

# Number of rows in the term list.
data.shape[0]

3260

# Split "A C A N" terms into two "A N" terms

In [4]:
# Split every 'A C A N' term into two 'A N' terms: word 0 + word 3, and the
# last two words of the term.
ACAN_rows = data[data['pattern'] == 'A C A N']

# Collect the derived rows in a plain list and build the frame once:
# DataFrame.append in a loop copies the frame on every call and no longer
# exists in pandas >= 2.0.
acan_records = []
for term in ACAN_rows['pilot']:
    words = term.split(" ")
    acan_records.append({'pattern': 'A N', 'pilot': " ".join([words[0], words[3]])})
    acan_records.append({'pattern': 'A N', 'pilot': " ".join(words[-2:])})

# `columns=` keeps both columns present even when no 'A C A N' row exists.
new_data = pd.DataFrame(acan_records, columns=['pattern', 'pilot'])

data = pd.concat([data, new_data], ignore_index=True).drop_duplicates()
data

Unnamed: 0,pattern,pilot
0,N,annotations
1,N N,emotion recognition
2,A N,emotional recognition
3,A N N,speech-based emotion recognition
4,A N N,automated emotion recognition
...,...,...
1716,A N,implicit manner
1717,A N,dull responses
1718,A N,generic responses
1719,A N,mental health


# Lemmatization

In [5]:
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB 
nlp = spacy.load('en')

lemmatizer = nlp.vocab.morphology.lemmatizer

# Lemmatize each term as a NOUN and keep the first candidate returned.
# NOTE(review): the whole (possibly multi-word) term is passed as one string,
# and the resulting table contains forms such as 'range of story progres'.
# Confirm whether lemmatizing word by word was intended.
lemmas = [
    (lemmatizer(term, NOUN)[0], pattern)
    for pattern, term in zip(data['pattern'], data['pilot'])
]

lemmas_df = pd.DataFrame(
    {
        "pattern": [pattern for _, pattern in lemmas],
        "pilot": [lemma for lemma, _ in lemmas],
    }
)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([data, lemmas_df], ignore_index=True).drop_duplicates()
data

Unnamed: 0,pattern,pilot
0,N,annotations
1,N N,emotion recognition
2,A N,emotional recognition
3,A N N,speech-based emotion recognition
4,A N N,automated emotion recognition
...,...,...
3415,N P N N,range of story progres
3426,A N,male emotion
3427,A N,female emotion
3430,A N,dull response


# Generate plural forms of the terms

In [6]:
import inflect
# Try inflect on a single multi-word term and display the result.
engine = inflect.engine()
plural = engine.plural("male emotion")
plural

'male emotions'

In [7]:
# Pluralise every term with inflect, keeping its original pattern.
# NOTE(review): terms that are already plural (e.g. 'annotations') are
# pluralised as well; check the output for malformed forms. TODO confirm.
plurals = [
    (engine.plural(term), pattern)
    for pattern, term in zip(data['pattern'], data['pilot'])
]

plurals_df = pd.DataFrame(
    {
        "pattern": [pattern for _, pattern in plurals],
        "pilot": [term for term, _ in plurals],
    }
)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([data, plurals_df], ignore_index=True).drop_duplicates()

In [8]:
# Write the current term table to 'terms.csv' without the index column.
data.to_csv('terms.csv', index=False)

## rule-based system

In [42]:
# One EntityRuler pattern per term; every term gets the same label.
patterns = [
    {"label": "NLP term", "pattern": term}
    for term in data["pilot"]
]

# English pipeline whose only added component is the EntityRuler.
nlp = English()
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

#### Some examples

In [19]:
# Run the pipeline on a short example sentence.
example_text = "The corpus is a data set, an annotator is given"
doc = nlp(example_text)

In [20]:
# One line per entity: text, start offset, end offset, label.
for span in doc.ents:
    print(f"{span.text} {span.start_char} {span.end_char} {span.label_}")

corpus 4 10 NLP term
data set 16 24 NLP term
annotator 29 38 NLP term


In [21]:
# Render the entities of `doc` with displaCy inside the notebook.
displacy.render(doc, style="ent", jupyter = True)

# neural network approach

### Extract text from the training articles

In [33]:
import os
# Read every file in 'main18/' and split its text on '.' into rough sentences.
# The listing is sorted because os.listdir returns entries in arbitrary order.
Sentences = []
for filename in sorted(os.listdir('main18/')):
    # The context manager closes each file; the handles were previously leaked.
    with open(os.path.join('main18/', filename), 'r') as text:
        Sentences += text.read().split(".")

# Replace line breaks with spaces so each sentence is a single line.
Sentences = [sentence.replace('\n', ' ') for sentence in Sentences]

In [34]:
# Build training examples of the form (sentence, {'entities': [(start, end, label), ...]})
# from the entities the current `nlp` pipeline finds in each sentence.
# Empty sentences and sentences without any entity are skipped while building,
# instead of being removed afterwards with repeated list.pop(i) calls.
TRAIN_DATA = []
for sentence in Sentences:
    doc = nlp(sentence)
    spans = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    if sentence != '' and spans:
        TRAIN_DATA.append((sentence, {'entities': spans}))

In [35]:
import spacy
import random
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score

In [36]:
def train_spacy(data, iterations):
    """Train an NER component in a blank English spaCy pipeline.

    Args:
        data: sequence of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose ``'entities'`` entry is a list of
            ``(start_char, end_char, label)`` tuples.
        iterations: number of passes over the training examples.

    Returns:
        The trained spaCy ``Language`` object.
    """
    # Shallow copy: the per-iteration shuffle below must not reorder the
    # caller's list.
    TRAIN_DATA = list(data)
    nlp = spacy.blank('en')  # create blank Language class

    # Create the NER component if it is missing, otherwise reuse the existing
    # one, so `ner` is always bound.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the annotations.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Disable all other pipes so that only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            # One example per update call.
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            # Losses accumulated over this iteration.
            print(losses)
    return nlp
prdnlp = train_spacy(TRAIN_DATA, 40)


  **kwargs


Statring iteration 0
{'ner': 11116.020741332337}
Statring iteration 1
{'ner': 7772.422429557916}
Statring iteration 2
{'ner': 6894.399181174071}
Statring iteration 3
{'ner': 5987.574831980829}
Statring iteration 4
{'ner': 5483.9327484651585}
Statring iteration 5
{'ner': 5040.783829466195}
Statring iteration 6
{'ner': 4629.612497414617}
Statring iteration 7
{'ner': 4332.434756173612}
Statring iteration 8
{'ner': 4008.6702479204596}
Statring iteration 9
{'ner': 3828.6495593355817}
Statring iteration 10
{'ner': 3601.5171727655297}
Statring iteration 11
{'ner': 3164.35928676709}
Statring iteration 12
{'ner': 3225.916890523705}
Statring iteration 13
{'ner': 3037.053566150275}
Statring iteration 14
{'ner': 2745.627204562233}
Statring iteration 15
{'ner': 2758.500604944274}
Statring iteration 16
{'ner': 2583.354730319179}
Statring iteration 17
{'ner': 2463.4011183202724}
Statring iteration 18
{'ner': 2254.3209167610135}
Statring iteration 19
{'ner': 2089.409261751411}
Statring iteration 20
{'

In [37]:
# Save the trained pipeline to disk under this path.
modelfile = "neural network annotator"
prdnlp.to_disk(modelfile)

# Testing the two systems

### Loading test articles

In [66]:
import os
import re

# Read every test article with line breaks replaced by spaces.
# The listing is sorted because os.listdir returns entries in arbitrary order.
article_texts = []
for filename in sorted(os.listdir('test2/')):
    # The context manager closes the file even if reading fails.
    with open(os.path.join('test2/', filename), 'r') as text:
        article_texts.append(text.read().replace('\n', ' '))

# Join all articles into one string and collapse runs of spaces.
articles = " ".join(article_texts)
articles = re.sub(' +', ' ', articles)

# Reference ("true") terms with their frequencies.
test_true = pd.read_csv("test_cleaned.tsv", sep="\t")[['pilot','freq']]
test_true

['RAVDESS',
 'Enc-bef',
 'corpus',
 'speech recognition',
 'automatic speech recognition',
 'Enc-aft',
 'CREMA-D',
 'datasets',
 'decoder',
 'emotional speech',
 'manual transcripts',
 'encoder',
 'Speaker Identity',
 'thankfulness',
 'Seq2Seq models',
 'LSTM',
 'MSP-IMPROV Target',
 'MSP-IMPROV Target RAVDESS',
 'Actor Identity',
 'speech recognition performance',
 'Confusion matrix of model',
 'dialogue generation',
 'Automatic Dialogue Generation',
 'conversational agent',
 'Seq2Seq',
 'dataset',
 'statistical significance',
 'Statistical Significance of Emotion',
 'dialogue models',
 'emotion classifier',
 'emotion mining classifier',
 'neutral speech',
 'target utterances',
 'vector',
 'APIs',
 'automatic speech recognition',
 'emotion mining',
 'emotion mining classifier',
 'recognition accuracy',
 'lexical content',
 'recognition systems',
 'Word Error Rate',
 '1-CR',
 'emotion mining classifier',
 'trainable',
 'emotional content',
 'RAVDESS and MSP-IMPROV Target',
 'ASR system

### Rule-based system

In [74]:
import collections

# Run the `nlp` pipeline over the concatenated test articles and count how
# often each entity text occurs.
doc = nlp(articles)
entities = [ent.text for ent in doc.ents]
RB_results = collections.Counter(entities)

displacy.render(doc, style="ent", jupyter = True)

### Supervised trained model system

In [75]:
import spacy
import os

# Reload the saved model and annotate the concatenated test articles.
prdnlp = spacy.load("neural network annotator")
doc = prdnlp(articles)

# Start from a fresh list. Appending to the list left over from the rule-based
# cell would add that system's entities to this system's counts.
entities = [ent.text for ent in doc.ents]

NN_results = collections.Counter(entities)
displacy.render(doc, style="ent", jupyter = True)

#### Showing some differences on the results given by the two systems

In [83]:
import plotly.graph_objects as go
import random

# Pick up to 15 distinct reference terms. Sampling without replacement from
# the de-duplicated list avoids repeated categories on the x axis, which
# random.choice called 15 times could produce.
candidate_terms = list(dict.fromkeys(test_true.pilot.values))
terms = random.sample(candidate_terms, min(15, len(candidate_terms)))

# Grouped bars: for each term, the count from each system (a Counter returns
# 0 for terms it never saw).
fig = go.Figure(data=[
    go.Bar(name='Rule-based model', x=terms, y=[RB_results[term] for term in terms]),
    go.Bar(name='Supervised model', x=terms, y=[NN_results[term] for term in terms]),
])

fig.update_layout(barmode='group')
fig.show()