<a href="https://colab.research.google.com/github/Heshamovic/GP-NLP-Movies-Queries-Entity-Extraction/blob/master/Entity_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Entity Extraction***
In this notebook we will illustrate how our model work to extract entity extraction for movies queries.

We used [this blog](https://nlpforhackers.io/named-entity-extraction/) for helping in implementing our entity extraction.

Our data didn't have all the indormation for the model to be trained. So, first we run the data on [Groningen Meaning Bank Corprus version 2.2.0](https://gmb.let.rug.nl/data.php) to fill the missing part in our data. 

# ***Imports***

In [0]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags, ChunkParserI
from nltk.stem.snowball import SnowballStemmer 
from nltk.tag import ClassifierBasedTagger
import os
import collections
import string
import pickle
import requests 
from collections import Iterable

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')


# **Loading Data**
We will use 2 datasets for this model gmb-2.2.0 to extract the NER for each sentence and [this data](https://groups.csail.mit.edu/sls/downloads/movie/) for movie queries intent.

First we will need to download the gmb-2.2.0 zipped file.




In [0]:
file_url = "https://gmb.let.rug.nl/releases/gmb-2.2.0.zip"
    
r = requests.get(file_url, stream = True)  

with open("/content/drive/My Drive/gmb-2.2.0.zip", "wb") as file:  
    for block in r.iter_content(chunk_size = 1024): 
         if block:  
             file.write(block)  

!unzip '/content/drive/My Drive/gmb-2.2.0.zip'

# ***Preprocessing***

In [0]:
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
 
    # init the stemmer
    stemmer = SnowballStemmer('english')
 
    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)
 
    # shift the index with 2, to accommodate the padding
    index += 2
 
    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all([True for c in word if c in string.ascii_lowercase])
 
    allcaps = word == word.capitalize()
    capitalized = word[0] in string.ascii_uppercase
 
    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase
 
    nextallcaps = prevword == prevword.capitalize()
    nextcapitalized = prevword[0] in string.ascii_uppercase
 
    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,
 
        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,
 
        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,
 
        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,
 
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
 
        'prev-iob': previob,
 
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
 
        'all-caps': allcaps,
        'capitalized': capitalized,
 
        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,
 
        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }
 

In [0]:
def to_conll_iob(annotated_sentence):
    """
    `annotated_sentence` = list of triplets [(w1, t1, iob1), ...]
    Transform a pseudo-IOB notation: O, PERSON, PERSON, O, O, LOCATION, O
    to proper IOB notation: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O
    """
    proper_iob_tokens = []
    for idx, annotated_token in enumerate(annotated_sentence):
        tag, word, ner = annotated_token
 
        if ner != 'O':
            if idx == 0:
                ner = "B-" + ner
            elif annotated_sentence[idx - 1][2] == ner:
                ner = "I-" + ner
            else:
                ner = "B-" + ner
        proper_iob_tokens.append((tag, word, ner))
    return proper_iob_tokens
    
def read_gmb(corpus_root):
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    file_content = file_handle.read().decode('utf-8').strip()
                    annotated_sentences = file_content.split('\n\n')
                    for annotated_sentence in annotated_sentences:
                        annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
 
                        standard_form_tokens = []
 
                        for idx, annotated_token in enumerate(annotated_tokens):
                            annotations = annotated_token.split('\t')
                            word, tag, ner = annotations[0], annotations[1], annotations[3]
 
                            if ner != 'O':
                                ner = ner.split('-')[0]
                            if tag in ('LQU', 'RQU'):   # Make it NLTK compatible
                                tag = "``"
 
                            standard_form_tokens.append((word, tag, ner))
 
                        conll_tokens = to_conll_iob(standard_form_tokens)
                        # Make it NLTK Classifier compatible - [(w1, t1, iob1), ...] to [((w1, t1), iob1), ...]
                        # Because the classfier expects a tuple as input, first item input, second the class
                        yield [((w, t), iob) for w, t, iob in conll_tokens]

In [0]:
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
 
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)
 
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
 
        # Transform the result from [((w1, t1), iob1), ...] 
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
 
        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
 

In [0]:
def prepareSentence(s):
    ans = []
    sent = ''
    for i in s:
        sent = sent + i[0] + ' ' 
    t = chunker.parse(pos_tag(word_tokenize(sent[:-1])))
    for i, j in zip(s, t):
        x = ''
        if len(j) > 1:
            x = j[1]
        else:
            start = False
            for k in j[0]:
                if start:
                    x = x + k
                if k == '/':
                    start = True
        l = [i[0], x, i[1]]
        ans.append(l)
    return ans

In [0]:
def prepare_movies_data():
    with open('/content/drive/My Drive/Colab Notebooks/Train.txt', 'r') as f:
        x = f.readlines()
    X_train = []
    s = []
    for i in x:
        if len(i.split()) < 2:
            X_train.append(prepareSentence(s))
            s = []
            continue
        s.append((i.split()[1], i.split()[0]))
    return X_train 

In [0]:
def read_movies_data(x_train):
    for sentence in x_train:
        standard_form_tokens = []
        for w in sentence:
            word, tag, ner = w[0], w[1], w[2]
            if ner != 'O':
                ner = ner.split('-')[1]
            if tag in ('LQU', 'RQU'):   # Make it NLTK compatible
                tag = "``"
 
            standard_form_tokens.append((word, tag, ner))
        conll_tokens = to_conll_iob(standard_form_tokens)
        # Make it NLTK Classifier compatible - [(w1, t1, iob1), ...] to [((w1, t1), iob1), ...]
        # Because the classfier expects a tuple as input, first item input, second the class
        yield [((w, t), iob) for w, t, iob in conll_tokens]


In [0]:
corpus_root = "gmb-2.2.0"
reader = read_gmb(corpus_root)
data = list(reader)
training_ratio = 0.7
training_samples = data[:int(len(data) * training_ratio)]
test_samples = data[int(len(data) * training_ratio):]
chunker = NamedEntityChunker(training_samples)
    
x_train = prepare_movies_data()
reader = read_movies_data(x_train)
data = list(reader)
    

# ***Training***

We combined all the data found on the link provided before to get large data of about 20000 sentence and tried several training to testing ratio and 84% was the one with best accuracy

In [0]:
training_ratio = 0.84
training_samples = data[:int(len(data) * training_ratio)]
test_samples = data[int(len(data) * training_ratio):]
chunker = NamedEntityChunker(training_samples)

# **Testing**

In [12]:
score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in test_samples])
print("Training Ratio: {0}, Accuracy: {1}".format(training_ratio, score.accuracy()))

Training Ratio: 0.84, Accuracy: 0.7713279581885975


Enter any query to see its entity extraction tree

In [13]:
print(chunker.parse(pos_tag(word_tokenize("show me romantic comedy movies"))))

(S show/VB me/PRP (GENRE romantic/JJ comedy/NN) movies/NNS)
