# PHRASE EXTRACTOR

### Importing necessary packages

In [358]:
import nltk
import csv
import pandas as pd
import numpy as np

### Working on the training data to find the chunk grammar that fits most of the data.

In [359]:
# Load the tab-separated training set (UTF-8 encoded) into a DataFrame.
data = pd.read_csv("training_data.tsv", sep="\t", encoding="utf-8")

In [360]:
# Number of rows loaded from the training file.
data.shape[0]

9819

In [361]:
# Pick one sample sentence (row label 10) from the training set to experiment with.
sentence = data.loc[10, 'sent']
sentence

'Set a reminder on 4 th Dec of going to meet sonal miss at 2:00 pm'

### Checking how TextBlob works

In [362]:
from textblob import TextBlob

# TextBlob has a ready-made noun-phrase extractor. Let's see how it does on the sample sentence.
print ("TEXTBLOB")
blob = TextBlob(sentence)

# The loop variable must not be called `np`: that would rebind the numpy alias
# imported at the top of the notebook and break every later `np.` call.
for noun_phrase in blob.noun_phrases:
    print (noun_phrase)

# The results were bad, so a different approach was needed.

TEXTBLOB
set
dec


In [363]:
# Split the sample sentence into individual word tokens.
tokens = nltk.tokenize.word_tokenize(sentence)
tokens

['Set',
 'a',
 'reminder',
 'on',
 '4',
 'th',
 'Dec',
 'of',
 'going',
 'to',
 'meet',
 'sonal',
 'miss',
 'at',
 '2:00',
 'pm']

In [364]:
# Part-of-speech tagging: pair every token with its tag.
tagged = nltk.tag.pos_tag(tokens)
tagged

[('Set', 'VB'),
 ('a', 'DT'),
 ('reminder', 'NN'),
 ('on', 'IN'),
 ('4', 'CD'),
 ('th', 'JJ'),
 ('Dec', 'NNP'),
 ('of', 'IN'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('meet', 'VB'),
 ('sonal', 'JJ'),
 ('miss', 'NNS'),
 ('at', 'IN'),
 ('2:00', 'CD'),
 ('pm', 'NN')]

In [365]:
# Treat nouns (tags starting with 'N') and verbs (tags starting with 'V') as the "important words".
Imp_words = [word for word, tag in tagged if tag.startswith(('N', 'V'))]
Imp_words

['Set', 'reminder', 'Dec', 'going', 'meet', 'miss', 'pm']

### Checking out different n-grams possible for a statement.

In [366]:
from nltk.util import ngrams

# Largest n-gram order to display; orders 1..n are printed in turn.
n = 5
for i in range(1, n + 1):
    # ngrams() returns a generator, so materialise it before printing.
    output = [gram for gram in ngrams(tokens, i)]
    print(output, "\n")

[('Set',), ('a',), ('reminder',), ('on',), ('4',), ('th',), ('Dec',), ('of',), ('going',), ('to',), ('meet',), ('sonal',), ('miss',), ('at',), ('2:00',), ('pm',)] 

[('Set', 'a'), ('a', 'reminder'), ('reminder', 'on'), ('on', '4'), ('4', 'th'), ('th', 'Dec'), ('Dec', 'of'), ('of', 'going'), ('going', 'to'), ('to', 'meet'), ('meet', 'sonal'), ('sonal', 'miss'), ('miss', 'at'), ('at', '2:00'), ('2:00', 'pm')] 

[('Set', 'a', 'reminder'), ('a', 'reminder', 'on'), ('reminder', 'on', '4'), ('on', '4', 'th'), ('4', 'th', 'Dec'), ('th', 'Dec', 'of'), ('Dec', 'of', 'going'), ('of', 'going', 'to'), ('going', 'to', 'meet'), ('to', 'meet', 'sonal'), ('meet', 'sonal', 'miss'), ('sonal', 'miss', 'at'), ('miss', 'at', '2:00'), ('at', '2:00', 'pm')] 

[('Set', 'a', 'reminder', 'on'), ('a', 'reminder', 'on', '4'), ('reminder', 'on', '4', 'th'), ('on', '4', 'th', 'Dec'), ('4', 'th', 'Dec', 'of'), ('th', 'Dec', 'of', 'going'), ('Dec', 'of', 'going', 'to'), ('of', 'going', 'to', 'meet'), ('going', 'to', 

## Noun-Phrase Chunking

In [379]:
# Defining a function which takes a sentence as an input and returns important phrase

def extract(sentence):
    """Return the longest noun-phrase-like chunk found in ``sentence``.

    The sentence is tokenized and POS-tagged with NLTK, chunked with a single
    regular-expression grammar, and every ``NP`` chunk has its date/reminder
    keywords removed. The chunk that keeps the most words is returned; on a
    tie the chunk that appears first in the sentence wins.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    str
        The remaining words of the winning chunk joined by single spaces, or
        the literal string ``"Not Found"`` when no chunk keeps at least two
        words (including when the sentence yields no chunk at all).
    """
    # Tag exactly once and reuse the result (tagging twice only wasted time).
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

    # Chunk grammar: an NP chunk is an optional verb (VB*), optional adverb (RB),
    # optional pronoun (PRP*), optional preposition (IN), optional determiner (DT),
    # any number of adjectives (JJ*) and finally one or more nouns (NN*).
    grammar = "NP: {<VB.*>?<RB>?<PRP.*>?<IN>?<DT>?<JJ.*>*<NN.*>+}"
    parser = nltk.RegexpParser(grammar)

    # The parse result is a tree; keep only the subtrees labelled NP.
    tree = parser.parse(tagged_words)
    chunks = [subtree for subtree in tree.subtrees() if subtree.label() == "NP"]

    # These keywords were not included in the training labels, so they are
    # dropped from candidate phrases here as well (compared case-insensitively).
    excluded = {
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
        "sunday", "today", "tomorrow", "yesterday", "reminder", "remind",
        "th", "pm", "am",
    }

    # For every chunk keep the words (leaf[0] of each (word, tag) leaf) that
    # are not excluded keywords.
    candidates = [
        [str(leaf[0]) for leaf in chunk if leaf[0].lower() not in excluded]
        for chunk in chunks
    ]

    if not candidates:
        return "Not Found"

    # max() returns the first maximal element, so ties go to the earliest chunk.
    best = max(candidates, key=len)
    if len(best) <= 1:
        return "Not Found"
    return " ".join(best).rstrip()

### Testing the function and its output

In [380]:
# Show the sample sentence, then the phrase that extract() returns for it.
print(sentence,"\n") 
print("Phrase  :   ", extract(sentence))

Set a reminder on 4 th Dec of going to meet sonal miss at 2:00 pm 

Phrase  :    meet sonal miss


### Reading txt file which needs to be evaluated

In [381]:
# Read the evaluation file line by line, dropping the trailing newline of each line.
# Opened read-only ('r', not 'r+') because nothing is written back, and decoded
# as UTF-8 to match how the training file is read.
with open("eval_data.txt", 'r', encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f]

print (lines[67])

Please remind me after 2 .00 pm for today


### Creating a new "eval_data.csv" to store each sentence with its extracted phrase.

In [382]:
# Write one row per evaluation sentence: the sentence and the phrase extract() returns for it.
# The encoding is set explicitly so non-ASCII sentences are written the same way on every platform.
with open('eval_data.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['sent', 'label']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for line in lines:
        writer.writerow({'sent': line, 'label': extract(line)})

### Checking Accuracy of the model on training set

In [389]:
# Run extract() over every training sentence, store given vs. predicted labels
# in a CSV file, and count exact string matches to compute the accuracy.
with open('eval_data2.csv', mode='w', newline='', encoding = 'utf-8') as csv_file:
    fieldnames = ['sent', 'Given_label', 'Predicted_label']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    count = 0
    for sent, given_label in zip(data['sent'], data['label']):
        # Predict once per row and reuse the result for both the CSV row and
        # the comparison; the prediction is the expensive step.
        predicted_label = extract(str(sent))
        writer.writerow({'sent': sent, 'Given_label': given_label, 'Predicted_label': predicted_label})

        if str(given_label) == predicted_label:
            count = count+1

# Guard against an empty data set so the cell does not fail with ZeroDivisionError.
accuracy = (count/len(data))*100 if len(data) else 0.0
print ("Accuracy : ", accuracy, "%")

Accuracy :  31.805682859761685 %
