#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy 
import en_core_web_sm
nlp = en_core_web_sm.load()
from nltk.tokenize import sent_tokenize 
from nltk.stem import WordNetLemmatizer 

#### Import data

In [2]:
# Load the reviews, keep only the review text, and add an empty "noun_adj"
# column that a later cell fills with the extracted pairs.
# Selecting first and then calling assign() produces a fresh frame in a single
# expression, so later cells do not operate on a slice of a wider frame.
reviews_df = pd.read_csv("reviews.csv")[["reviews"]].assign(noun_adj="")
reviews_df.head()

Unnamed: 0,reviews,noun_adj
0,"Taking an Asian tour, we visited Singapore- wh...",
1,We enjoyed our stay at 35th floor garden view!...,
2,"Room is spacious, clean and had a great view o...",
3,We managed to time this just right as the sun ...,
4,We stayed here for the access to the rooftop p...,


#### Drop Empty rows

In [3]:
# Treat empty-string reviews as missing, then drop every row without review text.
# The result is reassigned instead of mutating a selected column in place:
# an inplace edit on `reviews_df['reviews']` acts on an intermediate object and
# is not guaranteed to reach the frame (it never does under pandas Copy-on-Write).
reviews_df = reviews_df.assign(
    reviews=reviews_df["reviews"].replace("", np.nan)
).dropna(subset=["reviews"])

#### Function to find the nearest noun

Finds the noun (or proper noun) closest to the given token index, searching in both directions, and returns its word.

In [4]:
def nearest_noun(array, index):
    """Return the word of the noun closest to position ``index``.

    Parameters
    ----------
    array : sequence of (word, pos_tag) entries
        Only the first two elements of each entry are read.
    index : int
        Position that distances are measured from.

    Returns
    -------
    The word of the nearest entry tagged ``NOUN`` or ``PROPN``, searching in
    both directions, or ``None`` when the sequence contains no such entry.
    When two candidates are equally close, the later one (larger position)
    is returned.
    """
    noun_positions = [
        position for position in range(len(array))
        if array[position][1] in ('NOUN', "PROPN")
    ]
    if not noun_positions:
        return None

    # Smallest distance wins; the negated position breaks ties in favour of
    # the candidate that appears later in the sequence.
    closest = min(noun_positions, key=lambda position: (abs(position - index), -position))
    return array[closest][0]

#### Function to find the nearest adjective

Finds the first adjective at or after the given token index (the search only looks forward) and returns its position.

In [5]:
def nearest_adj(array, index):
    """Return the position of the first adjective at or after ``index``.

    Parameters
    ----------
    array : sequence of (word, pos_tag) entries
        Only the second element (the tag) of each entry is read.
    index : int
        Position where the forward search starts; earlier entries are ignored.

    Returns
    -------
    The position (an int, not the word) of the first entry tagged ``ADJ``,
    or ``None`` when there is none. Position 0 is a valid result, so callers
    should compare against ``None`` rather than rely on truthiness.
    """
    # Distances only grow while moving forward, so the first match is the nearest.
    for position in range(index, len(array)):
        if array[position][1] == "ADJ":
            return position
    return None

#### Function to lemmatize the adjective-noun pairs

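Lemmatizes every word of each (adjective, noun) pair: the adjective part with the adjective part-of-speech and the noun part with the noun part-of-speech.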

In [6]:
def lemmatize_pairs(array):
    """Lemmatize every word of each (adjective, noun) pair.

    The first element of a pair is lemmatized word by word as an adjective
    (WordNet pos ``'a'``), the second as a noun (pos ``'n'``). Multi-word
    phrases are split on single spaces and re-joined the same way.

    Returns a new list of ``(adjective, noun)`` tuples in the input order.
    """
    wordnet = WordNetLemmatizer()

    def _lemmatize_phrase(phrase, pos):
        # Lemmatize each space-separated word with the given WordNet POS.
        return ' '.join(wordnet.lemmatize(part, pos = pos) for part in phrase.split(' '))

    return [
        (_lemmatize_phrase(pair[0], 'a'), _lemmatize_phrase(pair[1], 'n'))
        for pair in array
    ]

In [7]:
def _chain_nouns(pos_array, start):
    """Collect the run of consecutive NOUN tokens that begins at ``start``.

    Returns ``(phrase, end)``: the words of the run joined by single spaces,
    and the position of the first token after the run.
    """
    end = start
    while end != len(pos_array) and pos_array[end][1] == 'NOUN':
        end = end + 1
    phrase = ' '.join(entry[0] for entry in pos_array[start:end]).strip()
    return phrase, end


def noun_adj(paragraph):
    """Extract (adjective, noun) pairs from a paragraph of text.

    The paragraph is split into sentences with ``sent_tokenize`` and each
    sentence is tagged with the module-level spaCy pipeline ``nlp``. Tokens
    are lower-cased and scanned left to right with these rules:

    * determiner + word + noun: the word (anything except a determiner or
      proper noun) is paired with the run of consecutive nouns that follows.
      If an auxiliary comes right after that run, the next adjective found by
      ``nearest_adj`` is additionally paired with the last noun of the run.
    * adjective followed by "to": skipped.
    * adjective followed by a noun: paired with the run of consecutive nouns.
    * any other adjective: paired with the closest noun in the sentence, as
      returned by ``nearest_noun``.

    Parameters
    ----------
    paragraph : str
        Text to analyse.

    Returns
    -------
    list of (adjective, noun) tuples in the order they were found; the words
    are lower-cased but not lemmatized.
    """
    noun_adj_pairs = []
    for sentence in sent_tokenize(paragraph):
        doc = nlp(sentence)
        # (lower-cased word, coarse POS tag) for every token of the sentence
        pos_array = [(str(token).lower(), token.pos_) for token in doc]
        n_tokens = len(pos_array)

        index = 0
        while index != n_tokens:
            token, tag = pos_array[index]

            # Determiner with at least two tokens after it
            if tag == "DET" and index < n_tokens - 2:
                middle_word, middle_tag = pos_array[index + 1]
                if middle_tag not in ("DET", "PROPN") and pos_array[index + 2][1] == "NOUN":
                    noun_phrase, index_after = _chain_nouns(pos_array, index + 2)
                    noun_adj_pairs.append((middle_word, noun_phrase))
                    # The run holds at least one noun, so pos_array[index - 1]
                    # is the last noun of the phrase from here on.
                    index = index_after

                    # "<noun> is/was ... <adjective>": also pair the adjective
                    # that follows the auxiliary with the last noun.
                    if index < n_tokens - 1 and pos_array[index][1] == "AUX":
                        adj_index = nearest_adj(pos_array, index)
                        if adj_index is not None:
                            noun_adj_pairs.append((pos_array[adj_index][0], pos_array[index - 1][0]))
                            if adj_index > index:
                                # Resume scanning after the adjective just consumed.
                                index = adj_index + 1
                else:
                    index = index + 1

            elif tag == "ADJ":
                has_next = index < n_tokens - 1
                if has_next and pos_array[index + 1][0] == "to":
                    # Adjective directly followed by "to" is not paired.
                    index = index + 1
                elif has_next and pos_array[index + 1][1] == "NOUN":
                    noun_phrase, index = _chain_nouns(pos_array, index + 1)
                    noun_adj_pairs.append((token, noun_phrase))
                else:
                    # No noun directly after the adjective: use the closest one.
                    noun = nearest_noun(pos_array, index)
                    if noun and token:
                        noun_adj_pairs.append((token, noun))
                    index = index + 1

            else:
                index = index + 1
    return noun_adj_pairs

#### Get the noun adjective pairs of the reviews

In [8]:
# Extract and lemmatize the pairs of every review (one list per row, in row order).
pairs_per_review = [
    lemmatize_pairs(noun_adj(paragraph)) for paragraph in reviews_df["reviews"]
]

# Flat list of all pairs across reviews, used for the frequency count below.
total_pairs = [pair for pairs in pairs_per_review for pair in pairs]

# Store the per-review lists in one step. The previous chained form
# `reviews_df.loc[index]["noun_adj"] = pairs` assigns into the object returned
# by `.loc[index]`, which may be a temporary copy, so the frame is not
# guaranteed to be updated. dtype=object keeps each cell a Python list.
reviews_df = reviews_df.assign(
    noun_adj=pd.Series(pairs_per_review, index=reviews_df.index, dtype=object)
)

#### Get the top count of the pairs

In [9]:
# Tally how many times each (adjective, noun) pair was extracted.
count_pairs = {}
for extracted in total_pairs:
    count_pairs[extracted] = count_pairs.get(extracted, 0) + 1

In [10]:
# Most frequent pairs first; the sort is stable, so pairs with equal counts
# stay in the order they were first seen.
sorted_pairs = sorted(count_pairs.items(), key=lambda item: -item[1])

In [11]:
# Print a centred three-column table: rank, pair, count.
print('{0:^3}{1:^50}{2:^15}'.format("","Pair","Count"))
for rank, (found_pair, occurrences) in enumerate(sorted_pairs, start=1):
    print('{0:^3}{1:^50}{2:^15}'.format(str(rank),str(found_pair),str(occurrences)))

                          Pair                            Count     
 1                ('spacious', 'room')                      3       
 2                 ('clean', 'room')                        3       
 3                ('rooftop', 'pool')                       3       
 4               ('incredible', 'view')                     2       
 5                 ('great', 'time')                        2       
 6                 ('nice', 'hotel')                        2       
 7                  ('good', 'view')                        2       
 8                  ('nice', 'room')                        2       
 9                 ('other', 'hotel')                       2       
10              ('customer', 'service')                     2       
11                 ('total', 'waste')                       2       
12             ('master', 'access card')                    2       
13               ('security', 'guard')                      2       
14               ('incident', 'rep

In [12]:
# Preview the first rows to confirm the noun_adj column now holds the extracted pairs.
reviews_df.head()

Unnamed: 0,reviews,noun_adj
0,"Taking an Asian tour, we visited Singapore- wh...","[(asian, tour), (incredible, view), (incredibl..."
1,We enjoyed our stay at 35th floor garden view!...,"[(35th, floor garden view), (high, floor), (gr..."
2,"Room is spacious, clean and had a great view o...","[(spacious, room), (clean, room), (great, view..."
3,We managed to time this just right as the sun ...,"[(sure, hand), (left, hand side), (viewing, pl..."
4,We stayed here for the access to the rooftop p...,"[(rooftop, pool), (massive, hotel), (worth, ac..."


#### Save to csv file

In [13]:
# Write the reviews and their pairs to disk; index=False leaves the row labels out.
# Each noun_adj cell is a Python list, so the CSV stores its text representation.
reviews_df.to_csv("reviews_spacy.csv",index=False)

### Test the program

 Run the cell below
 
 example input: This room is great. The view is beautiful.

In [12]:
# Run this cell and type your own text at the prompt.
# The pairs are printed exactly as noun_adj returns them (lemmatize_pairs is not applied here).
paragraph = input("Enter your paragraph: ")
print("="*50)
print("Extracted (adjective noun) pairs below:")
print(noun_adj(paragraph))

Enter your paragraph: This room is great. The view is beautiful
Extracted (adjective noun) pairs below:
[('great', 'room'), ('beautiful', 'view')]
