In [None]:
# Mount Google Drive so files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Alternative working directory, currently disabled:
# %cd /content/drive/My Drive/Ntu/nndl/CZ4042 Final Project

# Change into the project folder; the relative data/ and wordcloud paths
# configured further down are resolved against this directory.
%cd /content/drive/My Drive/CZ4034 Information Retrieval/simulate

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1VPuFOkV6y3xuglvuLABKy8nlU6W8xStE/CZ4034 Information Retrieval/simulate


In [None]:
# Download the en_core_web_md spaCy model, which the import cell loads as `nlp`.
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [None]:
import pandas as pd
import re
import numpy as np
import spacy 
import en_core_web_md
# Shared spaCy pipeline used by spacy_nouns() and get_similarity().
nlp = en_core_web_md.load()
import nltk
# NLTK resources fetched at import time: tokenizer models (punkt), WordNet
# (used by WordNetLemmatizer), the stop-word lists, and the perceptron tagger.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from collections import Counter
from pathlib import Path
from wordcloud import WordCloud
import warnings

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Paths are relative to the current working directory.
# Hotel metadata CSV; nouns_analyser() reads it and writes it back with extra columns.
hotel_info_path = 'data/hotel/hotel_info.csv'
# Reviews CSV; nouns_analyser() reads the hotel_name, review_pos and review_neg columns.
hotel_reviews_path = 'data/reviews/hotel_reviews.csv'
# Root folder under which one sub-folder of word-cloud images is created per hotel.
wordcloud_path = 'wordcloud'

In [None]:
# Stop-word sets are cached per language so the corpus file is read only once.
_STOP_WORDS_CACHE = {}


def _stop_words(language='english'):
    """Return the (cached) NLTK stop-word set for `language`."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


# cleans input text.
def text_cleaner(text):
    """Normalise a review string and strip English stop words.

    Steps, in order:
      1. every non-word character is replaced by a space;
      2. the text is lower-cased;
      3. a text that is exactly "br", and any stand-alone "br" token
         surrounded by whitespace, is replaced by a space
         (presumably left-over HTML line breaks -- confirm against the data);
      4. single letters surrounded by whitespace are removed;
      5. a leading "b " prefix is removed;
      6. runs of whitespace are collapsed;
      7. the text is tokenised and English stop words are dropped.

    Args:
        text: the raw review text (a str).

    Returns:
        The remaining tokens joined by single spaces ('' when nothing is left).
    """
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    text = re.sub(r'^br$', ' ', text)
    text = re.sub(r'\s+br\s+',' ',text)
    text = re.sub(r'\s+[a-z]\s+', ' ',text)
    text = re.sub(r'^b\s+', '', text)
    text = re.sub(r'\s+', ' ', text)

    # English only: calling stopwords.words() without a language returns the
    # lists of every language at once, which also drops ordinary English words.
    stop_words = _stop_words('english')
    tokens_without_sw = [word for word in word_tokenize(text) if word not in stop_words]

    # Return the filtered text; previously it was built and then discarded
    # because the unfiltered `text` was returned instead.
    return " ".join(tokens_without_sw)
  
# Shared WordNet lemmatizer instance; spacy_nouns() uses it to lemmatize the nouns it collects.
lemmatizer = WordNetLemmatizer()

# function to get nouns
def spacy_nouns(text):
  """Collect the lemmatized nouns that occur inside noun chunks of `text`.

  The text is split into sentences, each sentence is parsed with the shared
  spaCy pipeline `nlp`, and every token tagged NOUN within a noun chunk is
  lower-cased and lemmatized with the shared WordNet `lemmatizer`.

  Args:
    text: the review text (a str).

  Returns:
    A list of noun lemmas in order of appearance; repeats are kept so the
    caller can count frequencies.
  """
  nouns = []
  for sentence in sent_tokenize(text):
    doc = nlp(sentence)
    for chunk in doc.noun_chunks:
      for tok in chunk:
        if tok.pos_ == "NOUN":
          nouns.append(lemmatizer.lemmatize(tok.text.lower()))
  return nouns

# similarity between words
def get_similarity(word1, word2):
    """Return the spaCy vector similarity between two words.

    Each word is tokenised on its own, so the comparison is always between
    the first token of `word1` and the first token of `word2`. Parsing both
    words as one string broke when `word1` split into several tokens (it was
    then compared with its own second token) and raised IndexError when
    fewer than two tokens came back.

    Args:
        word1: first word (converted with str()).
        word2: second word (converted with str()).

    Returns:
        The value of Token.similarity for the two first tokens, or 0.0 when
        either word produces no token at all (e.g. an empty string).
    """
    # make_doc only runs the tokenizer; the similarity of two tokens is
    # computed from their vectors, so the rest of the pipeline is not needed.
    doc1 = nlp.make_doc(str(word1))
    doc2 = nlp.make_doc(str(word2))

    if len(doc1) == 0 or len(doc2) == 0:
        return 0.0

    return doc1[0].similarity(doc2[0])

def get_adjectives_textblob(text):
  """Return the words of `text` that TextBlob tags as "JJ", in order of appearance."""
  adjectives = []
  for word, tag in TextBlob(text).tags:
    if tag == "JJ":
      adjectives.append(word)
  return adjectives

In [None]:
def _safe_dir_name(hotel_name):
  """Return `hotel_name` without '/' and ':' so it can be used as a folder name."""
  return hotel_name.replace('/', '').replace(':', '')


def _load_review_frame(df, source_column):
  """Return hotel_name/review rows taken from `source_column`, NaN rows dropped, with a word_count column."""
  reviews = (
      df[['hotel_name', source_column]]
      .rename(columns={source_column: 'review'})
      .dropna()
      .copy()
  )
  # Word count = number of space-separated pieces after stripping both ends.
  reviews['word_count'] = reviews['review'].apply(lambda x: len(x.strip().split(' ')))
  return reviews


def _ranked_nouns_by_hotel(df_reviews):
  """Map each hotel to its distinct nouns, ordered from most to least frequent."""
  ranked = {}
  for hotel_name, hotel_rows in df_reviews.groupby('hotel_name', sort=False):
    nouns = []
    for review in hotel_rows['review'].values.tolist():
      nouns.extend(spacy_nouns(review))
    ranked[hotel_name] = [noun for noun, _ in Counter(nouns).most_common()]
  return ranked


def _best_and_worst(pos_nouns, neg_nouns, top_k):
  """Return (best_noun, worst_noun) strings; a None list means no reviews of that polarity."""
  if pos_nouns is not None and neg_nouns is not None:
    best = pos_nouns[:top_k]
    # Worst nouns are the most frequent negative nouns that are not already "best".
    worst = [noun for noun in neg_nouns if noun not in best][:top_k]
    return ','.join(best), ','.join(worst)
  if pos_nouns is not None:
    return ','.join(pos_nouns[:top_k]), ' '
  return ' ', ','.join(neg_nouns[:top_k])


def _count_feature_nouns(nouns_by_hotel, predefined_nouns, is_related):
  """For every hotel, count how many of its distinct nouns relate to each predefined noun."""
  counts = {}
  for hotel_name, nouns in nouns_by_hotel.items():
    counts[hotel_name] = {
        feature: sum(1 for noun in nouns if is_related(noun, feature))
        for feature in predefined_nouns
    }
  return counts


def _feature_flags(pos_counts, neg_counts, predefined_nouns):
  """Return {feature: 0/1}; 1 when positive mentions outnumber negative ones."""
  if pos_counts is not None and neg_counts is not None:
    return {feature: int(pos_counts[feature] > neg_counts[feature]) for feature in predefined_nouns}
  # Only one polarity available: all 1 for positive-only, all 0 for negative-only.
  flag = 1 if pos_counts is not None else 0
  return {feature: flag for feature in predefined_nouns}


def nouns_analyser(hotel_info_path, hotel_reviews_path, wordcloud_path):
  """Derive per-hotel noun features from reviews and render adjective word clouds.

  Args:
    hotel_info_path: CSV with a `hotel_name` column. It is read, extended with
      `best_noun`, `worst_noun` and `features_<noun>` columns, and written back
      to the same path. Columns of those names that already exist (e.g. from a
      previous run) are replaced instead of making the join fail.
    hotel_reviews_path: CSV with `hotel_name`, `review_pos` and `review_neg`
      columns.
    wordcloud_path: root folder; images are written to
      `<wordcloud_path>/<hotel>/<noun>.png`, where `<hotel>` is the hotel name
      with '/' and ':' removed.

  Returns:
    None. Results are written to disk; progress is printed.

  Side effects:
    Installs a warnings filter that silences UserWarnings matching "[W008]".
  """
  warnings.filterwarnings("ignore", message=r"\[W008\]", category=UserWarning)

  top_k = 3                   # number of best / worst nouns kept per hotel
  word_len_threshold = 5      # reviews must have more words than this for noun extraction
  similarity_threshold = 0.5  # minimum similarity for a word to count as a predefined noun
  predefined_nouns = ['room', 'facilities', 'location', 'staff', 'food']

  print("Data preprocessing...")
  df = pd.read_csv(hotel_reviews_path)
  df = df[['hotel_name', 'review_pos', 'review_neg']]
  df_neg_reviews = _load_review_frame(df, 'review_neg')
  df_pos_reviews = _load_review_frame(df, 'review_pos')

  # Noun extraction works on the longer reviews and on their uncleaned text:
  # these filtered copies are taken before text_cleaner is applied below.
  df_pos_reviews_cleaned = df_pos_reviews[df_pos_reviews['word_count'] > word_len_threshold]
  df_neg_reviews_cleaned = df_neg_reviews[df_neg_reviews['word_count'] > word_len_threshold]

  # Cleaned text of all reviews (any length) feeds the word clouds.
  df_neg_reviews['review'] = df_neg_reviews['review'].apply(text_cleaner)
  df_pos_reviews['review'] = df_pos_reviews['review'].apply(text_cleaner)

  # Extract top nouns for each hotel
  print("Extracting best and worst nouns...")
  hotel_nouns_pos_dict = _ranked_nouns_by_hotel(df_pos_reviews_cleaned)
  hotel_nouns_neg_dict = _ranked_nouns_by_hotel(df_neg_reviews_cleaned)
  all_hotel_names = list(dict.fromkeys(list(hotel_nouns_pos_dict) + list(hotel_nouns_neg_dict)))

  # Extract predefined filters for each hotel
  print("Extracting top features...")

  # The same (word, noun) pairs recur constantly, so each decision is
  # computed once and remembered.
  similarity_cache = {}

  def is_related(word, feature):
    """True when `word` equals `feature` or is more similar than the threshold."""
    if word == feature:
      return True
    key = (word, feature)
    if key not in similarity_cache:
      similarity_cache[key] = get_similarity(word, feature) > similarity_threshold
    return similarity_cache[key]

  pos_nouns_count = _count_feature_nouns(hotel_nouns_pos_dict, predefined_nouns, is_related)
  neg_nouns_count = _count_feature_nouns(hotel_nouns_neg_dict, predefined_nouns, is_related)

  records = []
  for hotel_name in all_hotel_names:
    best_noun, worst_noun = _best_and_worst(
        hotel_nouns_pos_dict.get(hotel_name), hotel_nouns_neg_dict.get(hotel_name), top_k)
    flags = _feature_flags(
        pos_nouns_count.get(hotel_name), neg_nouns_count.get(hotel_name), predefined_nouns)

    record = {'hotel_name': hotel_name, 'best_noun': best_noun, 'worst_noun': worst_noun}
    for feature in predefined_nouns:
      record['features_{}'.format(feature)] = flags[feature]
    records.append(record)

  generated_columns = ['best_noun', 'worst_noun'] + ['features_{}'.format(f) for f in predefined_nouns]
  df_generated = pd.DataFrame(records, columns=['hotel_name'] + generated_columns)

  # Change hotel info file
  df_hotel_info = pd.read_csv(hotel_info_path)
  # The file is overwritten in place, so on a second run the generated columns
  # are already present; drop them first, otherwise join() fails on the overlap.
  df_hotel_info = df_hotel_info.drop(columns=generated_columns, errors='ignore')
  df_hotel_info = df_hotel_info.join(df_generated.set_index('hotel_name'), on='hotel_name').reset_index(drop=True)
  df_hotel_info.to_csv(hotel_info_path, index=False)

  # Generate word cloud
  df_reviews = pd.concat([df_neg_reviews, df_pos_reviews])[['hotel_name', 'review']]
  hotel_names = df_reviews['hotel_name'].drop_duplicates().values.tolist()

  # Create Directories
  print("Creating wordcloud directories...")
  for hotel_name in hotel_names:
    Path('{}/{}'.format(wordcloud_path, _safe_dir_name(hotel_name))).mkdir(parents=True, exist_ok=True)

  # Extracting Adjectives for wordcloud
  hotel_reviews_adj_dict = {
      hotel_name: {feature: [] for feature in predefined_nouns} for hotel_name in hotel_names
  }
  for hotel_name, hotel_rows in df_reviews.groupby('hotel_name', sort=False):
    for review in hotel_rows['review'].values.tolist():
      words = review.split(' ')
      adjectives_text = None  # computed lazily, at most once per review
      for feature in predefined_nouns:
        for word in words:
          try:
            if is_related(word, feature):
              # Add only the adjectives
              if adjectives_text is None:
                adjectives_text = ' '.join(get_adjectives_textblob(review))
              hotel_reviews_adj_dict[hotel_name][feature].append(adjectives_text)
              break
          except Exception:
            # Best effort: skip words that cannot be analysed, but let
            # KeyboardInterrupt / SystemExit through.
            continue

  print("Generating wordclouds...")
  for hotel_name, predefined_dict in hotel_reviews_adj_dict.items():
    for feature in predefined_nouns:
      text = ' '.join(predefined_dict[feature])

      # Reviews without adjectives contribute empty strings, so the joined
      # text can be whitespace only; there is nothing to draw in that case.
      if not text.strip():
        continue

      try:
        # Create and generate a word cloud image:
        wordcloud = WordCloud().generate(text)
      except ValueError:
        # WordCloud raises ValueError when no drawable word is left.
        continue

      # Use the same sanitised folder name that was created above.
      wordcloud.to_file('{}/{}/{}.png'.format(wordcloud_path, _safe_dir_name(hotel_name), feature))

  print("Done!")
  return

In [None]:
nouns_analyser(hotel_info_path, hotel_reviews_path, wordcloud_path)