In [1]:
import re
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt
import os
import rake_nltk
from rake_nltk import Rake
import nltk
# from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# use wordnet find the first level keywords
from nltk.corpus import wordnet as wn

**********************************************************************************************
## Load data. 
### Merge sample homes and their reviews into one dataframe.

In [2]:
# Load Necessary Data: reviews
reviews_df = pd.read_csv("./Data/reviews.csv", encoding="utf-8")
reviews_df.columns = ['home_id', 'review_id', 'date', 'reviewer_id', 'reviewer_name', 'comments']
reviews_df.dropna()
reviews_df.head(2)

Unnamed: 0,home_id,review_id,date,reviewer_id,reviewer_name,comments
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...


In [3]:
sample1_df = pd.read_csv("./Data/sample_data_for_testing", sep='\t', encoding="utf-8")
sample1_df = sample1_df.drop("Unnamed: 0", axis=1)
sample1_df.head(2)

Unnamed: 0,home_id,property_type,room_type,price,number_of_reviews,scores_overall_rating,scores_accuracy,scores_cleanliness,scores_checkin,scores_communication,scores_location,scores_value,price_int
0,241032,Apartment,Entire home/apt,$85.00,207,95.0,10.0,10.0,10.0,10.0,9.0,10.0,85
1,953595,Apartment,Entire home/apt,$150.00,43,96.0,10.0,10.0,10.0,10.0,10.0,10.0,150


In [4]:
# Merge the reviews and homes in the sample data.
df1 = sample1_df[['home_id', 'scores_cleanliness']]
df2 = reviews_df[['home_id', 'review_id', 'comments']]

print("-" * 40 
      + '\nTotal number of reviews: ' 
      + str(sample1_df['number_of_reviews'].sum()) 
      + "\n" + "-" * 40)

sample1_rh_df = pd.merge(df1, df2, on="home_id")
sample1_rh_df.head(3)

# sample1_rh_df.stack()[0].comments

----------------------------------------
Total number of reviews: 68638
----------------------------------------


Unnamed: 0,home_id,scores_cleanliness,review_id,comments
0,241032,10.0,682061,Excellent all the way around. \r\n\r\nMaija wa...
1,241032,10.0,691712,Maija's apartment was a wonderful place to sta...
2,241032,10.0,702999,one of the most pleasant stays i've had in my ...


**********************************************************************************************
## Analyze `cleanliness` aspect
### Use the selected aspect keywords to analyze reviews 
**********************************************************************************************
### Brief summary

In [5]:
# sample2_rh_df is a copy of sample1_rh_df to 
# in case unexpected modification for original data.
sample2_rh_df = sample1_rh_df
print('*' * 40 + '\nThere are:\n' + '-' * 40)
print(str(len(sample2_rh_df.groupby('home_id'))) + " Airbnb homes in total.\n" + '-' * 40)
print(str(len(sample2_rh_df)) + " reviews in total.\n" + '-' * 40)

****************************************
There are:
----------------------------------------
2225 Airbnb homes in total.
----------------------------------------
68638 reviews in total.
----------------------------------------


In [6]:
# group by scores_cleanliness
df1 = pd.DataFrame(sample2_rh_df.groupby(['scores_cleanliness'])['home_id'].nunique())
df2 = pd.DataFrame(sample2_rh_df.groupby(['scores_cleanliness'])['review_id'].nunique())
summary_df = pd.merge(df1, df2, on = 'scores_cleanliness')
summary_df.columns = ['number of homes', 'number of reveiws']
summary_df

Unnamed: 0_level_0,number of homes,number of reveiws
scores_cleanliness,Unnamed: 1_level_1,Unnamed: 2_level_1
3.0,1,2
4.0,4,9
5.0,3,17
6.0,23,86
7.0,20,236
8.0,113,2813
9.0,512,16571
10.0,1549,48904


**********************************************************************************************
### Chose one cleanliness score to do further analysis.
#### Here I choose Airbnb homes with cleanliness score as `8.0` to do the "pliot test". 
#### As in above table, there are `113` homes with `2813` reviews in total.

In [108]:
sample2_rh_df[sample2_rh_df.scores_cleanliness == 8.0].groupby(['home_id'])['comments'].count().sort_values(ascending=False).head(10)

home_id
258571     278
719233     255
882274     189
1039766    147
1815304    139
1950446    136
815017     136
1100714    134
1773803    124
3316219    106
Name: comments, dtype: int64

*******
### Here, the logic of determine the aspect that a sentence talking about is:

* `aspect_keywords_dic` is a dictionary contains aspects and their relevant keywords.**

* Firstly, I use the Word2Vec similarity algorithm to count the similarity score between two words.

* For each sentence, the aspect-similarity-score will `+1` when one word in it has the word-similarity-score with any word in keyword list lager than `0.7`(this number can change based on needs). 

* I tried use the total scores of all vectors(word-in-sentence to word-in-keywords), however, the results are very strange.

In [64]:
nlp = spacy.load('en_core_web_md')

In [98]:
aspect_keywords_dic = {
    'location': ['region', 'locality', 'neck_of_the_woods', 'location', 'vicinity',
                 'neighbourhood', 'neighborhood'],
    'cleanliness': ['tidy_up', 'straighten_out', 'cleanliness', 'clean', 
                    'neaten', 'square_away', 'straighten', 'clean_house', 'make_clean', 
                    'tidy', 'houseclean', 'clean_up', 'scavenge',
                   'soiled', 'unclean', 'colly', 'bemire', 'uncleanliness', 'soil', 'begrime',
                    'grime', 'untidy', 'dirty']
}

def sents_sim_score(doc, aspect, sim):
    
    sent_asp_sim_score_df = pd.DataFrame(columns = ['sentence_id', 'aspect_sim_score', 'text'])
    
    aspect_keywords = nlp(' '.join(aspect_keywords_dic[aspect]))

    # I tried use the total scores of all vectors, however, the results are very strange.
    # So, I tried to use the count of the words in a sentance 
    # that with lager than 0.7 similarity score to determine the relevance.
    
    for which_sen in range(len(list(doc.sents))):
        
        new_doc = list(doc.sents)[which_sen].text        
        sen_keywords = nlp(new_doc)
        aspect_sim_score = compute_score(sen_keywords, aspect_keywords, sim)
        sent_asp_sim_score_df.loc[which_sen] = [which_sen, aspect_sim_score, new_doc]
    
    return sent_asp_sim_score_df

def compute_score(sen_keywords, aspect_keywords, sim):
    count = 0
    for token1 in sen_keywords:
        for token2 in aspect_keywords:
            if token1.similarity(token2) >= sim:
                count += 1 
    return count

In [106]:
def summary_of_homes_with_score(score, aspect):
    
    score_dic = {}
    score_df = sample2_rh_df[sample2_rh_df.scores_cleanliness == score]
    score_series = score_df.groupby(['home_id'])['review_id'].count()
    
    summary_df = pd.DataFrame(columns=['home_id','num_of_reviews', 'num_of_sents', 
                                           'aspect', 'num_of_sents_0.5', 
                                           'num_of_sents_0.6', 'num_of_sents_0.7'])

    for i in range(len(score_series)):
#     for i in range(4):
        
        print("home" + str(i))
        home_id = score8_series.index[i]
        num_of_reviews = score_series.values[i]
        num_of_sents = count_sents(home_id)[0]
        num_of_sents_05 = home_sents_process(home_id, aspect, 0.5)[0]
        num_of_sents_06 = home_sents_process(home_id, aspect, 0.6)[0]
        num_of_sents_07 = home_sents_process(home_id, aspect, 0.7)[0]

        summary_df.loc[i] = [home_id, num_of_reviews, num_of_sents, aspect, num_of_sents_05, 
                             num_of_sents_06, num_of_sents_07]

    return summary_df

def count_sents(home_id):
    # all comments together
    comments = ''.join(list(sample2_rh_df[sample2_rh_df.home_id == home_id].comments))
    doc = nlp(comments)
    
    return len(list(doc.sents)), doc
    
def home_sents_process(home_id, aspect, sim):
    
    doc = count_sents(home_id)[1]
    sents_score_dic = sents_sim_score(doc, aspect, sim)
    home_asp_sents = sents_score_dic[sents_score_dic['aspect_sim_score'] != 0].text
    
    return len(home_asp_sents), home_asp_sents

### Let's see the result of homes with cleanliness score `8.0`

In [109]:
score8_homes_reviews = summary_of_homes_with_score(8.0, 'cleanliness')
score8_homes_reviews

home0
home1
home2
home3
home4


KeyboardInterrupt: 

**********************************************************************************************
### Try a small sample: home 258571 which has 278 number of comments.

In [10]:
comment258571 = ''.join(list(sample2_rh_df[sample2_rh_df.home_id == 258571].comments))
doc = nlp(comment258571)

print("-" * 80 +
      "\nThere are in total " 
      + str(len(list(doc.sents))) 
      + " sentences in these 278 comments.\n" 
      + "-" * 80)

--------------------------------------------------------------------------------
There are in total 1478 sentences in these 278 comments.
--------------------------------------------------------------------------------


In [104]:
home258571_sent_score = sents_sim_score(doc, 'cleanliness', 0.7)
home258571_sent_scxore.head(5)
# home258571_cleanliness_text = [home258571_sent_score[k]['sentence'] for k in home258571_sent_score.keys() if home258571_sent_score[k]['aspect_sim_score'] != 0]

Unnamed: 0,sentence_id,aspect_sim_score,text
0,0,0,The apartment was exactly what we had expected...
1,1,0,"right in the heart of Capitol Hill, just off B..."
2,2,0,"Nick was prompt and very friendly, reachable e..."
3,3,1,The apartment was clean and quiet.
4,4,0,I would certainly stay here again.


In [105]:
a = home258571_sent_score[home258571_sent_score['aspect_sim_score'] != 0].text
len(a)

91

*******************

### Result summary

In [29]:
len(home258571_cleanliness_text)

627

In [30]:
print("-" * 80
      + "\nThere are " 
      + str(len(home258571_cleanliness_text))
      + "/"
      + str(len(list(doc.sents)))
      + " sentences talking about the cleanliness aspect.\n"
      + "-" * 80)

--------------------------------------------------------------------------------
There are 627/1478 sentences talking about the cleanliness aspect.
--------------------------------------------------------------------------------


In [26]:
home258571_cleanliness_text

['The apartment was clean and quiet.  ',
 'Great host, comfy, clean room with all you need in the most buzzing part of Seattle.  ',
 'Overall, incredibly good value and highly recommended!!The apartment was clean and really nice.  ',
 'Thanks!Nick does his best to provide the things you would need for a comfortable stay (clean sheets, towels, plates, wifi!!).',
 'Even though its at the back of the building and the giant living room windows look out onto an odd stone alley, the alley is clean and there is something very zen about the view from the living room.  ',
 'the kitchen needed a good scrubbing - it was dirty.  ',
 'The apartment was a little on the small side - so probably better for just 2 people - but it was clean and in a great neighborhood.',
 'Super clean and convenient.',
 'Comfortable, clean and quiet the apartment had just about everything you could ask for.  ',
 'The apartment was clean and comfortable, and you absolutely cannot beat the location!',
 'The apartment was 

**********************************************************************************************

In [19]:
def score_detail(sen_keywords, aspect_keywords):
    score = 0
    for token1 in sen_keywords:
        for token2 in aspect_keywords:
            if token1.similarity(token2) >= 0.7:
                print(token1.text, token2.text, token1.similarity(token2))
                score += 1
    return score

In [20]:
h = nlp(u"Nick was very communicative and gracious enough to transport me to and from the light rail upon my arrival and departure, and went the extra mile by leaving me maps, recommendations, and even printed out a bus route based on where I'd told him I planned on going.")
aspect_keywords = nlp(' '.join(aspect_keywords_dic['cleanliness']))

# print(len(sen_keywords), len(aspect_keywords))
score_detail(h, aspect_keywords)

0

In [21]:
def word_similarity(w1, w2):
    token1 = nlp(w1)
    token2 = nlp(w2)
    return token1.similarity(token2)

word_similarity('scrubbing', 'clean')

0.648041676443703

In [22]:
# print("Brief pre-processing logic for one sentence.")
# print("*" * 93 + "\nAn example:\n" + "-" * 80 + "\nThe first sentence:")
# print(list(doc.sents)[0].text)

# print("-" * 93 + "\nSentence keywords got from keyword_extraction function:")
# sen_keywords = list(keyword_extraction(list(doc.sents)[0].text).keys())
# print(sen_keywords)

# print("-" * 93 + "\nJoin sentence keywords together for further similarity use:")
# sen_keywords = ' '.join(sen_keywords)
# print(sen_keywords)
# print("-" * 93)

# **So, temporarily, I tried to use the count of the words in a sentance 
# that with lager than `0.7 `similarity score to determine the relevance.
# And it works well for now.**