In [1]:
import re
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt
import os
import nltk
# use wordnet find the first level keywords
from nltk.corpus import wordnet as wn
import spacy

In [None]:
# Load Necessary Data: reviews
reviews_df = pd.read_csv("./Data/reviews.csv", encoding="utf-8")
reviews_df.columns = ['home_id', 'review_id', 'date', 'reviewer_id', 'reviewer_name', 'comments']
reviews_df.dropna()
reviews_df.head(2)

sample1_df = pd.read_csv("./Data/selected_data_for_research.csv", sep='\t', encoding="utf-8")
sample1_df = sample1_df.drop("Unnamed: 0", axis=1)
sample1_df.head(2)
# Merge the reviews and homes in the sample data.
df1 = sample1_df[['home_id', 'scores_cleanliness', 'scores_location']]
df2 = reviews_df[['home_id', 'review_id', 'comments']]
sample1_rh_df = pd.merge(df1, df2, on="home_id")
sample1_rh_df.head(3)
# sample1_rh_df.stack()[0].comments

# sample2_rh_df is a copy of sample1_rh_df to 
# in case unexpected modification for original data.
sample2_rh_df = sample1_rh_df
print('*' * 40 + '\nThere are:\n' + '-' * 40)
print(str(len(sample2_rh_df.groupby('home_id'))) + " Airbnb homes in total.\n" + '-' * 40)
print(str(len(sample2_rh_df)) + " reviews in total.\n" + '-' * 40)

*******
### Here, the logic of determine the aspect that a sentence talking about is:

* `aspect_keywords_dic` is a dictionary contains aspects and their relevant keywords.

* Firstly, I use the Word2Vec similarity algorithm to count the similarity score between two words.

* For each sentence, the aspect-similarity-score will `+1` when one word in it has the word-similarity-score with any word in keyword list lager than `0.7`(this number can change based on needs). 

* I tried use the total scores of all vectors(word-in-sentence to word-in-keywords), however, the results are very strange.

In [2]:
nlp = spacy.load('en_core_web_md')

In [3]:
aspect_keywords_dic = {
    'location': ['region', 'locality', 'neck_of_the_woods', 'location', 'vicinity',
                 'neighbourhood', 'neighborhood'],
    'cleanliness': ['tidy_up', 'straighten_out', 'cleanliness', 'clean', 
                    'neaten', 'square_away', 'straighten', 'clean_house', 'make_clean', 
                    'tidy', 'houseclean', 'clean_up', 'scavenge',
                   'soiled', 'unclean', 'colly', 'bemire', 'uncleanliness', 'soil', 'begrime',
                    'grime', 'untidy', 'dirty']
}

def sim_sents(doc, aspect, sim):
    sim_sents = []
    aspect_keywords = nlp(' '.join(aspect_keywords_dic[aspect]))
    
    for which_sen in range(len(list(doc.sents))):
        new_doc = list(doc.sents)[which_sen].text        
        sen_keywords = nlp(new_doc)
        
        if compute_score(sen_keywords, aspect_keywords, sim) > 0:
            sim_sents.append(new_doc)
    
    return sim_sents

def compute_score(sen_keywords, aspect_keywords, sim):
    count = 0
    for token1 in sen_keywords:
        for token2 in aspect_keywords:
            if token1.similarity(token2) >= sim:
                count += 1 
    return count

In [4]:
def asp_rlt_sents_of_homes(aspect, sample_df):
    
    score_dic = {}
    score_name = 'scores_'+aspect
    score_series = sample_df.groupby(['home_id'])['review_id'].count()
    summary_df = pd.DataFrame(columns=['home_id','num_of_reviews', 'num_of_sents',
                                       'aspect', 'num_of_sents_0.5', 'num_of_sents_0.6', 
                                       'num_of_sents_0.7', 'sents_0.5', 'sents_0.6', 'sents_0.7'])

    for i in range(len(score_series)):
        
        print("home" + str(i+1))
        home_id = score_series.index[i]
        num_of_reviews = score_series.values[i]
        num_of_sents = count_sents(home_id)[0]
        num_of_sents_05, sents_05 = home_sents_process(home_id, aspect, 0.5)
        num_of_sents_06, sents_06 = home_sents_process(home_id, aspect, 0.6)
        num_of_sents_07, sents_07 = home_sents_process(home_id, aspect, 0.7)

        summary_df.loc[i] = [home_id, num_of_reviews, num_of_sents, aspect, num_of_sents_05, 
                             num_of_sents_06, num_of_sents_07, sents_05, sents_06, sents_07]

    return summary_df

def count_sents(home_id):
    # all comments together
    comments_list = [x for x in list(sample2_rh_df[sample2_rh_df.home_id == home_id].comments) if isinstance(x, str)]
    comments = ''.join(comments_list)
    doc = nlp(comments)
    
    return len(list(doc.sents)), doc
    
def home_sents_process(home_id, aspect, sim):
    doc = count_sents(home_id)[1]
#     sents_score_dic = sents_sim_score(doc, aspect, sim)
#     home_asp_sents = sents_score_dic[sents_score_dic['aspect_sim_score'] != 0].text.values
    home_asp_sents = sim_sents(doc, aspect, sim)

    return len(home_asp_sents), home_asp_sents

### Let's see the result of homes with different cleanliness score.

In [5]:
# Load Data: hr means homes&reviews
loc_score10_hr_df = pd.read_csv("./Data/loc_score10_df.csv", sep='\t', encoding="utf-8").drop("Unnamed: 0", axis=1)
print(len(loc_score10_hr_df))
loc_score10_homes_sents = asp_rlt_sents_of_homes('location', loc_score10_hr_df)
loc_score10_homes_sents.to_csv('./Data/loc_hs_df_10.csv', sep='\t', encoding='utf-8')
loc_score10_homes_sents.head(3)

1466
home1


NameError: name 'sample2_rh_df' is not defined