In [7]:
import re
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt
import os
import nltk
import spacy

*******
### The logic for determining which aspect a sentence is talking about:

* `aspect_keywords_dic` is a dictionary that maps each aspect to its relevant keywords.

* First, I use the Word2Vec similarity algorithm to compute the similarity score between two words.

* For each sentence, the aspect-similarity score is incremented by `1` each time a word in the sentence has a word-similarity score of `0.7` or higher with a word in the keyword list (this threshold can be changed as needed).

* I also tried using the total of all pairwise similarity scores (word-in-sentence to word-in-keywords); however, the results were very strange.

In [8]:
# Load the spaCy pipeline `en_core_web_lg` once. The functions defined below
# call `nlp(...)` to parse text and compare tokens with `Token.similarity`.
nlp = spacy.load('en_core_web_lg')

In [9]:
# Maps each aspect name to the list of seed keywords that represent it.
# `sim_sents` joins an aspect's keywords with spaces and parses the result with `nlp`.
# NOTE(review): several entries are underscore-joined phrases (e.g. 'neck_of_the_woods');
# presumably they were copied from a lexical resource -- confirm that the tokenizer and
# word vectors handle them as intended.
aspect_keywords_dic = {
    'location': ['region', 'locality', 'neck_of_the_woods', 'location', 'vicinity',
                 'neighbourhood', 'neighborhood'],
    'cleanliness': ['tidy_up', 'straighten_out', 'cleanliness', 'clean', 
                    'neaten', 'square_away', 'straighten', 'clean_house', 'make_clean', 
                    'tidy', 'houseclean', 'clean_up', 'scavenge',
                   'soiled', 'unclean', 'colly', 'bemire', 'uncleanliness', 'soil', 'begrime',
                    'grime', 'untidy', 'dirty']
}

def sim_sents(doc, aspect, sim):
    """Return the text of every sentence in ``doc`` that relates to ``aspect``.

    A sentence is kept when ``compute_score`` finds at least one
    (sentence token, keyword token) pair whose similarity is ``>= sim``.

    Parameters
    ----------
    doc
        Parsed document exposing a ``.sents`` iterable of sentence spans.
    aspect : str
        Key into ``aspect_keywords_dic``.
    sim : float
        Similarity threshold forwarded to ``compute_score``.

    Returns
    -------
    list of str
        Texts of the matching sentences, in document order.

    Raises
    ------
    KeyError
        If ``aspect`` is not a key of ``aspect_keywords_dic``.
    """
    # The keyword document is the same for every sentence, so parse it once.
    aspect_keywords = nlp(' '.join(aspect_keywords_dic[aspect]))

    # Use a name distinct from the function itself to avoid shadowing it.
    matching_sents = []

    # Walk the sentences directly instead of re-materialising
    # ``list(doc.sents)`` on every step, which made the loop quadratic in the
    # number of sentences.
    for sentence in doc.sents:
        sentence_text = sentence.text
        sen_keywords = nlp(sentence_text)

        if compute_score(sen_keywords, aspect_keywords, sim) > 0:
            matching_sents.append(sentence_text)

    return matching_sents

def compute_score(sen_keywords, aspect_keywords, sim):
    """Count the token pairs whose similarity reaches the threshold.

    Every token of ``sen_keywords`` is compared with every token of
    ``aspect_keywords`` through ``token.similarity(other)``; each pair scoring
    ``>= sim`` adds one to the result.

    Parameters
    ----------
    sen_keywords
        Iterable of tokens from the sentence.
    aspect_keywords
        Iterable of tokens from the aspect keyword list.
    sim : float
        Inclusive similarity threshold.

    Returns
    -------
    int
        Number of (sentence token, keyword token) pairs at or above ``sim``.
    """
    return sum(
        1
        for sentence_token in sen_keywords
        for keyword_token in aspect_keywords
        if sentence_token.similarity(keyword_token) >= sim
    )

In [10]:
def asp_rlt_sents_of_homes(aspect, sample_df):
    """Summarise, per home, the review sentences related to ``aspect``.

    For every distinct ``home_id`` in ``sample_df`` the home's comments are
    parsed once with ``count_sents``; the resulting document is then filtered
    with ``sim_sents`` at the thresholds 0.5, 0.6 and 0.7.

    Parameters
    ----------
    aspect : str
        Aspect name forwarded to ``sim_sents``.
    sample_df : pandas.DataFrame
        Must provide the columns ``home_id`` and ``review_id`` (read here) and
        whatever ``count_sents`` reads.

    Returns
    -------
    pandas.DataFrame
        One row per home, indexed 0..n-1, with the columns ``home_id``,
        ``num_of_reviews``, ``num_of_sents``, ``aspect``,
        ``num_of_sents_0.5/0.6/0.7`` and ``sents_0.5/0.6/0.7``.
    """
    columns = ['home_id', 'num_of_reviews', 'num_of_sents',
               'aspect', 'num_of_sents_0.5', 'num_of_sents_0.6',
               'num_of_sents_0.7', 'sents_0.5', 'sents_0.6', 'sents_0.7']

    # Number of reviews per home; the index holds the home ids.
    score_series = sample_df.groupby(['home_id'])['review_id'].count()

    records = []
    for position, (home_id, num_of_reviews) in enumerate(score_series.items(), start=1):
        print("home" + str(position))

        # Parse the home's comments a single time. The parsed document does not
        # depend on the threshold, so it is shared by all three filters below
        # rather than being rebuilt for each of them.
        num_of_sents, doc = count_sents(home_id, sample_df)

        sents_05 = sim_sents(doc, aspect, 0.5)
        sents_06 = sim_sents(doc, aspect, 0.6)
        sents_07 = sim_sents(doc, aspect, 0.7)

        row = [home_id, num_of_reviews, num_of_sents, aspect,
               len(sents_05), len(sents_06), len(sents_07),
               sents_05, sents_06, sents_07]
        records.append(dict(zip(columns, row)))

    # Build the frame in one go instead of enlarging it row by row.
    return pd.DataFrame(records, columns=columns)

def count_sents(home_id, sample_df):
    """Parse all comments of one home as a single document and count its sentences.

    Parameters
    ----------
    home_id
        Value matched against ``sample_df.home_id``.
    sample_df : pandas.DataFrame
        Must provide the columns ``home_id`` and ``comments``.

    Returns
    -------
    tuple
        ``(number_of_sentences, doc)`` where ``doc`` is the result of calling
        ``nlp`` on the combined comments.
    """
    home_comments = sample_df.loc[sample_df.home_id == home_id, 'comments']

    # Keep string entries only (presumably missing comments show up as NaN).
    comments_list = [text for text in home_comments if isinstance(text, str)]

    # Separate consecutive reviews with a space. Joining with an empty string
    # glues the last word of one review to the first word of the next whenever
    # a review lacks terminal punctuation, corrupting tokens and sentence
    # boundaries.
    comments = ' '.join(comments_list)
    doc = nlp(comments)

    return len(list(doc.sents)), doc
    
def home_sents_process(home_id, sample_df, aspect, sim):
    """Collect the aspect-related sentences for one home.

    The home's comments are parsed via ``count_sents`` and the resulting
    document is filtered with ``sim_sents`` at threshold ``sim``.

    Returns
    -------
    tuple
        ``(number_of_matching_sentences, matching_sentences)``.
    """
    _, parsed_comments = count_sents(home_id, sample_df)
    matching = sim_sents(parsed_comments, aspect, sim)
    return len(matching), matching

### Let's see the results for homes with different cleanliness scores.

In [None]:
# Load data: "hr" stands for homes & reviews.
# The file is tab-separated; the "Unnamed: 0" column is dropped right after loading
# (presumably a previously saved index column -- confirm).
clean_score7_hr_df = pd.read_csv("../Data/clean_score7_df.csv", sep='\t', encoding="utf-8").drop("Unnamed: 0", axis=1)
print(len(clean_score7_hr_df))

# Collect the cleanliness-related sentences per home, save the summary as a
# tab-separated file, and preview the first three rows.
clean_score7_homes_sents = asp_rlt_sents_of_homes('cleanliness', clean_score7_hr_df)
clean_score7_homes_sents.to_csv('../Data/clean_hs_df_7.csv', sep='\t', encoding='utf-8')
clean_score7_homes_sents.head(3)

652
home1
home2
home3
home4
home5
home6
home7
home8
home9
home10
home11
home12
home13
home14
home15
home16
home17
home18
home19
home20
home21
home22
home23
home24
home25
home26
home27
home28
home29
home30
home31
home32
home33
home34
home35
home36
home37
home38
home39
home40
home41
home42
home43
home44
home45
home46
home47
home48
home49
home50
home51
home52
home53
