In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import csv
import pickle
import gensim
import warnings
warnings.filterwarnings('ignore')  # to suppress warnings



In [2]:
# Load the review data (one row per zipcode) from the shared data folder.
# Dataset accessible from https://drive.google.com/open?id=1GbFVIStnd2xVnFTFE43KGkveO7w066aW
filePath = "../Data/"
# NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load a copy of this dataset that you trust.
# The `with` block guarantees the file handle is closed once loading finishes.
with open(filePath + "balanced_review_by_zipcode.p", "rb") as reviewFile:
    review_by_zipcode = pickle.load(reviewFile)

## Get word count for each review

In [3]:
# Load the project's text-cleaning helpers straight from their source file,
# since ../toolkit is not an installed package.
import importlib.util

# Describe the module (import name + file location) ...
spec = importlib.util.spec_from_file_location(
    name="nltk_pack",
    location="../toolkit/nltk_pack.py",
)
# ... create an empty module object from that description ...
pack = importlib.util.module_from_spec(spec)
# ... and execute the file's code inside it so `pack.<name>` becomes usable.
spec.loader.exec_module(pack)

In [4]:
# Word-frequency helper; operates on a single row of review_by_zipcode.
def getWordCt(rev):
    """Return the word-frequency table for the review text of one row.

    The row's "review" entry is wrapped in a one-element list and handed to
    `pack.getwordlist`, which does the text cleaning and counting.

    @param rev  one row of review_by_zipcode; only rev["review"] is read
    @returns    whatever `pack.getwordlist` produces -- per the original note,
                a df whose first column holds the unique words and whose
                second column holds each word's frequency
    """
    return pack.getwordlist([rev["review"]])

# Compare all words against each word in the dictionary of user-defined words related to safety
# and output a similarity score for each dict word per row in the review_by_zipcode
def getSimilarityScore(rev, myDict):
    """Score one review row against every benchmark word.

    For each unique word in the row's review text, the word2vec similarity to
    each benchmark word is weighted by the word's frequency. The weighted
    scores are summed per benchmark word and divided by `rev.ct`.

    Relies on the notebook-level word-vector model `wv` being loaded before
    this function is called.

    @param rev     one row of review_by_zipcode; reads rev["review"] (through
                   getWordCt) and rev.ct
    @param myDict  a list of benchmark words related to safety
    @returns       a vector of length len(myDict), one score per benchmark word
    """
    wordCt = getWordCt(rev)

    # initialize similarity matrix: nRow = number of unique words from wordCt,
    # nCol = number of words in the dictionary
    simMat = np.zeros((wordCt.shape[0], len(myDict)))

    # Read each word/frequency pair once instead of re-indexing the DataFrame
    # for every benchmark word. The word is in the "index" column and the
    # frequency in the second column.
    wordFreqPairs = zip(wordCt["index"], wordCt.iloc[:, 1])

    for i, (word, freq) in enumerate(wordFreqPairs):
        for j, keyWd in enumerate(myDict):
            try:
                # similarity between the review word and the benchmark word,
                # weighted by how often the review word occurs
                simMat[i, j] = wv.similarity(word, keyWd) * freq
            except KeyError:
                # The word is missing from the word2vec vocabulary: score it 0.
                # Only KeyError is caught so that real failures and Ctrl-C are
                # no longer silently swallowed.
                simMat[i, j] = 0

    # Collapse to one score per benchmark word and normalize by rev.ct.
    # NOTE(review): a zero or NaN rev.ct propagates as NaN/inf here rather than
    # raising -- presumably the source of NaN rows downstream; confirm.
    simScores = np.sum(simMat, axis=0) / rev.ct  # 1 x k vector, k = size of dictionary
    return simScores

## Calculate similarity features for all zipcodes

In [5]:
# Benchmark words each review is scored against (one feature column per word).
safetyKeyWd = ["good", "safe","night", "walk", "unsafe", "bad", "dangerous"]

# load pre-trained word2vec model from Google
# model accessible from https://drive.google.com/open?id=1bKML3D_7AQfoZg_z7ju--WoMMS-huPBY
# NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load a copy of the model that you trust.
# The `with` block guarantees the file handle is closed once loading finishes.
with open(filePath + "wv.p", "rb") as wvFile:
    wv = pickle.load(wvFile)

nRow = review_by_zipcode.shape[0]

# numeric representation of the similarity scores: one row per zipcode row,
# one column per benchmark word
textFeatures = np.zeros((nRow, len(safetyKeyWd)))

# calculate and aggregate similarity score for reviews corresponding to all zipcodes.
# reset_index() gives a 0..nRow-1 index, so `i` doubles as the row position
# in textFeatures.
for i, row in review_by_zipcode.reset_index().iterrows():
    textFeatures[i, :] = getSimilarityScore(row, myDict=safetyKeyWd)
    # report progress every 15 rows
    if (i + 1) % 15 == 0:
        print("%d%% done" % ((i + 1) / nRow * 100))

5% done
10% done
15% done
20% done
25% done
30% done
35% done
40% done
45% done
50% done
55% done
60% done
65% done
70% done
76% done
81% done
86% done
91% done
96% done


In [8]:
# Label the raw score matrix: rows carry the index of review_by_zipcode,
# columns carry the benchmark words.
textFeatureDF = pd.DataFrame(
    data=textFeatures,
    index=review_by_zipcode.index,
    columns=safetyKeyWd,
)
# Preview the first few rows.
textFeatureDF.head()

Unnamed: 0_level_0,good,safe,night,walk,unsafe,bad,dangerous
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10019,,,,,,,
10023,,,,,,,
37738,,,,,,,
60601,,,,,,,
90001,3.635028,2.55469,1.890825,2.229291,1.415911,2.522453,1.730858


In [7]:
# Write result to local storage.
# The `with` block closes (and therefore flushes) the file as soon as the dump
# finishes, instead of leaving that to garbage collection.
with open(filePath + "textFeatures.p", "wb") as outFile:
    pickle.dump(textFeatures, outFile)

In [12]:
# Also export the labelled feature table as CSV (with the header row) next to the pickle.
textFeatureDF.to_csv(path_or_buf=filePath + "textFeatures.csv", header=True)

In [18]:
# for testing: check how the model reacts to a word outside its vocabulary
try:
    print(wv.similarity("safe", "tanya"))
except KeyError:
    # Only the missing-word error is expected here; anything else should surface.
    print("Word do not exist")

Word do not exist


In [11]:
# for testing
# Look up the row for zipcode 37738. This is not the cell's last expression,
# so its value is evaluated but not displayed.
review_by_zipcode.loc[37738,]
# Count the NaN entries in each benchmark-word column of the feature table
# (this is the value the cell displays).
textFeatureDF.isna().sum(axis = 0)

good         17
safe         17
night        17
walk         17
unsafe       17
bad          17
dangerous    17
dtype: int64