# Review

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils, plot_help

from functools import reduce



%matplotlib inline

In [2]:
#tune read_limit (and chunk_size) to what the available memory can hold
#NOTE(review): read_limit=-1 presumably means "no limit" - confirm in utils.chunk_loader
df = utils.chunk_loader(
    'data/review.json',
    read_limit=-1,
)

In [3]:
#pull the calendar year out of the review timestamp and keep it as a string,
#which is easier to manipulate further down
df['year'] = (
    df['date']
    .dt.year
    .astype('str')
)
df.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,year
0,ujmEBvifdJM6h6RLv4wQIg,0,2013-05-07 04:34:36,1,Q1sbwvVQXV2734tPgoKj4Q,1,Total bill for this horrible service? Over $8G...,6,hG7b0MtEbXx5QzbzE6C_VA,2013
1,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,GJXCdrto3ASJOqKeVWPi6Q,5,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg,2017
2,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,2TzJjDVDEuAW6MR5Vuc1ug,5,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw,2016
3,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,yi0R0Ugj_xUx_Nek0-_Qig,5,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg,2018
4,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,11a8sVPMUFtaC7_ABRkmtw,1,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ,2018


## Group by business ID + Year and take the mean of reactions to the business

In [4]:
#take the average by year + business id
#numeric_only=True restricts the mean to the numeric reaction columns
#(cool, funny, stars, useful); pandas >= 2.0 no longer drops the string
#columns (text, review_id, user_id) silently and raises TypeError instead
review_year = (
    df.groupby(by=['business_id', 'year'], as_index=False)
      .mean(numeric_only=True)
)
#sort
review_year = review_year.sort_values(by=['business_id', 'year'])
review_year.head()

Unnamed: 0,business_id,year,cool,funny,stars,useful
0,--1UhMGODdWsrMastO9DZw,2016,1.0,0.0,3.818182,1.363636
1,--1UhMGODdWsrMastO9DZw,2017,0.454545,0.090909,4.636364,0.181818
2,--1UhMGODdWsrMastO9DZw,2018,0.0,0.0,3.25,0.0
3,--6MefnULPED_I942VcFNA,2008,7.0,8.0,5.0,8.0
4,--6MefnULPED_I942VcFNA,2010,3.0,1.333333,2.333333,4.0


## It is said that a business is successful if it can survive the test of time. We can use the review data to infer how a business is doing over the years. 
## For example, we can measure the average change in star rating over the years. In the absence of data in the year prior, assume the rating is unchanged since the last time it was recorded.

In [5]:
#wide view: one row per business, one column per year, mean star rating as the value
#arguments are passed by keyword because DataFrame.pivot is keyword-only since pandas 2.0
review_year.pivot(index='business_id', columns='year', values='stars').head()

year,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
--1UhMGODdWsrMastO9DZw,,,,,,,,,,,,,3.818182,4.636364,3.25
--6MefnULPED_I942VcFNA,,,,,5.0,,2.333333,3.0,3.0,2.5,3.0,3.444444,3.75,3.0,3.25
--7zmmkVg-IMGaXbuVd0SQ,,,,,,,,,,,4.0,3.933333,4.142857,3.611111,4.0
--8LPVSo5i0Oo61X01sV9A,,,,,,,,,,,5.0,,3.0,5.0,1.0
--9QQLMTbFzLJ_oT-ON3Xw,,,,,,,,,,4.5,5.0,5.0,2.4,3.0,3.0


In [6]:
def feature_year_change(df_year, index, columns, values):
    """Average change of one feature between consecutive recorded periods.

    The long-format frame is pivoted so that every row is one ``index``
    value (e.g. a business) and every column one ``columns`` value (e.g. a
    year). For each row the missing periods are dropped, so a value is always
    compared with the last one that was actually recorded, and the mean of
    the consecutive differences is returned.

    Parameters
    ----------
    df_year : pandas.DataFrame
        Long-format frame holding at most one row per (index, columns) pair.
    index : str
        Column whose values become the rows of the pivot.
    columns : str
        Column whose values become the (sorted) columns of the pivot.
    values : str
        Column holding the feature whose change is measured.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'index'`` with the pivot's row labels and ``values``
        with the average change. Rows with fewer than two recorded values
        have no change to measure and get ``0.0``.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.pivot`` when an (index, columns) pair
        occurs more than once.
    """
    #pivot table to evaluate change in the feature
    df_year_feature = df_year.pivot(index=index,
                                    columns=columns,
                                    values=values)

    feature_change = []
    #rows are handled one at a time: they have different numbers of recorded
    #values, and stacking such ragged rows into a single ndarray raises
    #ValueError on NumPy >= 1.24
    for row in df_year_feature.values:
        #drop null values
        recorded = row[~pd.isna(row)]

        #two recorded values already define one change (np.diff has length 1);
        #with fewer there is nothing to difference, so report no change
        if len(recorded) >= 2:
            feature_change.append(np.mean(np.diff(recorded)))
        else:
            feature_change.append(0.0)

    #return as pandas object
    return pd.DataFrame.from_dict({'index': df_year_feature.index.tolist(),
                                   values: feature_change})


In [7]:
#one average-change table per reaction feature, in the order listed
feat_year_list = [
    feature_year_change(review_year, 'business_id', 'year', feature)
    for feature in ['cool', 'funny', 'stars', 'useful']
]

In [8]:
#preview the first table in the list, i.e. the one built for 'cool'
feat_year_list[0].head()

Unnamed: 0,index,cool
0,--1UhMGODdWsrMastO9DZw,-0.5
1,--6MefnULPED_I942VcFNA,-0.736111
2,--7zmmkVg-IMGaXbuVd0SQ,0.272727
3,--8LPVSo5i0Oo61X01sV9A,0.0
4,--9QQLMTbFzLJ_oT-ON3Xw,-0.2


In [9]:
#fold the per-feature tables into one frame, joining each new table
#onto the running result through the shared 'index' column
df_year_change = reduce(
    lambda merged, nxt: merged.merge(nxt, on='index'),
    feat_year_list,
)

#give the columns their final names
#NOTE(review): 'useful' has no entry in this mapping, so it keeps its name
#instead of becoming 'useful_change' - confirm whether that is intended
df_year_change = df_year_change.rename(
    columns={
        'index': 'business_id',
        'cool': 'cool_change',
        'funny': 'funny_change',
        'stars': 'stars_change',
    }
)

df_year_change.head()

Unnamed: 0,business_id,cool_change,funny_change,stars_change,useful
0,--1UhMGODdWsrMastO9DZw,-0.5,0.0,-0.284091,-0.681818
1,--6MefnULPED_I942VcFNA,-0.736111,-0.847222,-0.194444,-0.763889
2,--7zmmkVg-IMGaXbuVd0SQ,0.272727,0.136364,0.0,0.181818
3,--8LPVSo5i0Oo61X01sV9A,0.0,0.0,-1.333333,-1.333333
4,--9QQLMTbFzLJ_oT-ON3Xw,-0.2,0.0,-0.3,-0.1


In [10]:
#write the per-business change table to disk
#NOTE(review): index is left at its default, so pandas also writes the row
#index as an unnamed first column - pass index=False if that is not wanted
df_year_change.to_csv('data/cleaned/review_year_change.csv')