In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from statsmodels import graphics
from patsy import dmatrices
import sys
# Platform probe: sys.getwindowsversion only exists on Windows, so the
# AttributeError raised elsewhere is what flips the flag to False.
isWindows = True
try:
    sys.getwindowsversion()
except AttributeError:
    isWindows = False

# On Windows, raise this kernel process to HIGH_PRIORITY_CLASS through pywin32.
if isWindows:
    import win32api
    import win32con
    import win32process

    pid = win32api.GetCurrentProcessId()
    handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
    win32process.SetPriorityClass(handle, win32process.HIGH_PRIORITY_CLASS)
# Default size (width, height in inches) for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = [15, 10]

  from pandas.core import datetools


# Read data

In [2]:
# Load the raw review table; the path is relative to the notebook's working directory.
reviews = pd.read_csv('data/reviews.csv')

# Merge external datasets to add longitude and latitude, then compute the distance in miles

In [3]:
# Load the geocoded lookup tables: restaurant coordinates keyed by 'address',
# user coordinates keyed by 'localtion' (the column name is spelled that way in the data).
geocoded_restaurant = pd.read_csv('data/geocoded_restaurant2.csv')
geocoded_user = pd.read_csv('data/geocoded_user2.csv')

# Left joins keep every review even when a key has no geocoded match.
reviews = (
    reviews
    .merge(geocoded_restaurant, on='address', how='left')
    .merge(geocoded_user, on='localtion', how='left')
)

In [4]:
#find distance between user and restaurant
def haversine_np(lon1, lat1, lon2, lat2, radius=3956):
    """Great-circle distance between two points given in decimal degrees.

    Implements the haversine formula using NumPy ufuncs only, so the
    coordinates may be scalars, NumPy arrays or pandas Series; array-like
    inputs are combined element-wise.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point, in degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point, in degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 3956, the Earth radius in miles.

    Returns
    -------
    float or array-like
        Distance along the surface of the sphere. NaN coordinates
        propagate to a NaN distance.
    """
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN. Clamp it;
    # np.maximum/np.minimum still propagate genuine NaN inputs.
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

In [5]:
# Distance in miles between each restaurant and the reviewing user.
# haversine_np is composed of NumPy ufuncs, so whole columns are passed at once
# instead of calling it once per row through DataFrame.apply(axis=1).
# Rows whose coordinates are missing come out as NaN.
reviews['Rev_Dist_Miles'] = haversine_np(reviews['Rest_long'],
                                         reviews['Rest_lat'],
                                         reviews['User_long'],
                                         reviews['User_lat'])

# Rename some columns for better plots and write-up

In [6]:
# Raw column name -> presentation name used in the plots and the write-up.
column_labels = {
    "friends": "Friends",
    "has_photo": "Profile_Pic_Bool",
    "photos": "Tot_Photos",
    "rating": "User_Rating",
    "reviews": "User_Tot_Reviews",
    "numSentence": "Review_Sentence",
    "totSentiment": "Rev_Tot_Sentiment",
    "avgSentiment": "Rev_Avg_Sentiment",
    "regular": "Recommended",
    "n_reviews": "Tot_Rest_Reviews",
    "number_restaurants": "Tot_Rest_In_City",
    "restaurant_number": "Yelp_Rest_Order",
    "regularRatio": "Ratio_Recommended",
    "city": "City",
    "restaurant_id": "Restaurant_ID",
}
reviews = reviews.rename(columns=column_labels)

# Features Transformations

In [7]:
# Columns that are replaced by their log(x + 1) transform.
log_columns = ['Friends',
               'Days_Since_10_2004',
               'Review_Sentence',
               'Tot_Photos',
               'User_Tot_Reviews',
               'Rev_Tot_Sentiment',
               'Tot_Rest_Reviews',
               'Tot_Rest_In_City',
               'Review_Words',
               'Review_Words_No_Stopwords',
               'Rev_Dist_Miles']

# np.log1p(x) evaluates log(1 + x) and stays accurate for x close to zero.
# It is applied to all listed columns in one vectorised step.
# NOTE(review): any value <= -1 (possible for Rev_Tot_Sentiment?) turns into
# NaN/-inf here - confirm these inputs are non-negative.
log_features = np.log1p(reviews[log_columns]).add_suffix('_Log')

# Swap the raw columns for their '<name>_Log' counterparts. The new columns are
# appended after the remaining ones, in the order given by log_columns.
# (Interaction and squared-term experiments that used to sit here as
# commented-out code were removed; the accompanying note said they were not needed.)
reviews = pd.concat([reviews.drop(columns=log_columns), log_features], axis=1)

# Remove Columns Not Needed for Analysis

In [8]:
# Remove the merge keys, the raw coordinates (already condensed into the distance
# feature), the city label and the raw word count.
# NOTE(review): the earlier note on this cell said City should be kept as a class
# for later modelling, yet it is dropped here - confirm which is intended.
unused_columns = ['localtion', 'address', 'Rest_long', 'Rest_lat',
                  'User_long', 'User_lat', 'City', 'numWords']
reviews = reviews.drop(columns=unused_columns)

In [9]:
# Export the full (not yet balanced) feature table for later geolocation analysis, perhaps map clustering.
# NOTE(review): the longitude/latitude columns were dropped in the preceding cell, so this file
# only carries the derived distance feature - confirm that is sufficient for the geo work.
reviews.to_csv('data/1_reviews_complete.csv', index=False)

# Balance the data

In [10]:
# Down-sample within every restaurant so that recommended (1) and
# not-recommended (0) reviews appear in equal numbers. Restaurants that lack
# one of the two classes are skipped entirely.
# The per-restaurant samples are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
balanced_groups = []
for restaurant_id, group in reviews.groupby('Restaurant_ID'):
    recommended = group[group['Recommended'] == 1]
    not_recommended = group[group['Recommended'] == 0]
    take_n = min(len(recommended), len(not_recommended))
    if take_n == 0:
        continue
    # Fixed random_state keeps the sampled subset reproducible across runs.
    reg_reviews = recommended.sample(n=take_n, random_state=0)
    not_reviews = not_recommended.sample(n=take_n, random_state=0)
    group_reviews = pd.concat([reg_reviews, not_reviews])

    assert group_reviews['Recommended'].mean() == .5
    balanced_groups.append(group_reviews)

if not balanced_groups:
    raise ValueError("No restaurant has both recommended and not-recommended reviews; "
                     "nothing to balance.")
balanced_reviews = pd.concat(balanced_groups)

assert balanced_reviews['Recommended'].mean() == .5
len(balanced_reviews)

53292

In [11]:
# Sanity check: share of recommended reviews after balancing (0.5 means the classes are even).
balanced_reviews['Recommended'].mean()

0.5

# Export balanced_reviews for use in R and SAS

In [12]:
# Written before the float32 cast and the min-max scaling further down, so this file
# holds the balanced data in its original dtypes, unscaled, and still includes Restaurant_ID.
balanced_reviews.to_csv('data/balanced_reviews.csv', index=False)

# At this point all variables should be numerical!

In [13]:
# Cast every column (including Restaurant_ID and Recommended) to float32.
# This doubles as a check that only numeric columns remain: a leftover text
# column would make the cast fail.
balanced_reviews = balanced_reviews.astype(np.float32)

# Cook's distance

Cook's distance is an estimate of the influence each data point has on a fitted regression model.

# No Cook's distance for now because it takes too much time

In [14]:
# start = time.time()
# features = balanced_reviews.columns.tolist()
# features.remove('Recommended')
# features = '+'.join(features)
# model = smf.ols("Recommended ~ %s" % features, balanced_reviews).fit()

# influence = model.get_influence()
# influence_summary = influence.summary_frame()
# influence_summary.head()
# time.strftime("%H:%M:%S", time.gmtime(time.time() - start))

# Plot Cook's distance

In [15]:
# plt.plot(influence_summary.index, influence_summary['cooks_d'], 'o')
# plt.axhline(y=60/(len(balanced_reviews)-1), color='r', linestyle='-')
# plt.axhline(y=.0005, color='r', linestyle='-')

# Filter outliers

In [16]:
# balanced_reviews = balanced_reviews[influence_summary['cooks_d'] < 0.005]

In [17]:
# balanced_reviews.reset_index(inplace = True)
# del balanced_reviews['index']
# len(balanced_reviews)

In [18]:
# Snapshot of the balanced, float32-cast data before min-max scaling.
# The Cook's-distance outlier filter above is commented out, so no rows have been
# removed since balancing.
balanced_reviews.to_csv('data/2_not_scaled_data.csv', index=False)

In [19]:
# Sanity check before scaling: the share of recommended reviews should still be 0.5.
balanced_reviews['Recommended'].mean()

0.5

# Scale data

In [20]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))

# The target is taken out before scaling and re-attached afterwards. Per the
# original note, scaling a target that already spans 0 to 1 would give the same
# values, so it is simply left alone.
regular = balanced_reviews.pop('Recommended')

columns = balanced_reviews.columns
# fit_transform returns a bare array, so the column labels are restored explicitly.
balanced_reviews = pd.DataFrame(scaler.fit_transform(balanced_reviews), columns=columns)

# Assign by position (.values): the rebuilt frame has a fresh index that no
# longer matches the index of the popped target.
balanced_reviews['Recommended'] = regular.values

# Export data

In [21]:
# Restaurant_ID was only needed as the grouping key for balancing; leave it out of the final data.
balanced_reviews = balanced_reviews.drop(columns=['Restaurant_ID'])

In [22]:
# Final export: balanced, min-max scaled features plus the Recommended target, without Restaurant_ID.
balanced_reviews.to_csv('data/ready_data.csv', index=False)

In [23]:
# Final sanity check on the exported data: the share of recommended reviews should still be 0.5.
balanced_reviews['Recommended'].mean()

0.5