### Final Project 

In [2]:
# import all the packages used 

import pandas as pd
import json
import numpy as np
import requests 
import requests_cache
import time
import lxml.html as lx
import xlwt 

requests_cache.install_cache("yelp")

import plotnine
from plotnine import *


In [None]:
# code to get all the businesses in New York 
def read_key(keyfile):
    """
    Return the first line of the text file `keyfile` with its newline character(s) removed.
    """
    with open(keyfile) as handle:
        first_line = handle.readline()
    return first_line.strip("\n")
    
# Load the Yelp API key from disk and build the authorization header used for API requests.
key = read_key("../Documents/yelp_api_2.txt")
headers = {'Authorization': 'Bearer ' + key}


# Fetch restaurant listings for one location from the Yelp business-search endpoint.
def get_business(loc, num):
    """
    Download up to `num` restaurant records for `loc` from the Yelp business-search API.

    Parameters
    ----------
    loc : str
        Value sent as the API's "location" parameter, for example "New York".
        It is also stored in the "state" column of the result.
    num : int
        Number of restaurants wanted. Results are requested in pages of at most 50.

    Returns
    -------
    pandas.DataFrame
        One row per business with the fields returned by the API plus a "state"
        column. An empty dataframe is returned when no page could be fetched.
    """
    url = "https://api.yelp.com/v3/businesses/search"
    page_size = 50
    # The header only depends on the module-level key, so build it once.
    headers = {'Authorization': 'Bearer %s' % key}
    pages = []
    n = 0
    # Strictly less than: offset `num` would already be one page past what was asked for.
    while n < num:
        req = requests.get(url, headers=headers, params={
            "term": "restaurants | food",
            "location": loc,
            "limit": min(page_size, num - n),
            "offset": n})
        try:
            businesses = req.json()["businesses"]
        except KeyError:
            # The response has no "businesses" entry (for example an error payload):
            # stop paging, keep what has been collected, and say why.
            print("stopped at offset %d: HTTP status %s" % (n, req.status_code))
            break
        if not businesses:
            # An empty page means there is nothing further to fetch.
            break
        pages.append(pd.DataFrame(businesses))
        n += page_size
        print(n)
        print(loc)
    if not pages:
        return pd.DataFrame()
    # Concatenate once at the end; DataFrame.append no longer exists in pandas 2.x.
    df = pd.concat(pages, ignore_index=True)
    df["state"] = loc
    return df

# Download the first 1000 New York restaurants and store them in a CSV file.
ny_rst = get_business(loc="New York", num=1000)
ny_rst.to_csv('ny_rst.csv')

In [None]:
# Reload the saved restaurant table and take the Yelp page URL of every restaurant.
ny_rst = pd.read_csv("ny_rst.csv")
ny_url = ny_rst.loc[:, "url"]

def check_url(link):
    """
    Request `link`, pause for a random 0.5-1.5 seconds, and report the HTTP status.

    The pause length is printed. The returned string has the form
    'The status code is <code>'; a code of 200 means the link is working.
    """
    response = requests.get(link)
    pause = np.random.uniform(low=0.5, high=1.5)
    time.sleep(pause)
    print(pause)
    return 'The status code is {}'.format(response.status_code)

# Request every restaurant URL once to confirm none is broken (no 404 errors);
# the original run found all of them valid.
result = [check_url(link) for link in ny_url]

In [None]:
# code to scrape the information of each restaurant:

# Reload the restaurant table; the scraper needs each restaurant's page URL and id.
ny_rst = pd.read_csv("ny_rst.csv")
ny_url, ny_id = ny_rst["url"], ny_rst["id"]

def save_html(url):
    """
    Download `url` and return it as a parsed lxml HTML tree.

    Waits a random 0.5-1.5 seconds before sending the request, raises through
    raise_for_status() when the response has an error status, and rewrites the
    links of the document to absolute URLs based on `url`.
    """
    # Random delay so consecutive requests are not sent back-to-back.
    time.sleep(np.random.uniform(low = 0.5, high = 1.5))

    page = requests.get(url)
    page.raise_for_status()

    tree = lx.fromstring(page.text)
    tree.make_links_absolute(url)
    return tree

def one_row(row):
    """
    Turn one row of the attribute table into an (attribute, value) tuple.

    The attribute is the stripped text of the first <dt> element under `row`
    and the value is the stripped text of the first <dd> element.
    """
    name_cell = row.xpath(".//dt")[0]
    value_cell = row.xpath(".//dd")[0]
    return name_cell.text_content().strip(), value_cell.text_content().strip()


def save_review(html):
    """
    Collect the text of every <p lang="en"> element in `html`; the notebook
    treats these paragraphs as the customer reviews of the page.

    Returns a list with one stripped string per matching paragraph.
    """
    reviews = []
    for paragraph in html.xpath("//p[@lang  = 'en']"):
        reviews.append(paragraph.text_content().strip())
    return reviews

def get_everything(html, biz_id):
    """
    Extract the reviews, attribute table and review ratings of one restaurant page.

    Parameters
    ----------
    html : parsed HTML document of the restaurant page (as returned by save_html).
    biz_id : identifier of the restaurant, stored in the result so that rows can
        be joined with other tables later.

    Returns
    -------
    dict with the keys
        "attributes"    : list of (name, value) tuples that always ends with
                          ("biz_id", biz_id); it holds only that pair when the
                          page has no attribute table.
        "review"        : list of review texts from save_review.
        "review rating" : list of the title values matched by the rating xpath.
        "biz_id"        : the biz_id passed in.
    """
    # get reviews
    review = save_review(html)

    # get attributes
    tab = html.xpath("//div[@class  = 'ywidget']//div[contains(@class, 'short-def-list')]")

    # Always build a list of pairs so "attributes" has the same shape whether or
    # not the page has an attribute table; only the first matching table is used.
    attrib = [one_row(r) for r in tab[0]] if tab else []
    attrib.append(("biz_id", biz_id))

    # get review rating
    review_rating = html.xpath("//div[@class = 'biz-rating biz-rating-large clearfix']/div//div/@title")

    return {"attributes" : attrib, "review" : review, "review rating" : review_rating, "biz_id": biz_id}

In [None]:
# final version

# Download and parse the page of each of the first 1000 restaurants.
html = [save_html(u) for u in ny_url[0:1000]]
# The pages come from rows 0-999 of ny_rst, so the ids must be taken from those
# same rows (not offset by 1000); zip keeps every page paired with its own id.
everything = [get_everything(page, biz) for page, biz in zip(html, ny_id[0:1000])]
everything = pd.DataFrame(everything)
everything.to_csv(r'everything.csv')


In [None]:
# Load the scraped restaurant table and keep its review and business-id columns.
total = pd.read_csv("/Users/apple/Desktop/everything.csv")
reviews, bus_id = total['review'], total['biz_id']

import nltk
import xlrd
import string
import nltk.corpus
from nltk.corpus import wordnet

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import re
nltk.download('stopwords')
import nltk.corpus as corpus
nltk.download('vader_lexicon')
import re

stopwords = corpus.stopwords.words("english")

import ast 
from statistics import mean
import itertools
from itertools import chain
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def get_adj_and_adv(text):
    """
    Return the words of `text` whose part-of-speech tag is 'JJ' (adjective)
    or 'RB' (adverb).

    The text is lower-cased and word-tokenized before it is tagged.
    """
    tagged = nltk.pos_tag(word_tokenize(text.lower()))
    wanted_tags = ('JJ', 'RB')
    selected = []
    for token, pos in tagged:
        if pos in wanted_tags:
            selected.append(token)
    return selected

def get_noun(text):
    """
    Return the words of `text` whose part-of-speech tag is exactly 'NN' (noun).

    The text is lower-cased and word-tokenized before it is tagged.
    """
    text_lower = text.lower()
    text_token = word_tokenize(text_lower)
    result_tags = nltk.pos_tag(text_token)

    # Compare for equality: ('NN') without a trailing comma is just the string
    # 'NN', so `tag in ('NN')` is a substring test that also accepts 'N' or ''.
    words = [word for word, tag in result_tags if tag == 'NN']
    return words

def nltk_sentiment(sentence):
    """
    Return the VADER polarity scores of one sentence.

    The result is the dictionary produced by
    SentimentIntensityAnalyzer.polarity_scores(sentence).
    """
    # Building the analyzer is the expensive part of a call, so create it on
    # first use and keep it on the function object for every later call.
    analyzer = getattr(nltk_sentiment, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        nltk_sentiment._analyzer = analyzer
    return analyzer.polarity_scores(sentence)


def get_compound(review_total_restaurant):
    """
    Average the compound sentiment score over a list of sentences.

    Parameters
    ----------
    review_total_restaurant : list of str
        Sentences to score, each passed to nltk_sentiment.

    Returns
    -------
    The mean of the "compound" score of the sentences, or 0 when the list is empty.
    """
    # No sentences means there is nothing to score.
    if len(review_total_restaurant) == 0:
        return 0
    # Read the score by its key instead of by its position in the dictionary,
    # so the result does not depend on the order of the returned keys.
    compound_scores = [nltk_sentiment(sentence)["compound"] for sentence in review_total_restaurant]
    return mean(compound_scores)

def get_biz_id(reviews, biz_ids=None):
    """
    Build a one-column dataframe holding the business id of every individual review.

    Parameters
    ----------
    reviews : sequence of str
        One entry per restaurant; each entry is the string form of a Python list
        of review texts and is parsed with ast.literal_eval.
    biz_ids : sequence, optional
        Business id of each restaurant, in the same order as `reviews`.
        Defaults to the module-level `bus_id` series.

    Returns
    -------
    pandas.DataFrame
        A single column (named 0) in which each restaurant's id appears once for
        every review of that restaurant.
    """
    if biz_ids is None:
        biz_ids = bus_id
    repeated_ids = []
    for raw_reviews, biz in zip(reviews, biz_ids):
        n_reviews = len(ast.literal_eval(raw_reviews))
        # Repeat the id as list items; no string concatenation and re-splitting needed.
        repeated_ids.extend([biz] * n_reviews)
    return pd.DataFrame(repeated_ids)

# Parse each restaurant's stored review list (a string in the CSV) back into a
# Python list here, so this cell does not depend on a variable from a later cell.
total_rest = [ast.literal_eval(raw) for raw in reviews]

testing666 = pd.DataFrame(total_rest)
# one row per restaurant, one column per review

# NOTE(review): columns[1:] leaves the first review column (column 0) out of the
# joined text -- confirm that skipping it is intended.
testing667 = testing666[testing666.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
# join the remaining review columns of each row (restaurant) into one string

testing668 = pd.DataFrame(testing667).rename(columns={0: "review"})
# the combined reviews as a one-column dataframe named "review"

# Give both pieces a fresh 0..n-1 index so they line up row by row when concatenated.
bus_id = bus_id.reset_index(drop=True)
testing668 = testing668.reset_index(drop=True)

testing669 = pd.concat([testing668, bus_id], axis=1)

rew_tok = [sent_tokenize(joined) for joined in testing667]
# split each restaurant's combined review text into sentences for the
# sentiment analysis
x2 = [get_compound(sentences) for sentences in rew_tok]
# average sentiment score of each restaurant

compd = pd.DataFrame(x2).rename(columns={0: "Score"}).reset_index(drop=True)
# the sentiment result as a one-column dataframe named "Score"

df_review_w_score = pd.concat([testing669, compd], axis=1)
# one row per restaurant: combined review text, business id, and the average
# sentiment score of its reviews

In [None]:
# Strip punctuation from each restaurant's review text (every character that is
# neither a word character nor whitespace).
rev = [re.sub(r'[^\w\s]', '', raw_review) for raw_review in reviews]

# Nouns and adjectives/adverbs found in each restaurant's reviews.
review_noun = [get_noun(cleaned) for cleaned in rev]
review_adj = [get_adj_and_adv(cleaned) for cleaned in rev]

# Flatten the per-restaurant lists into one list of nouns and one of adjectives/adverbs.
review_noun_total = list(chain.from_iterable(review_noun))
review_adj_total = list(chain.from_iterable(review_adj))

# All word tokens, first per restaurant and then flattened.
text_token = [word_tokenize(cleaned) for cleaned in rev]
text_token_total = list(chain.from_iterable(text_token))


# Words to leave out of the noun and adjective lists.
exclude_noun = ['i','food']
exclude_adj = ['not','i','so','very','here','just','also','really','too','well','other','first','','restaurant','next','then','as','lo','as','again']
review_noun_total_re_stopwords = list(filter(lambda w: w not in exclude_noun, review_noun_total))

review_adh_total_re_stopwords = list(filter(lambda w: w not in exclude_adj, review_adj_total))
# Frequency distribution of the remaining nouns.
fd_n = nltk.FreqDist(review_noun_total_re_stopwords)

In [None]:
total_rest = [ast.literal_eval(raw) for raw in reviews]
# total_rest[i] is the list of review texts of restaurant i

# Split every review into sentences, because get_compound works on a list of sentences.
# NOTE(review): total_review_token was not defined anywhere in the notebook; this
# reconstruction (one sentence list per review) is inferred from how it is used -- confirm.
total_review_token = [[sent_tokenize(review_text) for review_text in rest] for rest in total_rest]
total_score_compound = [[get_compound(sentences) for sentences in rest] for rest in total_review_token]
# average compound score of every single review, grouped by restaurant

testing26 = list(itertools.chain.from_iterable(total_rest))
# all review texts in one flat list
testing25 = list(itertools.chain.from_iterable(total_score_compound))
# all review scores in one flat list, in the same order as the texts
review_df = pd.DataFrame(testing26).rename(columns={0: "Review"})
score_df = pd.DataFrame(testing25).rename(columns={0: "Compound Value"})

testing1001 = total['biz_id']
# the business id of each restaurant
split_mul_string = [[biz] * len(rest) for rest, biz in zip(total_rest, testing1001)]
# each business id repeated once per review of that restaurant, so ids and
# reviews have the same length
sp_mul_str = list(itertools.chain.from_iterable(split_mul_string))
# the repeated ids in one flat list
sp_mul_str_df = pd.DataFrame(sp_mul_str).rename(columns={0: "biz_id"})
# the business ids as a dataframe

# Fresh 0..n-1 indexes so the pieces line up row by row when concatenated.
score_df = score_df.reset_index(drop=True)
review_df = review_df.reset_index(drop=True)
df_y = pd.concat([score_df, review_df], axis=1).reset_index(drop=True)
sp_mul_str_df = sp_mul_str_df.reset_index(drop=True)
df_x = pd.concat([review_df, sp_mul_str_df], axis=1)

In [None]:
# Load the scraped table (the first CSV column is used as the index) and take its attributes column.
everything = pd.read_csv("/Users/bckou/Documents/everything.csv", index_col=0)
col_attr = everything.loc[:, "attributes"]

# Turn the stored attribute string of one restaurant into a one-row dataframe whose columns are the attribute names.
def format_attribute_for_list(num,attr):
    """
    Parse attr[num] and return it as a dataframe with a single row of values.

    attr[num] is the string form of either a bare ("biz_id", value) pair or a
    list of (name, value) pairs. In both cases the names become the column
    labels and the values form the one remaining row.
    """
    parsed = ast.literal_eval(attr[num])
    if parsed[0] == "biz_id":
        # Bare pair: a single column whose first entry is the name "biz_id".
        frame = pd.DataFrame(parsed)
    else:
        # List of pairs: transpose so that the names sit in the first row.
        frame = pd.DataFrame(parsed).T
    # Promote the first row to column labels and keep only what is below it.
    frame.columns = frame.iloc[0]
    return frame.iloc[1:]

# Build one single-row dataframe of attributes per restaurant. The number of rows
# comes from the data itself, so the cell also works when the file does not hold
# exactly 1000 restaurants.
list_attribute = [format_attribute_for_list(obs, col_attr) for obs in range(len(col_attr))]

# Stack all the single-row dataframes into one dataframe. pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x; an empty input gives an empty frame.
all_pd = pd.concat(list_attribute) if list_attribute else pd.DataFrame()

all_pd.to_csv(r'all_attributes.csv') # save the result into a csv file

In [None]:
# Load the three intermediate tables needed for the model data.
review = pd.read_csv("/Users/bckou/Documents/review+id+score.csv")
attributes = pd.read_csv("/Users/bckou/Documents/all_attributes.csv")
ny_rst = pd.read_csv("../Documents/ny_rst.csv")

# Keep the review count, price and id of each restaurant; the id column is renamed
# to biz_id so it matches the key of the other tables.
ny_more = ny_rst[["review_count","price","id"]].rename(columns={"id":"biz_id"})
# price_num is the length of the price string, or None where price is not a string.
ny_more["price_num"] = [len(p) if type(p) == str else None for p in ny_more["price"]]


# Join reviews, attributes and the extra restaurant fields on biz_id.
all_data = review.merge(attributes, on='biz_id')
model_dat = all_data.merge(ny_more, on='biz_id')

# Save the merged table as an Excel file.
model_dat.to_excel("model_dat.xlsx")

In [None]:
# Count the missing values in each column (the different attributes).

# Copy of the table in which every NaN is replaced by the string 'Missing'.
filled_model_dat = model_dat.replace(np.nan, 'Missing', regex=True)

# Number of missing values per column, without the entry for the first column.
aaa = pd.DataFrame(model_dat.isnull().sum(axis = 0))
aaa = aaa.drop(index=aaa.index[0])
aaa.columns = ["number of NA"]
aaa = aaa.rename_axis("attributes")
# Turn the attribute names into a regular column and sort by the missing count, smallest first.
bbb = aaa.reset_index()
ccc = bbb.sort_values("number of NA")
ccc

In [3]:
# Load the modelling table from an Excel file.
# NOTE(review): this reads model_dat1.xlsx, whereas the merge cell of this notebook
# writes model_dat.xlsx -- confirm which file is the intended modelling input.
model_dat = pd.read_excel("/Users/bckou/Documents/model_dat1.xlsx")

In [None]:
# explore the random forest model

import ast
from sklearn.model_selection import train_test_split

# Encode the categorical answers as numbers: Yes/Free -> 1, No/Paid -> 0.
new_model_dat = (
    model_dat
    .replace("Yes", 1)
    .replace("Free", 1)
    .replace("No", 0)
    .replace("Paid", 0)
)

# Remaining missing values are treated as 0.
new_model_dat = new_model_dat.fillna(value = 0)

X = new_model_dat[["Takes Reservations","Accepts Credit Cards","Take-out","Accepts Apple Pay",
                   "Delivery","Has TV","Good for Kids","Outdoor Seating","Good for Groups","Wi-Fi",
                   "Score","price_num","review_count","Bike Parking"]].values
# Cast to float rather than int: an integer cast truncates towards zero and would
# wipe out fractional feature values (presumably "Score", an averaged sentiment value).
X = X.astype(float)
# NOTE(review): the target is selected by position (column 53) -- confirm which
# column of model_dat that is.
y = new_model_dat.iloc[:, 53].values

# divide the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Training the Algorithm
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def random_forest_regression(num):
    """
    Fit a random forest with `num` trees on the training split and score it on
    the test split.

    Uses the module-level X_train, y_train, X_test and y_test.
    Returns the list [MAE, MSE, RMSE] of the test-set predictions.
    """
    forest = RandomForestRegressor(n_estimators=num, random_state=0)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Error metrics of the predictions; RMSE is the square root of MSE.
    mae = metrics.mean_absolute_error(y_test, predictions)
    mse = metrics.mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    return [mae, mse, rmse]


In [None]:
# Evaluate forests with 1 to 99 trees and tabulate their errors next to the tree count.
model_result = pd.DataFrame(
    [random_forest_regression(num) for num in range(1,100)],
    columns=['MAE', 'MSE', "RMSE"],
)
model_result["num"] = list(range(1,100))

In [None]:
# Fit the forest with 75 trees and predict the test split.
regressor = RandomForestRegressor(n_estimators=75, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
y_pred = pd.DataFrame(y_pred)
y_pred = round(y_pred,1)
# Residuals must be taken against the held-out targets: the predictions were made
# for X_test, so they correspond row by row to y_test, not to the full y.
y_res = y_pred - pd.DataFrame(y_test)
# Plot from a frame whose column is actually called "y_pred", so that the
# aesthetic refers to a column of the plotted data.
(ggplot(y_pred.rename(columns={0: "y_pred"}))+ geom_histogram(aes("y_pred"),bins = 20))

In [None]:
# Residuals of the predictions against the held-out targets (y_test matches the
# rows of X_test that the predictions were made for; the full y does not).
y_res = y_pred - pd.DataFrame(y_test)
# Plot from a frame whose column is actually called "y_res", so that the
# aesthetic refers to a column of the plotted data.
(ggplot(y_res.rename(columns={0: "y_res"}))+ geom_histogram(aes("y_res"),bins = 10))


In [None]:
# Plot MAE, MSE and RMSE against the number of estimators to pick the best number of estimators.
new = model_result[["num",'MAE', 'MSE',"RMSE"]]

# One scatter plot per error metric, each in its own colour.
for metric_name, point_color in (("MAE", "red"), ("MSE", "green"), ("RMSE", "blue")):
    print(
        ggplot(new)+
        geom_point(aes(x = "num", y = metric_name), color = point_color))