# Exploratory Data Analysis on the final Yelp review data

In [1]:
# standard library
import codecs
import os
import re
import sys
import warnings
from pathlib import Path

# 3rd party
import googlemaps as gmaps
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import datetime as dt
from google.cloud import storage
from google.cloud import language_v1
from google.cloud.language_v1 import enums

import matplotlib.pyplot as plt

In [2]:
# ---- project configuration ----
# Cloud Storage settings (placeholder values)
URI_PREFIX = "gs://"
BUCKET_NAME = "YOUR-BUCKET"
INPUT_BUCKET_PREFIX = "YOUR/DIRECTORIES/"
OUTPUT_BUCKET_PATH = "YOUR/DIRECTORIES/"

# parent of the current working directory
ROOT_DIR = Path.cwd().parent

In [3]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits
# to imported project modules are expected to be picked up without a kernel
# restart.
%load_ext autoreload
%autoreload 2

# add src library to module path
# (ROOT_DIR is the parent of the working directory; presumably the `src`
# package imported in the next cell lives there -- confirm)
sys.path.append(str(ROOT_DIR))

In [4]:
# import modules from src
from src.utils import to_snake_case, clean_byte_unicode_chars, fix_encoding

# import secrets
from creds import API_KEY

In [5]:
# read in the business data from GCS
# NOTE(review): the path is spelled out literally. It equals
# URI_PREFIX + BUCKET_NAME + '/' + INPUT_BUCKET_PREFIX + 'final_data' from the
# config cell, but this call does not use those constants.
# NOTE(review): the preview of `df` shows 'Unnamed: 0' / 'Unnamed: 0.1'
# columns -- presumably index columns written out by earlier CSV exports;
# confirm before relying on them.
df = pd.read_csv('gs://YOUR-BUCKET/YOUR/DIRECTORIES/final_data')

In [6]:
# report the frame's dimensions (shape is a (rows, columns) pair)
print('The business data has {} rows and {} columns.'.format(*df.shape))

The business data has 489834 rows and 16 columns.


In [7]:
# preview the first rows of the loaded frame (five rows appear in the output below)
df.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,review_count,categories,address,city,state,postal_code,latitude,longitude,stars,review_id,user_id,text,useful
0,0,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,170,"Sushi Bars, Restaurants, Japanese","10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,F7POrJsNbhu493DSTMPXjw,nsS4oDfOsl20QdWc6XcOkw,"Husband was craving Chicken Teriyaki & gyoza, ...",1.0
1,1,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,170,"Sushi Bars, Restaurants, Japanese","10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,c-6aA9Bd7JxpmMroRoas9A,bK4Y_GZUoAUTXIrmeEUGYw,Went there Saturday noon they open at 12pm but...,0.0
2,2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,170,"Sushi Bars, Restaurants, Japanese","10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,P_MV_lMQjLIO9R7M_KIuMg,sbbB6Y4yIZIInvt0LFYYew,Went to this restaurant on Saturday as I had a...,1.0
3,4,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,170,"Sushi Bars, Restaurants, Japanese","10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,GiBx0zRVxC1Bu-9R-MMF8w,XSz4svDvgOCRO92V_3ajWg,"Well first time I\'ve really had ""authentic"" J...",0.0
4,6,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,170,"Sushi Bars, Restaurants, Japanese","10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,g0gMjs8Q1Dl5AxrM8QMiiA,7YeRMeb_cppMjjHAhT_k9A,First of all it must be said that this is a JA...,4.0


In [24]:
def fix_encoding(text):
    """
    Decode backslash escapes in ``text`` and pass the result through ftfy.

    NOTE: defining this function here rebinds the ``fix_encoding`` name that
    is imported from ``src.utils`` earlier in the notebook.

    Args:
      text: The value to clean. ``str`` and bytes-like values are decoded with
        the ``unicode_escape`` codec and then handed to ``ftfy.fix_text``.
        Any other value (``None``, the float ``NaN`` that pandas uses for
        missing text, numbers, ...) is returned unchanged.

    Returns:
      The cleaned string, or ``text`` itself when it is not textual.
    """
    # A plain ``is not None`` check lets float NaN (missing values in a pandas
    # text column) reach codecs.decode, which rejects non-text input. Pass
    # anything that is not text straight through instead.
    if not isinstance(text, (str, bytes, bytearray, memoryview)):
        return text

    # ftfy is not imported in the notebook's import cell; import it here so
    # this cell also works on a freshly restarted kernel.
    import ftfy

    decoded = codecs.decode(text, 'unicode_escape')
    return ftfy.fix_text(decoded)

In [9]:
# let pandas render up to 60 columns so no column of the wide frame is elided
pd.options.display.max_columns = 60
# show only the first record
df.head(1)

Unnamed: 0.1,Unnamed: 0,business_id,name,review_count,categories,address,city,state,postal_code,latitude,longitude,stars,review_id,user_id,text,useful
0,0,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,170,"Sushi Bars, Restaurants, Japanese","10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,F7POrJsNbhu493DSTMPXjw,nsS4oDfOsl20QdWc6XcOkw,"Husband was craving Chicken Teriyaki & gyoza, ...",1.0


#### Check the sentiment

In [24]:
# Sample review text used by the Natural Language API cells that follow.
# NOTE: this name shadows the builtin `str`; the later cells reference it
# under this name, so it is kept as-is.
str = (
    'High marks for new mexican grill.  '
    'Very low key place.  '
    'They dont even have a sign out front.  '
    'However the food delivers.  '
    'I have had several things on the menu and they were all solid.  '
    'Sauces are over the top good.  '
    'Service is sometimes a little hit or miss, but always friendly.  '
    'Love the salsa bar'
)

In [27]:
# Imports the Google Cloud client library
# NOTE: `enums` is re-imported here from google.cloud.language, rebinding the
# name that the first cell imported from google.cloud.language_v1.
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# Instantiates a client
client = language.LanguageServiceClient()

# The text to analyze
# (`str` is the sample review assigned in the previous cell, not the builtin type)
text = str
document = types.Document(
    content=text,
    type=enums.Document.Type.PLAIN_TEXT)

# Detects the sentiment of the text
sentiment = client.analyze_sentiment(document=document).document_sentiment

# Echo the analysed text, then the document-level score and magnitude
print('Text: {}'.format(text))
print('Sentiment score: {}, Sentiment Magnitude: {}'.format(sentiment.score, sentiment.magnitude))

Text: High marks for new mexican grill.  Very low key place.  They dont even have a sign out front.  However the food delivers.  I have had several things on the menu and they were all solid.  Sauces are over the top good.  Service is sometimes a little hit or miss, but always friendly.  Love the salsa bar
Sentiment score: 0.4000000059604645, Sentiment Magnitude: 5.099999904632568


#### Classifying content

In [23]:
def sample_classify_text(text_content):
    """
    Classify a string into content categories and print each result.

    The text is submitted as an English plain-text document; for every
    category in the response the category name and its confidence are
    printed. Nothing is returned.

    Args:
      text_content The text content to analyze. Must include at least 20 words.
    """
    api_client = language_v1.LanguageServiceClient()

    # Request payload. The document type is PLAIN_TEXT (HTML is the other
    # available type). The language is optional -- when omitted it is
    # detected automatically -- but is pinned to English here.
    # Supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    request_document = {
        "content": text_content,
        "type": enums.Document.Type.PLAIN_TEXT,
        "language": "en",
    }

    result = api_client.classify_text(request_document)

    # Each category carries its name within the predefined taxonomy
    # (https://cloud.google.com/natural-language/docs/categories) and a
    # confidence: how certain the classifier is that the category
    # represents the provided text.
    for cat in result.categories:
        print(u"Category name: {}".format(cat.name))
        print(u"Confidence: {}".format(cat.confidence))


In [28]:
# classify the sample review (`str` is the review text assigned earlier, not the builtin type)
sample_classify_text(str)

Category name: /Food & Drink/Restaurants
Confidence: 0.8899999856948853


#### Analyzing entities

In [29]:
def sample_analyze_entities(text_content):
    """
    Run entity analysis on a string and print everything that comes back.

    The text is submitted as an English plain-text document with UTF8
    offsets. For each entity the name, type, salience, metadata and mentions
    are printed, followed by the language reported for the text. Nothing is
    returned.

    Args:
      text_content The text content to analyze
    """
    api_client = language_v1.LanguageServiceClient()

    # Request payload. The document type is PLAIN_TEXT (HTML is the other
    # available type). The language is optional -- when omitted it is
    # detected automatically -- but is pinned to English here.
    # Supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    request_document = {
        "content": text_content,
        "type": enums.Document.Type.PLAIN_TEXT,
        "language": "en",
    }

    # Encoding choices are NONE, UTF8, UTF16 and UTF32.
    result = api_client.analyze_entities(
        request_document, encoding_type=enums.EncodingType.UTF8
    )

    for ent in result.entities:
        print(u"Representative name for the entity: {}".format(ent.name))

        # Translate the numeric entity type into its enum name
        # (PERSON, LOCATION, ADDRESS, NUMBER, ...).
        entity_type_name = enums.Entity.Type(ent.type).name
        print(u"Entity type: {}".format(entity_type_name))

        # Salience lies in the [0, 1.0] range.
        print(u"Salience score: {}".format(ent.salience))

        # Metadata for many known entities is a Wikipedia URL (wikipedia_url)
        # and a Knowledge Graph MID (mid); some entity types carry more,
        # e.g. street_name or postal_code for ADDRESS entities.
        for key, value in ent.metadata.items():
            print(u"{}: {}".format(key, value))

        # Every mention of this entity in the input document; the API
        # currently supports proper noun mentions.
        for mention in ent.mentions:
            print(u"Mention text: {}".format(mention.text.content))
            # Mention type as an enum name, e.g. PROPER for a proper noun.
            mention_type_name = enums.EntityMention.Type(mention.type).name
            print(u"Mention type: {}".format(mention_type_name))

    # The response language is the one specified in the request or, if none
    # was specified, the automatically-detected one.
    print(u"Language of the text: {}".format(result.language))


In [30]:
# run entity analysis on the sample review (`str` is the review text assigned earlier, not the builtin type)
sample_analyze_entities(str)

Representative name for the entity: marks
Entity type: OTHER
Salience score: 0.2614773213863373
Mention text: marks
Mention type: COMMON
Representative name for the entity: grill
Entity type: CONSUMER_GOOD
Salience score: 0.2614773213863373
Mention text: grill
Mention type: COMMON
Representative name for the entity: Service
Entity type: OTHER
Salience score: 0.16536474227905273
Mention text: Service
Mention type: COMMON
Mention text: hit
Mention type: COMMON
Representative name for the entity: food
Entity type: OTHER
Salience score: 0.06001187488436699
Mention text: food
Mention type: COMMON
Representative name for the entity: place
Entity type: OTHER
Salience score: 0.051742829382419586
Mention text: place
Mention type: COMMON
Representative name for the entity: things
Entity type: OTHER
Salience score: 0.03790085390210152
Mention text: things
Mention type: COMMON
Representative name for the entity: front
Entity type: OTHER
Salience score: 0.034948043525218964
Mention text: front
Ment