# NLP on tips for restaurant businesses

References:

https://www.kirenz.com/post/2021-12-11-text-mining-and-sentiment-analysis-with-nltk-and-pandas-in-python/text-mining-and-sentiment-analysis-with-nltk-and-pandas-in-python/

https://blog.ekbana.com/pre-processing-text-in-python-ad13ea544dae

In [47]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import os
import nltk

In [59]:
# BigQuery location of the tips data.
PROJECT_ID = "yelp-data-warehouse"
DATASET_ID = "yelp_dataset"
TIPS_TABLE_ID = "yelp_tips"

# Path to the service-account key file. The standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so the
# key location does not have to be edited into the notebook; the placeholder
# path is kept as the fallback when the variable is not set.
GOOGLE_APPLICATION_CREDENTIALS = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS", 'path/to/key.json'
)

# Shared BigQuery client used by the query cells below.
client = bigquery.Client.from_service_account_json(
    GOOGLE_APPLICATION_CREDENTIALS, project=PROJECT_ID
)

TIPS TABLE attributes: 'text', 'names' (business name), 'business_id', 'user_id'

In [42]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

# English stopword list used to filter tokens.
# NOTE: this rebinds the name `stopwords` (imported above as the corpus
# reader) to a plain list of words; the lookup deliberately goes through
# `nltk.corpus.stopwords` so the cell still works when re-run.
stopwords = nltk.corpus.stopwords.words("english")

# Tokenizer that keeps runs of word characters (letters, digits, underscore).
# Raw string: '\w' inside a normal string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
regexp = RegexpTokenizer(r'\w+')

# Sentiment scorer; polarity_scores() is called on each tip later on.
analyzer = SentimentIntensityAnalyzer()

# Lemmatizer applied to every remaining token.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
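# One-time NLTK resource downloads for the lemmatizer; uncomment and run
# them on a fresh environment where these resources are not installed yet.
# nltk.download('wordnet')
# nltk.download('omw-1.4')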

### 1. Given a business ID, score every tip tied to that business and output the sentiment counts

In [60]:
def _fetch_tips(business_id):
    """Return every row of the tips table whose ``business_id`` matches.

    The id is sent as a BigQuery query parameter instead of being spliced
    into the SQL text, so quotes or SQL syntax inside ``business_id`` cannot
    change the statement that is executed.
    """
    query = (
        f"SELECT * FROM {DATASET_ID}.{TIPS_TABLE_ID} "
        "WHERE business_id = @business_id"
    )
    print(query)

    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("business_id", "STRING", str(business_id)),
        ]
    )
    return client.query(query, job_config=job_config).to_dataframe()


def _label_sentiment(compound):
    """Map a compound score to 'positive' (> 0), 'neutral' (== 0) or 'negative'."""
    if compound > 0:
        return 'positive'
    if compound == 0:
        return 'neutral'
    return 'negative'


def analyse_by_biz(business_id):
    """Print the sentiment distribution of all tips for one business.

    Steps: fetch the tips, lowercase and tokenize the text, drop stopwords and
    purely numeric tokens, lemmatize, score each tip with ``analyzer`` and
    label it positive / neutral / negative from the compound score.

    Parameters
    ----------
    business_id : str
        Value matched against the ``business_id`` column of the tips table.

    Returns
    -------
    None
        The executed query and the per-label counts are printed.
    """
    result_df = _fetch_tips(business_id)

    # Missing tip text becomes an empty string so the tokenizer never sees NaN.
    result_df['text'] = result_df['text'].fillna('').astype(str).str.lower()

    # Tokenisation: split each tip into word tokens
    result_df['text_token'] = result_df['text'].apply(regexp.tokenize)

    # remove stop words, numbers (set built once for constant-time lookups)
    stopword_set = set(stopwords)
    result_df['text_token'] = result_df['text_token'].apply(
        lambda tokens: [
            tok for tok in tokens
            if tok not in stopword_set and not tok.isnumeric()
        ]
    )

    # lemmatize and join the tokens back into one string per tip
    result_df['text_lem'] = result_df['text_token'].apply(
        lambda tokens: " ".join(wordnet_lemmatizer.lemmatize(tok) for tok in tokens)
    )

    # sentiment analysis
    # NOTE(review): the scorer sees lowercased, stopword-stripped text, so
    # negations such as "not" have been removed before scoring - confirm this
    # is intended, or consider scoring the original text instead.
    result_df['polarity'] = result_df['text_lem'].apply(analyzer.polarity_scores)

    # Expand the score dicts into columns. Naming the columns explicitly keeps
    # 'compound' present even when the query returned no rows.
    score_columns = ['neg', 'neu', 'pos', 'compound']
    scores_df = pd.DataFrame(
        result_df['polarity'].tolist(),
        index=result_df.index,
        columns=score_columns,
    )
    result_df = pd.concat([result_df, scores_df], axis=1)

    result_df['sentiment'] = result_df['compound'].apply(_label_sentiment)

    print(result_df['sentiment'].value_counts())
    # return result_df

In [61]:
# Business id used to try out the analysis on a single business.
test_id = 'sPxAZRMzMsvwO2e8impPBA'

# Prints the executed query followed by the count of tips per sentiment label.
analyse_by_biz(test_id)

SELECT * FROM yelp_dataset.yelp_tips WHERE business_id = 'sPxAZRMzMsvwO2e8impPBA'
neutral     13
positive    13
negative     3
Name: sentiment, dtype: int64
