In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Show the names of the entries directly inside the (relative) input directory.
print(os.listdir('../input'))

# Loading data

In [3]:
import pandas as pd
# Location of the Yelp business file inside the input directory.
business_json_path = '../input/yelp-dataset/yelp_academic_dataset_business.json'
# lines=True: the file is parsed as one JSON object per line.
df_b = pd.read_json(business_json_path, lines=True)

In [4]:
# Number of review rows requested per chunk.
size = 100000
# With `chunksize` set, read_json returns a reader that is iterated chunk by
# chunk in the loop below instead of one DataFrame for the whole file.
review = pd.read_json('../input/yelp-dataset/yelp_academic_dataset_review.json', lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)


# Process the review file one chunk at a time and collect the merged pieces.
chunk_list = []
for chunk_review in review:
    # Drop columns that aren't needed and rename `stars` to `review_stars` to
    # avoid a conflict with the business overall star rating in the merge.
    chunk_review = (
        chunk_review
        .drop(columns=['review_id', 'useful', 'funny', 'cool'])
        .rename(columns={'stars': 'review_stars'})
    )
    # Inner merge with the business frame so only reviews whose business_id
    # appears in df_b remain.
    chunk_merged = pd.merge(df_b, chunk_review, on='business_id', how='inner')
    # Show feedback on progress. The denominator is the actual number of rows
    # in this chunk, because a chunk (typically the last one) can hold fewer
    # than `size` rows.
    print(f"{chunk_merged.shape[0]} out of {chunk_review.shape[0]:,} related reviews")
    chunk_list.append(chunk_merged)
# After trimming down the review file, concatenate all relevant data back to one dataframe
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

# Exploratory Analysis

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
# Histogram of review star ratings; every row carries weight 1/N so the bar
# heights are fractions of all reviews, shown as percentages on the y axis.
plt.hist(
    df["review_stars"],
    weights=np.full(len(df), 1.0) / len(df),
)
plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.show()

We observed that over 40% of the reviews are 5-star, while 2-star reviews make up only around 8%. So we have to balance the number of reviews across star ratings. In other words, we have to take the same number of reviews for each star rating.

## Normalizing reviews:

In [8]:
# Balance the dataset by undersampling: keep every 2-star review and draw the
# same number of reviews from each of the other star ratings.
# Fixed seed so the drawn sample is the same on every run.
SAMPLE_SEED = 42

# This cell treats the 2-star reviews as the smallest class.
# NOTE(review): `.sample(n=...)` raises ValueError if any other star class has
# fewer rows than the 2-star class; confirm 2-star is the smallest.
reviews_smallest = df[df["review_stars"] == 2]
n_per_star = reviews_smallest.shape[0]

## sampling one star review:
reviews_1 = df[df["review_stars"] == 1]
reviews_one = reviews_1.sample(n=n_per_star, random_state=SAMPLE_SEED)

## sampling three star review:
reviews_3 = df[df["review_stars"] == 3]
reviews_three = reviews_3.sample(n=n_per_star, random_state=SAMPLE_SEED)

## sampling four star review:
reviews_4 = df[df["review_stars"] == 4]
reviews_four = reviews_4.sample(n=n_per_star, random_state=SAMPLE_SEED)

## sampling five star review:
reviews_5 = df[df["review_stars"] == 5]
reviews_five = reviews_5.sample(n=n_per_star, random_state=SAMPLE_SEED)

# pd.concat replaces DataFrame.append, which is deprecated and removed in
# pandas 2.0. Row order and the original index labels are kept as before.
uniformed_review = pd.concat(
    [reviews_smallest, reviews_one, reviews_three, reviews_four, reviews_five]
)
uniformed_review.shape

So, after normalization (balancing the star classes), we have around 3.5 million reviews.

In [9]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
# Histogram of star ratings in the balanced sample; every row carries weight
# 1/N so the bar heights are fractions, shown as percentages on the y axis.
plt.hist(
    uniformed_review["review_stars"],
    weights=np.full(len(uniformed_review), 1.0) / len(uniformed_review),
)
plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.show()

# Data preprocessing

At this stage I preprocess the review texts using TensorFlow and Keras.

In [10]:
# Model inputs (review texts) and targets (star ratings) as arrays.
X = uniformed_review['text'].values
y = uniformed_review['review_stars'].values

In [None]:
#from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
#tk = Tokenizer(lower = True)
#tk.fit_on_texts(X)
#X_seq = tk.texts_to_sequences(X)
#X_pad = pad_sequences(X_seq, maxlen=100, padding='post')

In [13]:
# Create the Keras tokenizer (lower=True lowercases the texts). This cell only
# constructs it; fitting and conversion of the texts happen in a later cell.
import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True)

In [None]:
# Build the tokenizer's vocabulary from the review texts in X.
tokenizer.fit_on_texts(X)
# Convert every review text into a sequence of integer word indices.
X_seq = tokenizer.texts_to_sequences(X)

In [None]:
# Reference: the Tokenizer constructor with its arguments spelled out.
# The `**kwargs` placeholder copied from the documented signature was removed:
# no `kwargs` variable is defined in the visible cells, so the call raised
# NameError. The resulting object is not assigned; it is only displayed.
tf.keras.preprocessing.text.Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', char_level=False, oov_token=None,
    document_count=0
)