In [1]:
import json
import csv
import pandas as pd
from sklearn import preprocessing
import numpy as np
import os
from collections.abc import Sequence
import matplotlib.pyplot as plt
import sklearn.feature_extraction.text as sk_text

In [8]:
### HELPER FUNCTIONS ###

# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

In [24]:
#  tsv output file for cleaned up business json dataset
outfile = open("businesses.tsv", 'w')
businessfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL) 
businessfile.writerow(['business_id','name', 'stars', 'category'])

# Opens json dataset from path. 
with open('../yelp_dataset/yelp_academic_dataset_business.json', encoding="utf-8") as f: 
    for line in f:
        row = json.loads(line)
        # Only getting businesses with review count over 20
        if row['review_count'] > 20:
            # some special char must be encoded in 'utf-8' 
            businessfile.writerow([row['business_id'], row['name'], row['stars'], row['categories']])

# Close tsv file
outfile.close()

# Create pandas dataframe tsv output file
df = pd.read_csv('businesses.tsv', delimiter ="\t", encoding="utf-8")

In [12]:
# Hashset for business_ids of businesses used for project
# Will use this to search for the businesses chosen for project in the reviews json dataset
# and add it to reviews tsv
chosen_business = {'YZeUH6zYS0dq5QHLYZhUnQ', 'oiAlXZPIFm2nBCt0DHLu_Q', 'fNil19SUfPAPnLQrYnFrGQ', 'JjcJVqhZXhP4tvOhg3fnag'}


#  tsv output file for cleaned up review json dataset
outfile = open("review_stars.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL) 
sfile.writerow(['business_id','stars', 'text'])

# Opens json dataset from path. 
with open('../yelp_dataset/yelp_academic_dataset_review.json', encoding="utf-8") as f: 
    for line in f:
        row = json.loads(line)
        # If statement to look for our chosen businesses
        if row['business_id'] in chosen_business: 
            # some special char must be encoded in 'utf-8' 
            sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])

# Close tsv file
outfile.close()

# Create pandas dataframe tsv output file
df = pd.read_csv('review_stars.tsv', delimiter ="\t", encoding="utf-8")

print(df)

# One hot encoding for the 4 businesses chosen
encode_text_index(df,"business_id")

print(df)

                business_id  stars  \
0    oiAlXZPIFm2nBCt0DHLu_Q    5.0   
1    oiAlXZPIFm2nBCt0DHLu_Q    1.0   
2    oiAlXZPIFm2nBCt0DHLu_Q    5.0   
3    oiAlXZPIFm2nBCt0DHLu_Q    1.0   
4    oiAlXZPIFm2nBCt0DHLu_Q    5.0   
..                      ...    ...   
543  YZeUH6zYS0dq5QHLYZhUnQ    1.0   
544  YZeUH6zYS0dq5QHLYZhUnQ    1.0   
545  JjcJVqhZXhP4tvOhg3fnag    5.0   
546  JjcJVqhZXhP4tvOhg3fnag    1.0   
547  YZeUH6zYS0dq5QHLYZhUnQ    3.0   

                                                  text  
0    b"I've been coming to this dry cleaner for alm...  
1    b'They lost 2 pairs of my suitpants and told m...  
2    b"I have been going to this dry cleaning since...  
3    b'I\'ve only had my dry cleaning done here twi...  
4    b'After reading the reviews of the cleaners cl...  
..                                                 ...  
543  b'I hate to give a one star, but this place ne...  
544  b'I LOVE HOOTERS! When I came to this particul...  
545  b'Priced out purchasing o

In [23]:
#text pre processing to seperate text and stars for seperate businesses
business_0 = df.loc[df.business_id == 0,['text','stars']]
business_1 = df.loc[df.business_id == 1,['text','stars']]
business_2 = df.loc[df.business_id == 2,['text','stars']]
business_3 = df.loc[df.business_id == 3,['text','stars']]

#create a list of reviews from each business 
business_0_reviews = business_0['text'].to_list()
business_1_reviews = business_0['text'].to_list()
business_2_reviews = business_0['text'].to_list()
business_3_reviews = business_0['text'].to_list()

vectorizer = sk_text.TfidfVectorizer(
                             stop_words='english',
                             max_features = 1000,
                             min_df=1)


#TfIdfVectorizer: transforms text into a "sparse matrix" where rows are text and columns are words, and values are the tf-dif values.
matrix = vectorizer.fit_transform(business_0_reviews)

print(type(matrix))          # Compressed Sparse Row matrix
print(matrix.toarray())        #  convert it to numpy array

print(vectorizer.get_feature_names())
print(matrix.shape)

#print(business_0)
#print(business_1)
#print(business_2)
#print(business_3)


<class 'scipy.sparse.csr.csr_matrix'>
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.12934847 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
['00', '00pm', '10', '100', '1000', '10am', '11', '11am', '12', '13', '14', '15', '150', '1pm', '1st', '20', '200', '2200', '24', '25', '2nd', '2yr', '30', '300', '30am', '30pm', '3pm', '40', '40ga', '40gal', '45', '4pm', '4th', '50', '50am', '50gal', '55', '55a', '625', '675', '700', '750', '7am', '800', '900', '9a', '9am', 'able', 'absolutely', 'accurate', 'action', 'actually', 'additional', 'address', 'advance', 'advantage', 'advice', 'advised', 'affordable', 'afternoon', 'aging', 'ago', 'agreed', 'air', 'amaz