In [2]:
import json
import csv
import pandas as pd
from sklearn import preprocessing
import numpy as np
import os
from collections.abc import Sequence
import matplotlib.pyplot as plt
import sklearn.feature_extraction.text as sk_text

import tensorflow.keras 

from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from collections import Counter
from datetime import datetime


In [3]:
### HELPER FUNCTIONS ###

# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

In [4]:
#  tsv output file for cleaned up business json dataset
outfile = open("businesses.tsv", 'w')
businessfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL) 
businessfile.writerow(['business_id','name', 'stars', 'category'])

# Opens json dataset from path. 
with open('../yelp_dataset/yelp_academic_dataset_business.json', encoding="utf-8") as f: 
    for line in f:
        row = json.loads(line)
        # Only getting businesses with review count over 20
        if row['review_count'] > 20:
            # some special char must be encoded in 'utf-8' 
            businessfile.writerow([row['business_id'], row['name'], row['stars'], row['categories']])

# Close tsv file
outfile.close()

# Create pandas dataframe tsv output file
df = pd.read_csv('businesses.tsv', delimiter ="\t", encoding="utf-8")

In [5]:
# Hashset for business_ids of businesses used for project
# Will use this to search for the businesses chosen for project in the reviews json dataset
# and add it to reviews tsv
chosen_business = {'YZeUH6zYS0dq5QHLYZhUnQ', 'oiAlXZPIFm2nBCt0DHLu_Q', 'fNil19SUfPAPnLQrYnFrGQ', 'JjcJVqhZXhP4tvOhg3fnag'}


#  tsv output file for cleaned up review json dataset
outfile = open("review_stars.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL) 
sfile.writerow(['business_id','stars', 'text'])

# Opens json dataset from path. 
with open('../yelp_dataset/yelp_academic_dataset_review.json', encoding="utf-8") as f: 
    for line in f:
        row = json.loads(line)
        # If statement to look for our chosen businesses
        if row['business_id'] in chosen_business: 
            # some special char must be encoded in 'utf-8' 
            sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])

# Close tsv file
outfile.close()

# Create pandas dataframe tsv output file
df = pd.read_csv('review_stars.tsv', delimiter ="\t", encoding="utf-8")

print(df)

# One hot encoding for the 4 businesses chosen
encode_text_index(df,"business_id")

print(df)

                business_id  stars  \
0    oiAlXZPIFm2nBCt0DHLu_Q    5.0   
1    oiAlXZPIFm2nBCt0DHLu_Q    1.0   
2    oiAlXZPIFm2nBCt0DHLu_Q    5.0   
3    oiAlXZPIFm2nBCt0DHLu_Q    1.0   
4    oiAlXZPIFm2nBCt0DHLu_Q    5.0   
..                      ...    ...   
543  YZeUH6zYS0dq5QHLYZhUnQ    1.0   
544  YZeUH6zYS0dq5QHLYZhUnQ    1.0   
545  JjcJVqhZXhP4tvOhg3fnag    5.0   
546  JjcJVqhZXhP4tvOhg3fnag    1.0   
547  YZeUH6zYS0dq5QHLYZhUnQ    3.0   

                                                  text  
0    b"I've been coming to this dry cleaner for alm...  
1    b'They lost 2 pairs of my suitpants and told m...  
2    b"I have been going to this dry cleaning since...  
3    b'I\'ve only had my dry cleaning done here twi...  
4    b'After reading the reviews of the cleaners cl...  
..                                                 ...  
543  b'I hate to give a one star, but this place ne...  
544  b'I LOVE HOOTERS! When I came to this particul...  
545  b'Priced out purchasing o

In [52]:

#WE NEED TO DO THIS IN THE CSV FILE
#then send that into tensor flow
#!!!!!!
texts = df['text'].to_list()
stars = df['stars']

#apply one hot endocing on stars
stars = pd.get_dummies(stars, columns = ['stars'])
#print(texts)        
print(stars)



     1.0  2.0  3.0  4.0  5.0
0      0    0    0    0    1
1      1    0    0    0    0
2      0    0    0    0    1
3      1    0    0    0    0
4      0    0    0    0    1
..   ...  ...  ...  ...  ...
543    1    0    0    0    0
544    1    0    0    0    0
545    0    0    0    0    1
546    1    0    0    0    0
547    0    0    1    0    0

[548 rows x 5 columns]


In [43]:

#WE NEED TO DO THIS IN THE CSV FILE
#then send that into tensor flow
#!!!!!!

vectorizer = sk_text.TfidfVectorizer(
                             stop_words='english',
                             max_features = 1000,
                             min_df=1)
matrix = vectorizer.fit_transform(texts)

print(type(matrix))          # Compressed Sparse Row matrix
print(matrix.toarray())        #  convert it to numpy array

print(vectorizer.get_feature_names())
print(matrix.shape)
    


<class 'scipy.sparse.csr.csr_matrix'>
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['00', '10', '100', '10am', '11', '12', '13', '14', '15', '18', '1pm', '20', '24', '25', '2nd', '30', '30am', '40', '45', '4pm', '50', '700', '90', '99', '9am', 'able', 'absolutely', 'accommodate', 'accurate', 'actually', 'additional', 'advice', 'advised', 'affordable', 'afternoon', 'ago', 'agreed', 'allowed', 'alter', 'alterations', 'altered', 'amazing', 'anal', 'animal', 'animals', 'answer', 'answered', 'anymore', 'anyways', 'apologized', 'appetizer', 'appointment', 'appreciate', 'appreciated', 'area', 'areas', 'aren', 'arrived', 'ask', 'asked', 'asking', 'assistants', 'ate', 'attentive', 'attitude', 'available', 'average', 'away', 'awesome', 'awful', 'babies', 'baby', 'bad', 'bar', 'based', 'bcbg', 'beer', 'beers', 'believe', 'bentley', 'best', 'better', 'big', 'bit', 'blonde', 'blood', 'boar

In [53]:


x,y = to_xy(df,"stars")


ValueError: could not convert string to float: 'b"I\'ve been coming to this dry cleaner for almost 7 years. They are the best in town. I\'ve tried others... thought that I could find better... I can\'t! Great service and quality work! I also manage a luxury clothing store and I bring my stores garments there as well! I bring in simple alterations and all my dry cleaning/laundry needs!\\n\\nI am a customer for life!"'

In [None]:
model = Sequential()

model.add(Dense(25, input_dim=x.shape[1], activation='relu')) # Hidden 1     #  why input_dim=x.shape[1]?  
model.add(Dense(10, activation='relu')) # Hidden 2
model.add(Dense(1)) # Output

model.compile(loss='mean_squared_error', optimizer='adam')

model.fit(x,y,verbose=2,epochs=100)    # Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.