In [1]:
import json 
import csv 
import pandas as pd 
import sklearn as sk
import numpy as np
import io
from collections.abc import Sequence
import shutil
import os
import matplotlib.pyplot as plt
from sklearn import preprocessing
import sklearn.feature_extraction.text as sk_text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf



In [7]:


#Little GEMS functions
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)



# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low



outfile = open("business.tsv", 'w') 
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL) 
sfile.writerow(['business_id','stars', 'review_count']) 

with open('yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8', errors='ignore') as f: 
    i=0
    for line in f:  
        row = json.loads(line)
        if(row['review_count']>=20):
            i+=1
            sfile.writerow([row['business_id'], row['stars'], (row['review_count'])]) 
outfile.close() 
#loading data into dataframe using pandas
business = pd.read_csv('business.tsv',delimiter ="\t", encoding="utf-8")

#opens the files review stars and gets ready to implement
outfile = open("review_stars.tsv", 'w') 
#this will determine the outfile contents
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL) 
#The table coloumns
sfile.writerow(['business_id','stars', 'text']) 

with open('yelp_dataset/yelp_academic_dataset_review.json', encoding='utf-8', errors='ignore') as f: 
    i = 0
    for line in f:
        if( i < 300):
            i+=1
            row = json.loads(line)
            sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')]) 
            
outfile.close() 

#loading data into dataframe using pandas
reviews = pd.read_csv('review_stars.tsv', delimiter ="\t", encoding="utf-8")
#print(business.head())
#print(reviews.head())
business = pd.merge(business,reviews, on=['business_id','stars'])
business = business.drop(columns = 'review_count')
print(business)



df_review_agg = business.groupby('business_id')['text'].sum()  

df_ready_for_sklearn = pd.DataFrame({'business_id': df_review_agg.index, 'all_reviews': 
df_review_agg.values}) 

#lab 4
vectorizer = sk_text.TfidfVectorizer(
                             stop_words='english',
                             max_features = 1000,
                             max_df = 500,
                             min_df=2)

df_all_reviews = vectorizer.fit_transform(df_ready_for_sklearn['all_reviews'])
tfidf_data = df_all_reviews.toarray()
#print("Check this out! {}".format(tfidf_data.shape))
#print(vectorizer.get_feature_names())

#to xy x is vectorized data while y is stars
print(tfidf_data)

               business_id  stars  \
0   lyhNDfX8UatlRO5H3Kfccg    5.0   
1   ZfKHogPGqQpzgNFSFjfICw    5.0   
2   HxegWRjhi7m73mXRI8qQIg    4.0   
3   EmJOkTKIwgm7QJbls7hN3w    4.0   
4   bZiIIUcpgxh8mpKMDhdqbA    4.0   
5   vRrDTIW9IFBO4cc3laazUw    4.0   
6   SSdONoTe5UCJjfJ7ZsoMQg    4.0   
7   Oj5Seggqo_2FfKSjcAg7yw    3.0   
8   FQALxQa69loeORgzbIBBmw    3.0   
9   _sS2LBIGNT5NQb6PD1Vtjw    5.0   
10  PEnMU_He_qHoCfdoAKmjDQ    4.0   
11  W4BJPLCjghJy3_wXjUtQLQ    5.0   
12  Un6u2cECyV4nZb_HGZ-uTA    4.0   
13  xaTGgwLwFGopzr1VlpBuBw    4.0   
14  _gOz7-aHMyGUHOtjDrEv2w    2.0   
15  TA1KUSCu8GkWP9w0rmElxw    4.0   
16  TA1KUSCu8GkWP9w0rmElxw    4.0   
17  Irp5sgl7XASH5ZTw2D47qw    4.0   
18  bPmWDBkjBhV11Yk4BipG4Q    4.0   
19  J2NLhn_nMK4zeksEr6EE6Q    4.0   
20  g4CP3kgH1jTtMn-joxPT0A    4.0   
21  Pisoftb2bzA6OkqwppOTTA    2.0   
22  zmZ3HkVCeZPBefJJxzdJ7A    4.0   
23  bOkLeien1ra8x-7R9E8iYQ    3.0   
24  75HV-KqCtn_oHeiLiGlO_w    4.0   
25  i9BDFBYcl_PGqrLbQUdMvg    4.0   
2

In [8]:

#to_xy accepts a dataframe
#create a new df

final_df = pd.concat([pd.DataFrame(tfidf_data),business['stars']],axis = 1)
x,y = to_xy(final_df,'stars')
print(x)
print(y)


[[0.       0.       0.       ... 0.       0.       0.      ]
 [0.       0.       0.       ... 0.       0.       0.      ]
 [0.       0.       0.       ... 0.       0.       0.      ]
 ...
 [0.       0.       0.       ... 0.       0.211353 0.      ]
 [     nan      nan      nan ...      nan      nan      nan]
 [     nan      nan      nan ...      nan      nan      nan]]
[5. 5. 4. 4. 4. 4. 4. 3. 3. 5. 4. 5. 4. 4. 2. 4. 4. 4. 4. 4. 4. 2. 4. 3.
 4. 4. 4. 4. 4. 4. 5. 4. 4. 3. 4. 4. 4. 5. 3. 4. 4. 4. 4. 4. 3. 5. 5. 5.
 5. 4. 4.]


In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], activation='relu'))
model.add(Dense(1))


model.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')  


model.fit(x_train, y_train, validation_data=(x_test,y_test), callbacks=[monitor], verbose=2, epochs=100)  

pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

# print out prediction
print('Using relu and the optimizer adam')
df_y = pd.DataFrame(y_test, columns=['actual stars'])
df_pred = pd.DataFrame(pred, columns=['predicted stars'])
result = pd.concat([df_y, df_pred],axis=1)
result

Epoch 1/100
2/2 - 0s - loss: nan - val_loss: nan
Epoch 2/100
2/2 - 0s - loss: nan - val_loss: nan
Epoch 00002: early stopping


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], activation='sigmoid'))
model.add(Dense(1))


model.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')  


model.fit(x_train, y_train, validation_data=(x_test,y_test), callbacks=[monitor], verbose=2, epochs=100)  
#supress statement
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

# print out prediction
# print out prediction
print('Using sigmoid and the optimizer adam')
df_y = pd.DataFrame(y_test, columns=['actual stars'])
df_pred = pd.DataFrame(pred, columns=['predicted stars'])
result = pd.concat([df_y, df_pred],axis=1)
result

Epoch 1/100
1/1 - 0s - loss: 18.3727 - val_loss: 8.0474
Epoch 2/100
1/1 - 0s - loss: 18.2922 - val_loss: 7.9967
Epoch 3/100
1/1 - 0s - loss: 18.2120 - val_loss: 7.9463
Epoch 4/100
1/1 - 0s - loss: 18.1318 - val_loss: 7.8960
Epoch 5/100
1/1 - 0s - loss: 18.0519 - val_loss: 7.8460
Epoch 6/100
1/1 - 0s - loss: 17.9722 - val_loss: 7.7961
Epoch 7/100
1/1 - 0s - loss: 17.8927 - val_loss: 7.7464
Epoch 8/100
1/1 - 0s - loss: 17.8133 - val_loss: 7.6969
Epoch 9/100
1/1 - 0s - loss: 17.7342 - val_loss: 7.6476
Epoch 10/100
1/1 - 0s - loss: 17.6553 - val_loss: 7.5985
Epoch 11/100
1/1 - 0s - loss: 17.5766 - val_loss: 7.5496
Epoch 12/100
1/1 - 0s - loss: 17.4981 - val_loss: 7.5009
Epoch 13/100
1/1 - 0s - loss: 17.4198 - val_loss: 7.4524
Epoch 14/100
1/1 - 0s - loss: 17.3417 - val_loss: 7.4041
Epoch 15/100
1/1 - 0s - loss: 17.2638 - val_loss: 7.3560
Epoch 16/100
1/1 - 0s - loss: 17.1862 - val_loss: 7.3081
Epoch 17/100
1/1 - 0s - loss: 17.1088 - val_loss: 7.2604
Epoch 18/100
1/1 - 0s - loss: 17.0316 - 

Unnamed: 0,actual stars,predicted stars
0,2.0,1.153195
1,4.0,1.287784


In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], activation='tanh'))
model.add(Dense(1))


model.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')  


model.fit(x_train, y_train, validation_data=(x_test,y_test), callbacks=[monitor], verbose=2, epochs=100) 
#supress statement
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

# print out prediction
print('Using tanh and the optimizer adam')
df_y = pd.DataFrame(y_test, columns=['actual stars'])
df_pred = pd.DataFrame(pred, columns=['predicted stars'])
result = pd.concat([df_y, df_pred],axis=1)
result

Epoch 1/100
1/1 - 0s - loss: 16.3532 - val_loss: 24.0708
Epoch 2/100
1/1 - 0s - loss: 16.1968 - val_loss: 23.8851
Epoch 3/100
1/1 - 0s - loss: 16.0411 - val_loss: 23.7012
Epoch 4/100
1/1 - 0s - loss: 15.8861 - val_loss: 23.5184
Epoch 5/100
1/1 - 0s - loss: 15.7316 - val_loss: 23.3363
Epoch 6/100
1/1 - 0s - loss: 15.5779 - val_loss: 23.1549
Epoch 7/100
1/1 - 0s - loss: 15.4248 - val_loss: 22.9742
Epoch 8/100
1/1 - 0s - loss: 15.2723 - val_loss: 22.7940
Epoch 9/100
1/1 - 0s - loss: 15.1206 - val_loss: 22.6145
Epoch 10/100
1/1 - 0s - loss: 14.9696 - val_loss: 22.4357
Epoch 11/100
1/1 - 0s - loss: 14.8193 - val_loss: 22.2575
Epoch 12/100
1/1 - 0s - loss: 14.6697 - val_loss: 22.0801
Epoch 13/100
1/1 - 0s - loss: 14.5208 - val_loss: 21.9032
Epoch 14/100
1/1 - 0s - loss: 14.3727 - val_loss: 21.7271
Epoch 15/100
1/1 - 0s - loss: 14.2254 - val_loss: 21.5517
Epoch 16/100
1/1 - 0s - loss: 14.0788 - val_loss: 21.3770
Epoch 17/100
1/1 - 0s - loss: 13.9330 - val_loss: 21.2030
Epoch 18/100
1/1 - 0s -

Unnamed: 0,actual stars,predicted stars
0,5.0,1.078498
1,4.0,1.941893
