In [1]:
#!unzip /content/drive/MyDrive/MyShares/amazon_food_reviews/amazonfoodreviews.zip -d /content/drive/MyDrive/MyShares/amazon_food_reviews

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import tensorflow_hub as hub
import os
import time

In [3]:
# Load the full reviews CSV from Drive, then summarise its columns, dtypes and non-null counts.
data = pd.read_csv('/content/drive/MyDrive/MyShares/amazon_food_reviews/Reviews.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [4]:
# Only two columns are needed: 'Text' as the feature and 'Score' as the label.

data = data[['Text' , 'Score']]
# Shuffle every row with a fixed seed so the positional split that follows is random but reproducible.
data = data.sample(frac=1 , random_state=42).reset_index(drop=True)

In [5]:
# Hold out the first 10% of the (already shuffled) rows as the test set;
# the remaining 90% becomes the training set.
test_split_size = int(len(data['Text']) * 0.1)
test_data = data.iloc[:test_split_size , :]
train_data = data.iloc[test_split_size: , :]
print(test_data.head())

                                                Text  Score
0  Having tried a couple of other brands of glute...      5
1  My cat loves these treats. If ever I can't fin...      5
2  A little less than I expected.  It tends to ha...      3
3  First there was Frosted Mini-Wheats, in origin...      2
4  and I want to congratulate the graphic artist ...      5


In [6]:
def process_data(dataF, num_samples = 100):
  """Turn the leading rows of a reviews frame into texts and one-hot labels.

  Args:
    dataF: DataFrame with a 'Text' column (review body) and a 'Score'
      column (numeric star rating).
    num_samples: number of leading rows to use; None uses every row.

  Returns:
    texts: 1-D object array of ASCII-encoded bytes, one per review
      (characters outside ASCII are replaced with b'?').
    labels: int array of shape (n, 3), one-hot over the columns
      [negative, neutral, positive], where Score >= 4 is positive,
      Score == 3 is neutral and everything else is negative.
  """
  dataF = dataF[:num_samples]

  # Force plain ASCII bytes; anything that cannot be encoded becomes '?'.
  texts = [str(i).encode('ascii' , 'replace') for i in dataF['Text'].tolist()]
  texts = np.array(texts , dtype=object)

  # Map each score straight to its label column: 0 = negative, 1 = neutral, 2 = positive.
  class_ids = np.array(
      [2 if i>=4 else 1 if i==3 else 0 for i in dataF['Score'].tolist()],
      dtype=int)

  # Index an identity matrix instead of calling pd.get_dummies: get_dummies only
  # emits columns for classes present in the slice, so a small or skewed sample
  # would yield fewer than 3 columns and stop matching the 3-unit output layer.
  labels = np.eye(3 , dtype=int)[class_ids]

  return texts , labels

In [7]:
# Quick check of process_data using its default sample size (100 rows).
texts , labels = process_data(train_data)

print(len(texts))
print(texts.shape)

100
(100,)


# Building the classification model using TF Hub

In [8]:
def get_model():
  """Assemble and compile the 3-class review classifier.

  The network is a frozen TF-Hub NNLM sentence-embedding layer that takes raw
  strings, followed by a 16-unit ReLU layer and a 3-unit softmax output.
  It is compiled with Adam and categorical cross-entropy, and its summary is
  printed before the model is returned.
  """
  # Pre-trained text embedding; trainable=False keeps its weights fixed.
  embedding = hub.KerasLayer(
      "https://tfhub.dev/google/nnlm-en-dim50/2",
      input_shape=[],
      dtype=tf.string,
      trainable=False,
      name='input',
  )

  classifier = tf.keras.Sequential([
      embedding,
      tf.keras.layers.Dense(16, activation='relu'),
      tf.keras.layers.Dense(3, activation='softmax', name='out'),
  ])

  classifier.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy'],
  )
  classifier.summary()

  return classifier


In [9]:
# Build one model here just to view its summary; train() constructs its own fresh model.
model = get_model()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (KerasLayer)           (None, 50)                48190600  
_________________________________________________________________
dense (Dense)                (None, 16)                816       
_________________________________________________________________
out (Dense)                  (None, 3)                 51        
Total params: 48,191,467
Trainable params: 867
Non-trainable params: 48,190,600
_________________________________________________________________


In [10]:
# Sanity check: load the same NNLM module directly and embed two sentences
# (the output shows one 50-dimensional vector per sentence).
embed = hub.load("https://tfhub.dev/google/nnlm-en-dim50/2")
embeddings = embed(["cat is on the mat", "dog is in the fog"])
embeddings





<tf.Tensor: shape=(2, 50), dtype=float32, numpy=
array([[ 0.16589954,  0.0254965 ,  0.1574857 ,  0.17688066,  0.02911299,
        -0.03092718,  0.19445257, -0.05709129, -0.08631689, -0.04391516,
         0.13032274,  0.10905275, -0.08515751,  0.01056632, -0.17220995,
        -0.17925954,  0.19556305,  0.0802278 , -0.03247919, -0.49176937,
        -0.07767699, -0.03160921, -0.13952136,  0.05959712,  0.06858718,
         0.22386682, -0.16653948,  0.19412343, -0.05491862,  0.10997339,
        -0.15811177, -0.02576607, -0.07910853, -0.258499  , -0.04206644,
        -0.20052543,  0.1705603 , -0.15314153,  0.0039225 , -0.28694248,
         0.02468278,  0.11069503,  0.03733957,  0.01433943, -0.11048374,
         0.11931834, -0.11552787, -0.11110869,  0.02384969, -0.07074881],
       [ 0.1437864 ,  0.08291595,  0.10897306,  0.04464385, -0.03630389,
        -0.12605834,  0.20263346,  0.12862863, -0.07873426, -0.01195358,
         0.0020956 , -0.03080653, -0.08019945, -0.18797135, -0.11973457,
 

# train and export the model

In [11]:
# Training arrays: the first 100,000 rows of the training split.
x_train , y_train = process_data(train_data , num_samples=100000)

# Validation arrays: the first 10,000 rows of the held-out test split.
x_val , y_val = process_data(test_data , num_samples=10000)

In [12]:
def train():
  """Build a fresh model, fit it on the notebook-level arrays and return it.

  Reads x_train, y_train, x_val and y_val from the enclosing (notebook) scope.
  During training, the model with the lowest validation loss so far is
  checkpointed to Drive.

  Returns:
    The fitted model as it stands after the final epoch; the best-by-val_loss
    version is the one written to the checkpoint path.
  """

  model = get_model()

  # The flag is `save_best_only`. The earlier `save_best_model` spelling is not a
  # ModelCheckpoint argument, so the checkpoint was not restricted to the best epoch.
  checkpoint = tf.keras.callbacks.ModelCheckpoint(
      '/content/drive/MyDrive/MyShares/amazon_food_reviews/model_ckp',
      monitor = 'val_loss' , save_best_only = True,
      save_weights_only = False , verbose = 1 , mode= 'min')

  model.fit(x_train , y_train , verbose = 1 ,
            epochs = 5 ,
            batch_size=128 ,
            validation_data=(x_val , y_val),
            callbacks = [checkpoint]
            )
  return model

In [None]:

def export_model(model , base_path ='/content/drive/MyDrive/MyShares/amazon_food_reviews'):
  """Write `model` as a SavedModel into a new timestamp-named folder under `base_path`.

  The folder name is the current Unix time in whole seconds, so each export
  lands in its own directory.
  """
  version_dir = str(int(time.time()))
  export_dir = os.path.join(base_path, version_dir)
  tf.saved_model.save(model, export_dir)
   
# Train and export only when this file is executed directly (this includes running it
# as a notebook, where __name__ is '__main__'); importing it as a module skips this block.
if __name__ == '__main__':
  model = train()
  export_model(model)


# Test the model

In [14]:
# Spot-check the trained model on a few short phrases; each printed row holds the
# three softmax class probabilities for one phrase.
# NOTE(review): column order is presumably [negative, neutral, positive], following the
# one-hot encoding built in process_data — confirm.
sents = ['terrifying match','awesome movie' , 'happy holiday']
for sent in sents:
  print(model.predict([sent]))

[[0.47613546 0.10083809 0.42302647]]
[[0.03824788 0.02393877 0.93781334]]
[[0.01130668 0.00770449 0.9809888 ]]
