<a href="https://colab.research.google.com/github/IyadKhuder/NLP_BoW_TensorFlowHub/blob/main/NLP_BoW_TensorFlowHub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TensorFlow Hub - Text classification

- Based on: https://www.tensorflow.org/hub/tutorials/tf2_text_classification

https://playground.tensorflow.org/


# Importing the libraries

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
print('TensorFlow version: ', tf.__version__)
print('TensorFlow Hub version: ', hub.__version__)

TensorFlow version:  2.9.2
TensorFlow Hub version:  0.12.0


### a) *Importing Google-Drive relevant libraries*

In [2]:
# libraries for the files in google drive
from pydrive.auth import GoogleAuth
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

### b) *Connecting to Google-Drive and importing the dataset file*

In [3]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

download = drive.CreateFile({'id': '1CIAW4Ji9q1L8UUt2rnFkJt6qx3QlknyZ'})

# Download the file to a local disc
download.GetContentFile('Restaurant_Reviews.tsv')


### c) *Storing the dataset in a dataframe and verifying that we access it*

In [4]:
import pandas as pd

In [6]:
df  = pd.read_csv("Restaurant_Reviews.tsv", delimiter = '\t', quoting = 3)
print(df)

                                                Review  Liked
0                             Wow... Loved this place.      1
1                                   Crust is not good.      0
2            Not tasty and the texture was just nasty.      0
3    Stopped by during the late May bank holiday of...      1
4    The selection on the menu was great and so wer...      1
..                                                 ...    ...
995  I think food should have flavor and texture an...      0
996                           Appetite instantly gone.      0
997  Overall I was not impressed and would not go b...      0
998  The whole experience was underwhelming, and I ...      0
999  Then, as if I hadn't wasted enough of my life ...      0

[1000 rows x 2 columns]


### d) *Extracting the input and output fields*

In [7]:
X= df.iloc[:,0:-1].values
y= df.iloc[:,-1].values

In [8]:
X.shape

(1000, 1)

In [43]:
# Set the train:test percetage
# Percentage = 80% => size(X_train) = 800
n = 800

In [44]:
X_train = X[:n,:]
y_train = y[:n]
y_test = y[n:]
X_test = X[n:,:]

In [45]:
X_train.shape

(800, 1)

In [46]:
y_train.shape

(800,)

In [47]:
y_train = y_train.reshape(len(y_train),1 )
y_train.shape

(800, 1)

In [48]:
y_test.shape

(200,)

In [49]:
y_test = y_test.reshape(len(y_test),1 )
y_test.shape

(200, 1)

In [50]:
X_test.shape

(200, 1)

# Building and training the neural network

In [71]:
model_path = 'https://tfhub.dev/google/nnlm-en-dim50/2'

In [72]:
embedding_layer = hub.KerasLayer(model_path, input_shape = [], dtype = tf.string, trainable = True)

In [20]:
# input = tf.keras.layers.Input(shape=(), name="Input", dtype=tf.string)

In [73]:
embedding_layer(X_train[1,])

<tf.Tensor: shape=(1, 50), dtype=float32, numpy=
array([[ 0.14294183,  0.01486275,  0.01177659, -0.07886052,  0.11969235,
        -0.17994426,  0.24438062,  0.00459703, -0.12754944,  0.06723507,
        -0.09139995, -0.18797436, -0.03747191, -0.0242891 , -0.07398539,
         0.3253423 , -0.13087204, -0.11668985, -0.01679479, -0.45974058,
         0.16189638, -0.17808953, -0.00983213,  0.10641059, -0.20094255,
         0.01631247,  0.03566115, -0.03228062, -0.13913496, -0.01942133,
        -0.07761522,  0.30196995, -0.11239656, -0.09970577,  0.00156803,
         0.04806294,  0.04358976,  0.06202801,  0.15414013,  0.12325947,
         0.25816405,  0.07419002, -0.17233726, -0.0213831 , -0.08867244,
         0.16050503, -0.02066359, -0.12074912, -0.03338655,  0.0470865 ]],
      dtype=float32)>

In [114]:
model = tf.keras.Sequential()
model.add(embedding_layer)
# 50 -> 16 -> 1
model.add(tf.keras.layers.Dense(units = 10, activation = 'relu'))
model.add(tf.keras.layers.Dense(units = 6, activation = 'relu'))
model.add(tf.keras.layers.Dense(units = 1))
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_4 (KerasLayer)  (None, 50)                48190600  
                                                                 
 dense_16 (Dense)            (None, 10)                510       
                                                                 
 dense_17 (Dense)            (None, 6)                 66        
                                                                 
 dense_18 (Dense)            (None, 1)                 7         
                                                                 
Total params: 48,191,183
Trainable params: 48,191,183
Non-trainable params: 0
_________________________________________________________________


In [115]:
model.compile(optimizer = 'adam', loss = tf.losses.BinaryCrossentropy(from_logits = True), metrics = ['accuracy'])

In [116]:
model.fit(X_train, y_train, epochs = 100, batch_size = 512, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f3a37062790>

In [127]:
results = model.evaluate(X_test, y_test)
print(results)

[0.6057019233703613, 0.7400000095367432]


# Predictions

In [128]:
X_test[0:10]

array([["I'm super pissd."],
       ['And service was super friendly.'],
       ['Why are these sad little vegetables so overcooked?'],
       ['This place was such a nice surprise!'],
       ['They were golden-crispy and delicious.'],
       ['I had high hopes for this place since the burgers are cooked over a charcoal grill, but unfortunately the taste fell flat, way flat.'],
       ['I could eat their bruschetta all day it is devine.'],
       ['Not a single employee came out to see if we were OK or even needed a water refill once they finally served us our food.'],
       ['Lastly, the mozzarella sticks, they were the best thing we ordered.'],
       ['The first time I ever came here I had an amazing experience, I still tell people how awesome the duck was.']],
      dtype=object)

In [129]:
y_test[0:10]

array([[0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1]])

In [130]:
predictions = model.predict(X_test)



In [131]:
predictions[:10]

array([[ 2.5012014 ],
       [-0.94403076],
       [-2.424668  ],
       [ 3.3297837 ],
       [ 3.0377452 ],
       [ 2.3909338 ],
       [ 2.8554568 ],
       [-4.250577  ],
       [ 3.6557002 ],
       [ 2.7922373 ]], dtype=float32)

In [132]:
predictions = tf.nn.sigmoid(predictions).numpy()
predictions[:10]

array([[0.924226  ],
       [0.28008685],
       [0.08131088],
       [0.9654366 ],
       [0.9542505 ],
       [0.91613334],
       [0.94560003],
       [0.01405563],
       [0.9748077 ],
       [0.9422549 ]], dtype=float32)

In [133]:
predictions = (predictions >= 0.5)
predictions[:10]

array([[ True],
       [False],
       [False],
       [ True],
       [ True],
       [ True],
       [ True],
       [False],
       [ True],
       [ True]])

## Making the Confusion Matrix

In [134]:
y_pred = predictions.astype(int)

In [135]:
y_pred[:10]

array([[1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1]])

In [136]:
y_test[0:10]

array([[0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1]])

In [137]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[105  47]
 [  8  40]]


0.725

Obviously, the accuracy score is still low. However, this repo is just an demo. It needs to be improved. 
For example, the "K-fold Cross Validation" wasn't applied here, which would cause the model to be subject to overfitting the training set.