<a href="https://colab.research.google.com/github/KNL1979/DS807/blob/main/Part2_RNN_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading libraries

In [None]:
!pip install wandb -Uq

In [None]:
import tensorflow as tf

import wandb
from wandb.keras import WandbCallback, WandbMetricsLogger

import pandas as pd
#import requests
#import random
import gzip
import json
import io
import os
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.model_selection import train_test_split

In [None]:
import psutil

In [None]:
# Report current memory pressure: used RAM in GB is printed, and the
# percentage in use is shown as the cell's value.
print('RAM Used (GB):', psutil.virtual_memory().used/1000000000)
psutil.virtual_memory().percent

RAM Used (GB): 1.058914304


3.0

# Setting up with Google Drive

In [None]:
# Mount Google Drive under /content/gdrive so the assignment folder stored
# there can be read from this notebook (requires the google.colab package).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Make the exam folder on the mounted Drive the working directory.
# Adjust the path if your copy of AML_Exam/Problem2 lives elsewhere.
%cd '/content/gdrive/My Drive/AML_Exam/Problem2'

/content/gdrive/.shortcut-targets-by-id/1osKmyrDLORFSpes1mwrU-W7OXM-to6bK/AML_Exam/Problem2


# Function to sample the data, and data paths

In [None]:
def dataset_name(file_path):
  """Return the short dataset name encoded in a data file path.

  The directory part and the last extension are dropped, and every
  occurrence of '_5.json' is removed from what remains, e.g.
  '.../Prime_Pantry_5.json.gz' -> 'Prime_Pantry'.
  """
  file_name = os.path.basename(file_path)
  stem, _extension = os.path.splitext(file_name)
  return stem.replace('_5.json', '')

In [None]:
def loading_gz(file_path, printing = False, load_all = False, max_load_samples = np.inf, max_samples_per_cat = 2000, random_state = None):
  """Load a gzipped JSON-lines review file and optionally balance it by rating.

  Every line of the file is parsed as one JSON object. Only the keys
  'overall', 'reviewText' and 'summary' are kept, and rows missing any of
  them are dropped.

  Args:
    file_path: Path to the gzip-compressed file (one JSON object per line).
    printing: If True, print the line index every 1000 lines as progress.
    load_all: If True, return all loaded rows in file order without sampling.
    max_load_samples: Maximum number of lines read from the file.
    max_samples_per_cat: Upper bound on the rows sampled per 'overall' value.
    random_state: Optional seed passed to DataFrame.sample for the per-rating
      sampling and the final shuffle. None (default) keeps the draw unseeded.

  Returns:
    A DataFrame with the columns 'overall', 'reviewText' and 'summary'.
    Unless load_all is True, each rating contributes at most
    min(3 * size of the smallest rating group, its own size,
    max_samples_per_cat) rows and the result is shuffled.
  """
  my_keys = ['overall', 'reviewText', 'summary']
  sampled_data = []
  with gzip.open(file_path, 'rt', encoding='utf-8') as g:
    for i, line in enumerate(g):
      # Check the limit before parsing so exactly max_load_samples lines are
      # read (checking after the append read two lines too many).
      if i >= max_load_samples:
        break
      if printing and i % 1000 == 0:
        print(i)
      data = json.loads(line)
      sampled_data.append({k: data[k] for k in my_keys if k in data})

  # Fixing the columns gives a stable column order and keeps the frame
  # well-formed (with an 'overall' column) even when no line was read.
  sampled_data = pd.DataFrame(sampled_data, columns = my_keys).dropna()

  if load_all or sampled_data.empty:
    return sampled_data

  # Cap every rating at three times the size of the rarest rating.
  nMax = sampled_data.groupby('overall').size().min() * 3

  # Iterating over the groups keeps the 'overall' column in every part,
  # independent of how groupby.apply treats grouping columns.
  parts = [
      grp.sample(n = int(min(nMax, len(grp), max_samples_per_cat)), random_state = random_state)
      for _, grp in sampled_data.groupby('overall')
  ]
  res = pd.concat(parts).reset_index(drop = True)

  # Shuffle so the ratings are not ordered in blocks.
  return res.sample(frac = 1, random_state = random_state)


In [None]:
# File paths of the datasets that can be used for training
file_paths_train = [
    '/content/gdrive/My Drive/AML_Exam/Problem2/data/Arts_Crafts_and_Sewing_5.json.gz',
    '/content/gdrive/My Drive/AML_Exam/Problem2/data/Prime_Pantry_5.json.gz'
]

# Dataset used as validation data
path_val  = '/content/gdrive/My Drive/AML_Exam/Problem2/data/All_Beauty_5.json.gz'

# Dataset used for the final test
path_test = '/content/gdrive/My Drive/AML_Exam/Problem2/data/Luxury_Beauty_5.json.gz'

# Short dataset name -> file path, for the training candidates
data_path_dict = {dataset_name(k):k for k in file_paths_train}


# Loading data

In [None]:
# Stop right away when TensorFlow does not report a GPU as device 0.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print(f'Found GPU at: {device_name}')
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


## Defining Encoding function

In [None]:
def my_encoder(x_train, x_test = None, MAX_VOCAB_SIZE = 1000, OUTPUT_LENGTH = None):
  """Fit a TextVectorization layer on x_train and encode text with it.

  Args:
    x_train: Texts used to build the vocabulary; also encoded and returned.
    x_test: Optional second collection of texts, encoded with the vocabulary
      learned from x_train.
    MAX_VOCAB_SIZE: Passed as max_tokens to TextVectorization.
    OUTPUT_LENGTH: Passed as output_sequence_length to TextVectorization.

  Returns:
    The encoded x_train when x_test is None (or contains only nulls),
    otherwise the tuple (encoded x_train, encoded x_test).
  """
  encoder = tf.keras.layers.TextVectorization(max_tokens=MAX_VOCAB_SIZE, output_sequence_length = OUTPUT_LENGTH)
  # The vocabulary is learned from the training texts only.
  encoder.adapt(x_train)

  encoded_x = encoder(x_train)

  # x_test is optional. pd.isnull(None) is a plain bool without .all(), so
  # None has to be handled before the all-null check.
  if x_test is None or pd.isnull(x_test).all():
    return encoded_x

  return encoded_x, encoder(x_test)

In [None]:
# Inspect the dataset-name -> file-path mapping
data_path_dict

{'Arts_Crafts_and_Sewing': '/content/gdrive/My Drive/AML_Exam/Problem2/data/Arts_Crafts_and_Sewing_5.json.gz',
 'Prime_Pantry': '/content/gdrive/My Drive/AML_Exam/Problem2/data/Prime_Pantry_5.json.gz'}

# Function for building RNN models

In [None]:
def model_builder(vocab_size, review_length, embedding_size, RNN_LAYERS: list):
    """Build an uncompiled Sequential classifier: Embedding -> RNN stack -> softmax.

    Args:
        vocab_size: input_dim of the Embedding layer.
        review_length: Not used by the model; kept so existing calls keep working.
        embedding_size: output_dim of the Embedding layer.
        RNN_LAYERS: One dict per recurrent layer, in stacking order, with keys
            'rnn_type' ('LSTM' or 'GRU'), 'width' (units), 'return_seq'
            (return_sequences), 'bi' (wrap in Bidirectional) and, for
            bidirectional layers, optionally 'bi_merge' (merge_mode,
            defaults to 'concat').

    Returns:
        A tf.keras Sequential model ending in a 5-unit softmax Dense layer.

    Raises:
        ValueError: If a layer dict names an unsupported 'rnn_type'.
    """
    rnn_classes = {'LSTM': tf.keras.layers.LSTM, 'GRU': tf.keras.layers.GRU}

    def my_RNN(rnn_dict):
        """Create the (optionally bidirectional) layer described by rnn_dict."""
        rnn_type = rnn_dict['rnn_type']
        if rnn_type not in rnn_classes:
            # Without this check an unknown type surfaced as an UnboundLocalError.
            raise ValueError(
                f"Unsupported rnn_type {rnn_type!r}; expected one of {sorted(rnn_classes)}")
        layer = rnn_classes[rnn_type](rnn_dict['width'], return_sequences = rnn_dict['return_seq'])
        if rnn_dict.get('bi', False):
            return tf.keras.layers.Bidirectional(layer, merge_mode = rnn_dict.get('bi_merge', 'concat'))
        return layer

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = embedding_size))

    for layer_spec in RNN_LAYERS:
        model.add(my_RNN(layer_spec))

    # Five output units: one per class
    model.add(tf.keras.layers.Dense(5, activation='softmax'))

    return model



In [None]:
def _rnn_layer(rnn_type, width, return_seq, bi = False, bi_merge = None):
    """Return one layer description in the dict format read by model_builder.

    'bi_merge' is only stored when a value is given, so unidirectional
    layers carry no such key.
    """
    spec = {'rnn_type': rnn_type, 'width': width, 'return_seq': return_seq, 'bi': bi}
    if bi_merge is not None:
        spec['bi_merge'] = bi_merge
    return spec


# model_1: two bidirectional LSTMs (concat, then sum) followed by a small LSTM
RNN_layers_1 = [
    _rnn_layer('LSTM', 128, True, bi = True, bi_merge = 'concat'),
    _rnn_layer('LSTM', 128, True, bi = True, bi_merge = 'sum'),
    _rnn_layer('LSTM', 32, False),
]

# model_2: as model_1 but with the merge modes swapped (sum, then concat)
RNN_layers_2 = [
    _rnn_layer('LSTM', 128, True, bi = True, bi_merge = 'sum'),
    _rnn_layer('LSTM', 128, True, bi = True, bi_merge = 'concat'),
    _rnn_layer('LSTM', 32, False),
]

# model_3: three unidirectional LSTMs of decreasing width
RNN_layers_3 = [
    _rnn_layer('LSTM', 256, True),
    _rnn_layer('LSTM', 32, True),
    _rnn_layer('LSTM', 16, False),
]

# model_4: same shape as model_3 with GRU cells
RNN_layers_4 = [
    _rnn_layer('GRU', 256, True),
    _rnn_layer('GRU', 32, True),
    _rnn_layer('GRU', 16, False),
]

# Name -> layer stack; the names are the values swept over as RNN_layer_model
RNN_dict = {
    'model_1': RNN_layers_1,
    'model_2': RNN_layers_2,
    'model_3': RNN_layers_3,
    'model_4': RNN_layers_4,
}

# Sanity check: show the architecture of every candidate model.
# Model.summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" after each table.
for layer_specs in RNN_dict.values():
  model_builder(1500, 100, 128, layer_specs).summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         192000    
                                                                 
 bidirectional (Bidirection  (None, None, 256)         263168    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, None, 128)         394240    
 onal)                                                           
                                                                 
 lstm_2 (LSTM)               (None, 32)                20608     
                                                                 
 dense (Dense)               (None, 5)                 165       
                                                                 
Total params: 870181 (3.32 MB)
Trainable params: 870181 

# WANDB initialization

In [None]:
# Training data: rating-balanced sample of the chosen dataset
# (at most 50000 reviews per rating, drawn from the first ~2M lines).
train_name = 'Prime_Pantry'
load_train = loading_gz(data_path_dict[train_name], max_load_samples = 2000000, max_samples_per_cat=50000)
x_train = load_train['reviewText']
y_train = tf.one_hot(load_train['overall']-1,5) #the minus 1 is to adjust for the output of the model being [0,4]

# Validation data: the whole validation file, without balancing.
load_val = loading_gz(path_val, load_all=True)
x_val = load_val['reviewText']
y_val = tf.one_hot(load_val['overall']-1,5) #the minus 1 is to adjust for the output of the model being [0,4]

# The vocabulary (1500 tokens) is fitted on the training texts only; both
# splits are encoded with it using an output sequence length of 100.
encoded_train, encoded_val  = my_encoder(x_train, x_val, MAX_VOCAB_SIZE = 1500, OUTPUT_LENGTH = 100)

In [None]:
# Never store the W&B API key in the notebook: anything committed here can be
# read by everyone with access to the repository (a key that was committed
# should be revoked). The key is taken from the WANDB_API_KEY environment
# variable when it is set; otherwise wandb.login() asks for it interactively.
if not os.environ.get("WANDB_API_KEY"):
  wandb.login()
sweep_navn = "ModelTest_4"  # name under which the sweep is created

In [None]:
# Configure the sweep: the parameters to search through, the search strategy
# and the optimization metric. With the 'grid' method every combination of the
# active parameter values below is run once; the commented-out entries are
# parameters that are not part of this sweep.
sweep_config = {
    'name': sweep_navn,
    'method': 'grid', #grid, random
    'metric': {
        'name': 'loss',
        'goal': 'minimize'
    },
    'parameters': {
        #'training_set_name': {'values': list(data_path_dict.keys())},
        'embedding_size': {'values': [32,64,128,256]},
        #'RNN_layer_width':  {'values': [32, 64, 128]},
        'RNN_layer_model': {'values': list(RNN_dict.keys())},
        #'vocab_size': {'values': [500,1000,1500,2000]},
        #'review_length': {'values': [None, 50, 100, 150, 200]},
        #'epochs': {'values': [20]},
        #'batch_size': {'values': [64]},
        #'dropout': {'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]},
        #'learning_rate': {'values': [0.0075]},
        #'optimizer': {'values': ['adam', 'SGD']},
        #'activation': {'values': ['relu']},
    }
}

# Register the sweep with W&B; the returned id is what the agent is started with.
sweep_id = wandb.sweep(sweep_config, project="RNN_preliminary_tests", entity='coffeegang')

Create sweep with ID: r8db3smm
Sweep URL: https://wandb.ai/coffeegang/RNN_preliminary_tests/sweeps/r8db3smm


## Setting up RNN with GPU

In [None]:
def train_model():
  """Train one model for the current W&B run (used as the sweep function).

  Starts a W&B run, builds the architecture named by config.RNN_layer_model
  with model_builder, compiles it with Adam and categorical cross-entropy,
  and fits it on the module-level encoded_train / y_train with
  encoded_val / y_val as validation data. Metrics are sent to W&B through
  WandbCallback.

  Reads the globals RNN_dict, train_name, x_train, encoded_train, y_train,
  encoded_val and y_val. Returns None.
  """
  config_defaults = {
    'vocab_size': 1500,
    'review_length': 100,
    'RNN_layer_model': 'model_1',
    # Read by model_builder below. Without a default the function only worked
    # when a sweep supplied the value; sweep parameters override these defaults.
    'embedding_size': 128,
    'epochs': 50,
    'batch_size': 64,
    'learning_rate': 0.0075,
    'optimizer': 'adam',  # recorded only: the Adam optimizer is created explicitly below
  }

  with tf.device('/gpu:0'):
    # Initialize a new wandb run
    wandb.init(config=config_defaults)

    # Config is a variable that holds and saves hyperparameters and inputs
    config = wandb.config

    model = model_builder(config.vocab_size, config.review_length, config.embedding_size, RNN_dict[config.RNN_layer_model])

    optimizer = tf.keras.optimizers.Adam(learning_rate = config.learning_rate)

    # Overall metrics plus precision and recall for each of the five classes.
    overall_metrics = [tf.keras.metrics.CategoricalAccuracy(),
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall(),
                       tf.keras.metrics.F1Score()]
    per_class_precision = [tf.keras.metrics.Precision(class_id= x, name = f"Precision_{x}") for x in range(5)]
    per_class_recall = [tf.keras.metrics.Recall(class_id= x, name = f"Recall_{x}") for x in range(5)]

    model.compile(
        loss = "categorical_crossentropy",
        optimizer = optimizer,
        metrics = overall_metrics + per_class_precision + per_class_recall,
        )

    wandb.log({"train_set": train_name, "size_train": len(x_train), 'rnn_layer_model': RNN_dict[config.RNN_layer_model]})

    # NOTE(review): early stopping watches the *training* loss although
    # validation data is supplied; consider monitoring "val_loss" instead.
    model.fit(encoded_train, y_train,
              epochs = config.epochs,
              batch_size = config.batch_size,
              validation_data=(encoded_val, y_val),
              callbacks=[WandbCallback(save_model = False),
                          tf.keras.callbacks.EarlyStopping(
                              monitor="loss",
                              patience=3,
                          )])

In [None]:
# Start a sweep agent that calls train_model for the runs of this sweep.
wandb.agent(sweep_id, train_model)

[34m[1mwandb[0m: Agent Starting Run: tieadk91 with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_1
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: Currently logged in as: [33maksel[0m ([33mcoffeegang[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▂▁▁▆▆▆▇▇▇▇▇▇▇████████████
Precision_1,▁▁▁▁▁▁▁▁▁▄▆▇▇▇▇▇▇████████
Precision_2,▁▁▁▅▆▆▇▇▇▇▇▇█████████████
Precision_3,▁▁▆▆▆▆▇▇▇▇▇▇▇▇███████████
Precision_4,▁▅▇▇▇▇▇▇▇▇▇██████████████
Recall_0,▁▁▁▂▄▄▅▅▆▆▆▇▇▇▇▇▇████████
Recall_1,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▄▅▆▆▇▇▇█▇▇▆
Recall_2,▁▁▁▁▃▅▅▆▆▆▇▇▇▇▇▇▇████████
Recall_3,▁▁▁▂▂▃▅▅▆▆▆▇▇▇▇▇▇▇███████
Recall_4,▁▂▆▆▆▇▇▇▇▇▇▇█████████████

0,1
Precision_0,0.77623
Precision_1,0.56427
Precision_2,0.70265
Precision_3,0.6522
Precision_4,0.76282
Recall_0,0.59859
Recall_1,0.36003
Recall_2,0.63398
Recall_3,0.5345
Recall_4,0.70307


[34m[1mwandb[0m: Agent Starting Run: h8hs2l3m with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_1
[34m[1mwandb[0m: 	embedding_size: 64


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁▁▁▁▁▁▁▁▁
Precision_1,▁▁▁▁▁▁▁▁▁▁
Precision_2,▁▁▁▁▁▁▁▁▁▁
Precision_3,█▁▁▁▁▁▁▁▁▁
Precision_4,▁▁▁▁▁▁▁▁▁▁
Recall_0,▁▁▁▁▁▁▁▁▁▁
Recall_1,▁▁▁▁▁▁▁▁▁▁
Recall_2,▁▁▁▁▁▁▁▁▁▁
Recall_3,█▁▁▁▁▁▁▁▁▁
Recall_4,▁▁▁▁▁▁▁▁▁▁

0,1
Precision_0,0.0
Precision_1,0.0
Precision_2,0.0
Precision_3,0.0
Precision_4,0.0
Recall_0,0.0
Recall_1,0.0
Recall_2,0.0
Recall_3,0.0
Recall_4,0.0


[34m[1mwandb[0m: Agent Starting Run: kw3ubbj8 with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_1
[34m[1mwandb[0m: 	embedding_size: 128


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁▁▁▁▁█▆██▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆
Precision_1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▃█▄▅▅▄▆▅▆▅▄▅▅
Precision_2,▁▁▁▇▇█▇▇█▇▇▇▇██████████████████
Precision_3,▁▁▁▇▇▇▇▇▇▇█████████████████████
Precision_4,▁▆▇████▇███████████████████████
Recall_0,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▄▅▆▆▆▇▇▇█▇▇▇███▇
Recall_1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▃▃▂▃▄▅██▇█
Recall_2,▁▁▁▁▂▂▂▂▂▄▅▅▆▆▇▇▇▇█████████████
Recall_3,▁▁▁▁▂▁▃▂▃▄▅▆▆▆▇▇▇▇▇▇▇▇▇███▇███▇
Recall_4,▁▁▆▆▆▆▇▇▇▇▇▇▇██████████████████

0,1
Precision_0,0.66237
Precision_1,0.58242
Precision_2,0.61943
Precision_3,0.56089
Precision_4,0.69246
Recall_0,0.31621
Recall_1,0.01449
Recall_2,0.40325
Recall_3,0.31026
Recall_4,0.59541


[34m[1mwandb[0m: Agent Starting Run: ylm2jq6r with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_1
[34m[1mwandb[0m: 	embedding_size: 256


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50


VBox(children=(Label(value='0.003 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.07769954342923154, max=1.…

0,1
Precision_0,▁▁▁▁▁▁▁▁▆▇▇█▇███████████████
Precision_1,▁▁▁▁▁▁▁▁▁▁▁▁▅██▁▆▆▄▅▅▆▅▆▅▅▅▅
Precision_2,▁▁█▅▁▁▄▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
Precision_3,▁▅▅▄▁▁▁█▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
Precision_4,▃▁▁▁▅█▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
Recall_0,▁▁▁▁▁▁▁▁▁▄▅▅▆▆▆▆▇▇▇▇▇▇▇█████
Recall_1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▃▃▂▄▂▄▄▇▅█▇▇
Recall_2,▁▁▁▁▁▁▁▁▃▆▇▇▇▇▇▇▇███████████
Recall_3,▁▁▁▁▁▁▁▁▂▄▅▆▆▇▇▇▇▇██▇████▇▇▇
Recall_4,▁▁▁▁▁▁▆▇▇▇██████████████████

0,1
Precision_0,0.67736
Precision_1,0.58537
Precision_2,0.6127
Precision_3,0.55077
Precision_4,0.68044
Recall_0,0.31836
Recall_1,0.00656
Recall_2,0.36866
Recall_3,0.2714
Recall_4,0.59254


[34m[1mwandb[0m: Agent Starting Run: yv6d1ja0 with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_2
[34m[1mwandb[0m: 	embedding_size: 32


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁▁▁▁
Precision_1,▁▁▁▁▁
Precision_2,▁▁▁▁▁
Precision_3,▁█▁▁▁
Precision_4,▁▁▁▁▁
Recall_0,▁▁▁▁▁
Recall_1,▁▁▁▁▁
Recall_2,▁▁▁▁▁
Recall_3,▁█▁▁▁
Recall_4,▁▁▁▁▁

0,1
Precision_0,0.0
Precision_1,0.0
Precision_2,0.0
Precision_3,0.0
Precision_4,0.0
Recall_0,0.0
Recall_1,0.0
Recall_2,0.0
Recall_3,0.0
Recall_4,0.0


[34m[1mwandb[0m: Agent Starting Run: vtnhd6nt with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_2
[34m[1mwandb[0m: 	embedding_size: 64


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▇▇▇▇█▇█▇▇███████▇██████████
Precision_1,▁▁▁▁▁▁▁▁▁▁▁▆█▆▅▆▅▆▆█▆▆▅▇▆▆▅▆▅▆▆▆▅▅▆▆▆▅▆▆
Precision_2,▁▁▁▁▁▁▁▇▇▇▇▇▇▇██████████████████████████
Precision_3,▁▁▁▁▁▁▁▁▆▇▇▇▇▇▇▇▇███████████████████████
Precision_4,▁▁▁▁▁▁▁▇████████████████████████████████
Recall_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▆▆▆▇▆▇▇▇▇▇▇▇▇█▇▇███████▇█
Recall_1,▁▁▁▁▁▁▁▁▁▁▁▂▂▃▃▅▃▄▅▃▄▄▃▃▅▅▆▅▆▅▄▅▅▆▆▆▇█▆▆
Recall_2,▁▁▁▁▁▁▁▁▄▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇██▇███████
Recall_3,▁▁▁▁▁▁▁▁▁▁▃▄▅▅▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████▇
Recall_4,▁▁▁▁▁▁▁▇▇▇▇▇▇▇█▇▇███████████████████████

0,1
Precision_0,0.69593
Precision_1,0.77143
Precision_2,0.64612
Precision_3,0.55749
Precision_4,0.7033
Recall_0,0.43156
Recall_1,0.00738
Recall_2,0.41566
Recall_3,0.37189
Recall_4,0.61684


[34m[1mwandb[0m: Agent Starting Run: a94f5eex with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_2
[34m[1mwandb[0m: 	embedding_size: 128


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,█▁▁▁▁▁▁▁
Precision_1,▁▁▁▁▁▁▁▁
Precision_2,█▁▁▁▁▁▁▁
Precision_3,▁▁█▇▁▁▁▁
Precision_4,▁▁▁▁▁▁▁▁
Recall_0,█▁▁▁▁▁▁▁
Recall_1,▁▁▁▁▁▁▁▁
Recall_2,█▁▁▁▁▁▁▁
Recall_3,▁▁█▆▁▁▁▁
Recall_4,▁▁▁▁▁▁▁▁

0,1
Precision_0,0.0
Precision_1,0.0
Precision_2,0.0
Precision_3,0.0
Precision_4,0.0
Recall_0,0.0
Recall_1,0.0
Recall_2,0.0
Recall_3,0.0
Recall_4,0.0


[34m[1mwandb[0m: Agent Starting Run: l2lkoelh with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_2
[34m[1mwandb[0m: 	embedding_size: 256


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50


VBox(children=(Label(value='0.003 MB of 0.038 MB uploaded\r'), FloatProgress(value=0.07087127768997266, max=1.…

0,1
Precision_0,▁▁▁▁▁▇▇▇█▇███████████████████████
Precision_1,▁▁▁▁▁▁▁▁▁▁▁▆█▁▆▇▅▆█▇▆█▆▇▇▆▆▆▇▇▆▆▇
Precision_2,▁▁▁▆▇▇▇▇█████████████████████████
Precision_3,▁▁▁█▇▇▇▇▇████████████████████████
Precision_4,▁▁▇▇█████████████████████████████
Recall_0,▁▁▁▁▁▁▄▅▆▇▆▇▇▇▇▇█▇██████████████▇
Recall_1,▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▃▂▃▂▂▃▃▃▃▄▅▅▅▅█▄
Recall_2,▁▁▁▁▁▄▆▆▆▇▇▇▇▇▇▇▇██▇█▇███████████
Recall_3,▁▁▁▁▂▄▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇█▇▇▇██████▇
Recall_4,▁▁▁▆▇▇▇▇▇████████████████████████

0,1
Precision_0,0.69826
Precision_1,0.66667
Precision_2,0.62662
Precision_3,0.57861
Precision_4,0.69603
Recall_0,0.33313
Recall_1,0.01367
Recall_2,0.43642
Recall_3,0.33056
Recall_4,0.59541


[34m[1mwandb[0m: Agent Starting Run: 4q29k3ah with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_3
[34m[1mwandb[0m: 	embedding_size: 32


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁▁▁▁▁▁▁▁
Precision_1,▁▁▁▁▁▁▁▁▁
Precision_2,▁▁▁▁▁▁▁▁▁
Precision_3,▁▁▁▁▁▁▁▁▁
Precision_4,▁▁▁▁█████
Recall_0,▁▁▁▁▁▁▁▁▁
Recall_1,▁▁▁▁▁▁▁▁▁
Recall_2,▁▁▁▁▁▁▁▁▁
Recall_3,▁▁▁▁▁▁▁▁▁
Recall_4,▁▁▁▁▄████

0,1
Precision_0,0.0
Precision_1,0.0
Precision_2,0.0
Precision_3,0.0
Precision_4,0.54682
Recall_0,0.0
Recall_1,0.0
Recall_2,0.0
Recall_3,0.0
Recall_4,0.31672


[34m[1mwandb[0m: Agent Starting Run: 1lybarc5 with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_3
[34m[1mwandb[0m: 	embedding_size: 64


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁▁▁▁▁
Precision_1,▁▁▁▁▁▁
Precision_2,▁▁▁▁▁▁
Precision_3,▁▁▁▁▁▁
Precision_4,▁▁▁▁▁▁
Recall_0,▁▁▁▁▁▁
Recall_1,▁▁▁▁▁▁
Recall_2,▁▁▁▁▁▁
Recall_3,▁▁▁▁▁▁
Recall_4,▁▁▁▁▁▁

0,1
Precision_0,0.0
Precision_1,0.0
Precision_2,0.0
Precision_3,0.0
Precision_4,0.0
Recall_0,0.0
Recall_1,0.0
Recall_2,0.0
Recall_3,0.0
Recall_4,0.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5dx11tgl with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_3
[34m[1mwandb[0m: 	embedding_size: 128


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


VBox(children=(Label(value='0.003 MB of 0.024 MB uploaded\r'), FloatProgress(value=0.10955991495165884, max=1.…

0,1
Precision_0,▁▁▁▁▁▁▁▁▁▁▁▁▁
Precision_1,▁▁▁▁▁▁▁▁▁▁▁▁▁
Precision_2,▁▁▁▁▁▁▁▁▁▁▁▁▁
Precision_3,▁▁▁▁▁▁▁▁▁▁▁▁▁
Precision_4,▁▁▁▁▁▁▁▁▁▁▁▁▁
Recall_0,▁▁▁▁▁▁▁▁▁▁▁▁▁
Recall_1,▁▁▁▁▁▁▁▁▁▁▁▁▁
Recall_2,▁▁▁▁▁▁▁▁▁▁▁▁▁
Recall_3,▁▁▁▁▁▁▁▁▁▁▁▁▁
Recall_4,▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Precision_0,0.0
Precision_1,0.0
Precision_2,0.0
Precision_3,0.0
Precision_4,0.0
Recall_0,0.0
Recall_1,0.0
Recall_2,0.0
Recall_3,0.0
Recall_4,0.0


[34m[1mwandb[0m: Agent Starting Run: 7wzmy3w9 with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_3
[34m[1mwandb[0m: 	embedding_size: 256


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁▁▁▁
Precision_1,▁▁▁▁▁
Precision_2,▁▁▁▁▁
Precision_3,▁▁▁▁▁
Precision_4,▁▁▁▁▁
Recall_0,▁▁▁▁▁
Recall_1,▁▁▁▁▁
Recall_2,▁▁▁▁▁
Recall_3,▁▁▁▁▁
Recall_4,▁▁▁▁▁

0,1
Precision_0,0.0
Precision_1,0.0
Precision_2,0.0
Precision_3,0.0
Precision_4,0.0
Recall_0,0.0
Recall_1,0.0
Recall_2,0.0
Recall_3,0.0
Recall_4,0.0


[34m[1mwandb[0m: Agent Starting Run: 51nhrvkr with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_4
[34m[1mwandb[0m: 	embedding_size: 32


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁▁▁▆▇▇▇█▇██████▇█████▇█▇██▇
Precision_1,▁▁▁▁▁▁▁▁▆▁▁▁▂▅█▄▁▆▁▅█▅█▆▆▆▆▁
Precision_2,▄▁▅▆▄▆▄▅▆▅▄▅▆▅▅▅▆▅▅█▇▆▇▅▆▆▆▆
Precision_3,▁▆▅▇█▇█▇█▇██████████████████
Precision_4,▁▄▆▆▇▇▇▇██▇█▇▇█▇▇█▆▆▇█▇▇██▇█
Recall_0,▁▁▁▁▁▃▄▅▅▄▅▅▆▆▆▆▇▇▇▇▇▇▇█▇███
Recall_1,▁▁▁▁▁▁▁▁▂▁▁▁▂▃▂▂▁█▁▃▂▂▂▂▂▃▂▁
Recall_2,▁▂▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████▇
Recall_3,▁▁▁▂▃▄▄▅▄▅▆▆▆▅▇▇▇▇▆▇▇▆██▇█▇█
Recall_4,▁▅▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇███▇██████

0,1
Precision_0,0.70463
Precision_1,0.0
Precision_2,0.59183
Precision_3,0.52268
Precision_4,0.65899
Recall_0,0.11227
Recall_1,0.0
Recall_2,0.1734
Recall_3,0.15359
Recall_4,0.4696


[34m[1mwandb[0m: Agent Starting Run: 5xncfxyk with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_4
[34m[1mwandb[0m: 	embedding_size: 64


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁█▅▅▆▆▅
Precision_1,▁▁▁▁█▁▁▁
Precision_2,▁▆▇█████
Precision_3,▁▆▆▇█▇▇▇
Precision_4,▁▃▆██▇█▆
Recall_0,▁▁▁▂▆██▅
Recall_1,▁▁▁▁█▁▁▁
Recall_2,▁▁▃▆███▇
Recall_3,▁▂▄▆█▇▇▅
Recall_4,▁▆▇███▇▆

0,1
Precision_0,0.59898
Precision_1,0.0
Precision_2,0.5849
Precision_3,0.51791
Precision_4,0.65217
Recall_0,0.10889
Recall_1,0.0
Recall_2,0.21865
Recall_3,0.12899
Recall_4,0.44889


[34m[1mwandb[0m: Agent Starting Run: g2v2o3m1 with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_4
[34m[1mwandb[0m: 	embedding_size: 128


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▁▇▇▇▇▇████
Precision_1,▁▁▁▁▁▁▁▁▁▁▁
Precision_2,▁██████████
Precision_3,█▁▂▃▄▄▄▅▄▅▆
Precision_4,▁▅▆█▇█▇████
Recall_0,▁▁▂▃▄▄▅▆▇▇█
Recall_1,▁▁▁▁▁▁▁▁▁▁▁
Recall_2,▁▃▄▅▆▆██▇▆▆
Recall_3,▁▅▆▇▇███▇▇▇
Recall_4,▁▅▆▇▇▇██▇▇▇

0,1
Precision_0,0.74138
Precision_1,0.0
Precision_2,0.57952
Precision_3,0.54158
Precision_4,0.64311
Recall_0,0.10581
Recall_1,0.0
Recall_2,0.13486
Recall_3,0.10417
Recall_4,0.3928


[34m[1mwandb[0m: Agent Starting Run: eusbqdba with config:
[34m[1mwandb[0m: 	RNN_layer_model: model_4
[34m[1mwandb[0m: 	embedding_size: 256


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Precision_0,▁▇▇██████
Precision_1,▁▁▁▁▁█▄▁▁
Precision_2,▆▁▄▆▆▇██▆
Precision_3,▁█▇▇███▆▇
Precision_4,▁▄▆▇█▇█▇▅
Recall_0,▁▁▄▆█▇█▇▅
Recall_1,▁▁▁▁▁▅█▁▁
Recall_2,▁▄▆████▇▆
Recall_3,▁▄▅▇███▆▅
Recall_4,▁▆▇████▇▆

0,1
Precision_0,0.67121
Precision_1,0.0
Precision_2,0.58606
Precision_3,0.52661
Precision_4,0.65052
Recall_0,0.15134
Recall_1,0.0
Recall_2,0.21239
Recall_3,0.11566
Recall_4,0.45176


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# Final test

In [None]:
# Reload the training and validation data for the final model.
# NOTE: loading_gz draws its balanced sample without a fixed seed, so the
# training rows selected here can differ from those used during the sweep.
train_name = 'Prime_Pantry'
load_train = loading_gz(data_path_dict[train_name], max_load_samples = 2000000, max_samples_per_cat=50000)
x_train = load_train['reviewText']
y_train = tf.one_hot(load_train['overall']-1,5) #the minus 1 is to adjust for the output of the model being [0,4]

# Validation data: the whole validation file, without balancing.
load_val = loading_gz(path_val, load_all=True)
x_val = load_val['reviewText']
y_val = tf.one_hot(load_val['overall']-1,5) #the minus 1 is to adjust for the output of the model being [0,4]

# The vocabulary is fitted on the training texts only.
encoded_train, encoded_val  = my_encoder(x_train, x_val, MAX_VOCAB_SIZE = 1500, OUTPUT_LENGTH = 100)

In [None]:
# Test data: the whole test file. Labels stay as class indices (not one-hot)
# because they are compared with argmax predictions in the sklearn metrics.
load_test = loading_gz(path_test, load_all=True)
x_test = load_test['reviewText']
y_test = load_test['overall']-1 #the minus 1 is to adjust for the output of the model being [0,4]
# my_encoder fits a new vectorizer on x_train with the same settings; the
# re-encoded training data it also returns is discarded.
_, encoded_test  = my_encoder(x_train, x_test, MAX_VOCAB_SIZE = 1500, OUTPUT_LENGTH = 100)

In [None]:
# Never store the W&B API key in the notebook: anything committed here can be
# read by everyone with access to the repository (a key that was committed
# should be revoked). The key is taken from the WANDB_API_KEY environment
# variable when it is set; otherwise wandb.login() asks for it interactively.
if not os.environ.get("WANDB_API_KEY"):
  wandb.login()

In [None]:
# Hyperparameters of the final run; they are passed to wandb.init as the run
# config and read back through wandb.config when the model is built and fitted.
config_defaults = {
    #'training_set_name': '',
    'vocab_size': 1500,
    'review_length': 100,
    'RNN_layer_model': 'model_1',
    'embedding_size': 32,
    #'RNN_layer_width': 64,
    #'RNN_layer_type': 'LSTM',
    #'train_size': int(50000/5),
    'epochs': 100,
    'batch_size': 64,
    'learning_rate': 0.0075,
    'optimizer': 'adam',
    #'validation_split': 0.3,
  }

In [None]:
# Start a standalone (non-sweep) W&B run named 'final_1' for the final model.
wandb.init(config=config_defaults, project="RNN_preliminary_tests", entity='coffeegang', name='final_1')

# Config is a variable that holds and saves hyperparameters and inputs
config = wandb.config


# Build the architecture selected by config.RNN_layer_model.
model = model_builder(config.vocab_size, config.review_length, config.embedding_size, RNN_dict[config.RNN_layer_model])

In [None]:
# Compile and train the final model. Unlike the sweep runs there is no early
# stopping here: the model is trained for the full config.epochs.
optimizer = tf.keras.optimizers.Adam(learning_rate = config.learning_rate)

model.compile(
    loss = "categorical_crossentropy",
    optimizer = optimizer,
    metrics = [tf.keras.metrics.CategoricalAccuracy(),
                tf.keras.metrics.Precision(),
                tf.keras.metrics.Recall(),
                tf.keras.metrics.F1Score()]
    )


wandb.log({"train_set": train_name, "size_train": len(x_train), 'rnn_layer_model': RNN_dict[config.RNN_layer_model]})

# validation_data is passed as an (inputs, targets) tuple, the form documented
# for Model.fit and the one used in train_model (it was a list before).
model.fit(encoded_train, y_train,
          epochs = config.epochs,
          batch_size = config.batch_size,
          validation_data = (encoded_val, y_val),
          callbacks=[WandbCallback(save_model = False)])

In [None]:
# Model outputs for the encoded test reviews (argmax over axis 1 gives the predicted class).
pred = model.predict(encoded_test)

In [None]:
# Confusion matrix on the test set; classes are the 0-4 indices (rating minus 1).
ConfusionMatrixDisplay(confusion_matrix(y_test, np.argmax(pred,axis=1))).plot()

In [None]:
# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, np.argmax(pred,axis=1)))

In [None]:
# Same evaluation on the validation set: class-index labels, predicted
# classes (computed once), confusion matrix and classification report.
y_val_sparse = load_val['overall']-1
pred_val = model.predict(encoded_val)
pred_val_labels = np.argmax(pred_val, axis=1)
ConfusionMatrixDisplay(confusion_matrix(y_val_sparse, pred_val_labels)).plot()
print(classification_report(y_val_sparse, pred_val_labels))