## Data Fetching

In [None]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import re
import string
from nltk.corpus import stopwords
import nltk
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
nltk.download('stopwords')
from tensorflow.keras import layers
import pickle
from tensorflow.keras.utils import Sequence
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Nadam, Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from time import time


from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset52a7b21.zip

--2021-08-01 09:20:50--  https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset52a7b21.zip
Resolving s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)... 52.219.128.146
Connecting to s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)|52.219.128.146|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1061576029 (1012M) [binary/octet-stream]
Saving to: ‘dataset52a7b21.zip’


2021-08-01 09:21:51 (16.7 MB/s) - ‘dataset52a7b21.zip’ saved [1061576029/1061576029]



In [None]:
!unzip dataset52a7b21.zip

Archive:  dataset52a7b21.zip
   creating: dataset/
  inflating: dataset/train.csv       
  inflating: dataset/sample_submission.csv  
  inflating: dataset/test.csv        
  inflating: dataset/.~lock.train.csv#  


In [None]:
# Parse train.csv with quote handling disabled (csv.QUOTE_NONE) and backslash
# as the escape character; the Python parser engine is selected explicitly.
df = pd.read_csv("dataset/train.csv", escapechar = "\\",quoting = csv.QUOTE_NONE, engine="python")
df

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero
...,...,...,...,...,...
110770,110771,AAHNA E MALL OneBlade Hybrid Trimmer Shaver An...,<p>1-All In One Hyper Advanced Smart Rechargea...,"[Unique One Blade can style, trim and shave, w...",Generic
110771,110772,Grin Health N99 Anti Pollution Reusable Washab...,"<p>SIZE GUIDE : M - (35- 65 Kg), L - (49- 72 K...",[PROTECTION: Filtration rate up to ≥99 percent...,Grin Health
110772,110773,Asian Army Pink Ultra reusable respirator clot...,Asian HyperProtect A95 masks have been enginee...,[Reusable and environment friendly: These mask...,ASIAN
110773,110774,IM Safe 3 Ply Non-Woven Disposable Surgical Fa...,This 3 Ply Disposable face mask is manufacture...,[3 Ply Mask: Genuine 3 Ply Mask. 25 GSM Spun B...,Intermarket


In [None]:
# Number of rows per BROWSE_NODE_ID label (value_counts sorts by frequency, descending).
label_counts = df["BROWSE_NODE_ID"].value_counts()
label_counts

1045       215698
5           70318
1251        51929
1052        45553
4           34177
            ...  
279112          1
822060          1
1337233         1
2136811         1
248936          1
Name: BROWSE_NODE_ID, Length: 9919, dtype: int64

In [None]:
label_counts.count()

9919

In [None]:
def get_percentile(label_counts, top_n=[1, 10, 50, 60, 70, 100, 200, 300, 500, 1000, 1500, 2000, 9000], total=2903024):
  """Print the share of `total` covered by the k most frequent labels.

  For every k in `top_n`, the k largest values of `label_counts` are summed
  and printed as a percentage of `total`, formatted with two decimals.

  Args:
    label_counts: pandas Series of per-label row counts.
    top_n: iterable of k values to report.
    total: denominator used for the percentage.
  """
  for k in top_n:
    covered = label_counts.nlargest(k).values.sum()
    share = "%.2f" % (covered / total * 100)
    print(f"Percentile {k}:", share, "%")

In [None]:
# pick top n labels for faster training (although, this notebook can handle full dataset easily)

get_percentile(label_counts)

Percentile 1: 7.43 %
Percentile 10: 19.48 %
Percentile 50: 37.20 %
Percentile 60: 39.90 %
Percentile 70: 42.10 %
Percentile 100: 47.58 %
Percentile 200: 58.34 %
Percentile 300: 64.67 %
Percentile 500: 72.82 %
Percentile 1000: 83.36 %
Percentile 1500: 88.84 %
Percentile 2000: 92.09 %
Percentile 9000: 99.94 %


In [None]:
# Build the English stop-word set once. The assignment rebinds the imported
# `stopwords` corpus reader to a plain set, so it is guarded: re-running this
# cell would otherwise call `.words` on the set and raise AttributeError.
if not isinstance(stopwords, (set, frozenset)):
  stopwords = set(stopwords.words('english'))

## Data cleaning

In [None]:
# Matches either an HTML tag (`<...>`, non-greedy) or a character entity:
# named (`&amp;`), decimal (`&#123;`) or hexadecimal (`&#x1f;`).
html_tags = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
# Characters accepted by clean_invalid / clean_brand: lowercase ASCII letters, digits and space.
valid_chars = list(string.ascii_lowercase + string.digits + ' ')

def clean_title(text):
  """Normalize one free-text product field.

  The text is stripped, lowercased, cleared of HTML tags/entities, digits and
  ASCII punctuation, and then reduced to the space-joined words that are not
  stop words and are purely alphanumeric.

  Args:
    text: raw cell value; anything that is not a `str` (for example the float
      NaN pandas uses for a missing cell) is treated as missing.

  Returns:
    The cleaned string, or `np.nan` when the input is missing or cleaning fails.
  """
  if not isinstance(text, str):
    # Missing cells are expected input, not errors: returning NaN directly
    # avoids printing one AttributeError message per missing row.
    return np.nan
  try:
    text = text.strip().lower()
    text = re.sub(html_tags, '', text) # remove html
    text = re.sub(r'\d+', '', text) # remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation)) # remove punctuation
    # Drop stop words and any token containing a non-alphanumeric character.
    kept = [word for word in text.split() if word not in stopwords and word.isalnum()]
    return ' '.join(kept)
  except Exception as e:
    print(e)
    return np.nan

def clean_invalid(text): # rejects text containing characters outside valid_chars (e.g. decorative fonts)
  """Return `text` unchanged if every character is allowed, else `np.nan`.

  Each whitespace-separated word is checked character by character against
  the module-level `valid_chars`.

  Args:
    text: string to validate.

  Returns:
    `text` itself when all characters are valid; `np.nan` when an invalid
    character is found or when `text` cannot be processed (the error is printed).
  """
  # The original body had an `except` clause with no opening `try`, which is a
  # SyntaxError; the `try` below restores the intended error handling.
  try:
    for word in text.split():
      for char in word:
        if char not in valid_chars:
          return np.nan
    return text
  except Exception as e:
    print(e)
    return np.nan

def clean_brand(text):
  """Normalize a brand name, rejecting names with characters outside `valid_chars`.

  The text is stripped, lowercased and cleared of HTML tags/entities, digits
  and ASCII punctuation. Unlike clean_title, stop words are kept.

  Args:
    text: raw cell value; non-string values (e.g. NaN for a missing cell) are
      treated as missing.

  Returns:
    The cleaned string, or `np.nan` when the input is missing or any remaining
    character is not in `valid_chars`.
  """
  # Explicit guard instead of a bare `except:`, which would also swallow
  # KeyboardInterrupt and hide genuine programming errors.
  if not isinstance(text, str):
    return np.nan
  text = text.strip().lower()
  text = re.sub(html_tags, '', text)
  text = re.sub(r'\d+', '', text)
  text = text.translate(str.maketrans("", "", string.punctuation))
  if any(char not in valid_chars for word in text.split() for char in word):
    return np.nan
  return text

In [None]:
# The three free-text columns share the same normalisation.
for text_col in ["TITLE", "DESCRIPTION", "BULLET_POINTS"]:
  df[text_col] = df[text_col].apply(clean_title)

# Brands use their dedicated cleaner: clean_title would strip stop words such
# as "the" out of brand names, whereas clean_brand keeps them and rejects
# names containing characters outside valid_chars.
df["BRAND"] = df["BRAND"].apply(clean_brand)

In [None]:
df

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,command small kitchen hooks white decorate dam...,sale unit pack,includes hooks small indoor strips hook holds ...,command
1,2,oneal jump hardware jag unisexadult glove blac...,synthetic leather palm doublelayer thumb reinf...,silicone printing better grip long lasting fle...,oneal
2,3,nfl detroit lions portable party fridge quart,boelter brands lets celebrate favorite popcult...,runs volt dc power volt ac power plugs home ou...,boelter brands
3,4,panasonic single line kxtsmx corded phone white,features station phonebook corded phone alphan...,panasonic landline phones doesnt come manufact...,panasonic
4,5,zero baby girls cotton innerwear bloomer drawe...,zero baby girl panties set cotton breathable s...,zero baby girl panties pack cotton baby girlss...,zero
...,...,...,...,...,...
110770,110771,aahna e mall oneblade hybrid trimmer shaver ed...,one hyper advanced smart rechargeable razor pr...,unique one blade style trim shave keeping skin...,generic
110771,110772,grin health n anti pollution reusable washable...,size guide kg l kg xl kg new n anti pollution ...,protection filtration rate percent small pm us...,grin health
110772,110773,asian army pink ultra reusable respirator clot...,asian hyperprotect masks engineered layer trip...,reusable environment friendly masks washable r...,asian
110773,110774,im safe ply nonwoven disposable surgical face ...,ply disposable face mask manufactured using si...,ply mask genuine ply mask gsm spun bonded nonw...,intermarket


In [None]:
# Remove rows in which all three text columns are missing, then renumber the
# remaining rows 0..n-1 (a single reset; the original reset the index twice).
df = df.dropna(subset=["TITLE", "DESCRIPTION", "BULLET_POINTS"], how="all").reset_index(drop=True)

# Drop the leftover index column. errors="ignore" keeps this cell re-runnable:
# `del df["Unnamed: 0"]` raises KeyError on a second execution.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Fill missing titles from the other text columns: DESCRIPTION first, then
# BULLET_POINTS. Assigning the whole column replaces the per-row chained
# assignment (`df["TITLE"][idx] = ...`), which triggers SettingWithCopyWarning,
# may write to a temporary copy, and loops over every row in Python.
df["TITLE"] = df["TITLE"].fillna(df["DESCRIPTION"]).fillna(df["BULLET_POINTS"])

# Any title still missing had no fallback text at all.
if df["TITLE"].isna().any():
  print("Error interpolating")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# A missing description falls back to the row's title. Assigning the whole
# column avoids chained indexing, which pandas may apply to a temporary copy.
df["DESCRIPTION"] = df["DESCRIPTION"].fillna(df["TITLE"])

In [None]:
# Missing bullet points fall back to the row's description. Assigning the whole
# column avoids chained indexing, which pandas may apply to a temporary copy.
df["BULLET_POINTS"] = df["BULLET_POINTS"].fillna(df["DESCRIPTION"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Replace missing brands with a fixed placeholder token (any constant string works).
# NOTE(review): "INVADLID" looks like a typo for "INVALID"; it is left untouched
# because renaming it would change the data written out afterwards — confirm before changing.
df['BRAND'] = df['BRAND'].fillna("INVADLID")

In [None]:
df

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,pete cat bedtime blues doll inch,pete cat coolest popular cat town new pete cat...,pete cat bedtime blues plush dollbased popular...,MerryMakers,0
1,new yorker nyhm refrigerator magnet x,new yorker handsome cello wrapped hard magnet ...,cat tea cup new yorker cover artist gurbuz dog...,The New Yorker,1
2,ultimate selfsufficiency handbook complete gui...,ultimate selfsufficiency handbook complete gui...,skyhorse publishing,imusti,2
3,amway nutrilite kids chewable iron tablets,amway nutrilite kids chewable iron tablets,nutrilite kidschewable iron tabletsquantity ta...,Amway,3
4,teacher planner company lesson academic teache...,teacher planner company lesson academic teache...,teacher planner company lesson academic teache...,jatt,4
...,...,...,...,...,...
2902950,premium aviator sunglasses hd polarized bright...,premium aviator sunglasses color options craft...,frame size lens height mm lens width mm nosebr...,Generic,1040
2902951,social distance stickers set sticker slip resi...,set prints social distancing sticker self adhe...,covid safety sticker set maintain crowd contro...,Generic,15199
2902952,torrto face shield pack adjustable elastic str...,complete face protection torrto face shield ef...,microns pack pcscomplete face protection torrt...,TORR-TO,1044933
2902953,typec mm oppo r pro typec mm audio jack adapte...,still want use favorite earphonesheadphones le...,indian connectors made indian sockets wall cha...,SHOPBELL,14790


In [None]:
# Save the cleaned frame to disk for reuse.
# to_csv writes the index as an unnamed first column by default, so reading
# this file back yields an extra "Unnamed: 0" column unless index_col=0 is used.

df.to_csv("cleaned_final.csv")
!cp "cleaned_final.csv" "/content/drive/MyDrive/HackerEarth/cleaned_final.csv"

## Data vectorization


In [None]:
df

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,pete cat bedtime blues doll inch,pete cat coolest popular cat town new pete cat...,pete cat bedtime blues plush dollbased popular...,merrymakers,0
1,new yorker nyhm refrigerator magnet x,new yorker handsome cello wrapped hard magnet ...,cat tea cup new yorker cover artist gurbuz dog...,the new yorker,1
2,ultimate selfsufficiency handbook complete gui...,ultimate selfsufficiency handbook complete gui...,skyhorse publishing,imusti,2
3,amway nutrilite kids chewable iron tablets,amway nutrilite kids chewable iron tablets,nutrilite kidschewable iron tabletsquantity ta...,amway,3
4,teacher planner company lesson academic teache...,teacher planner company lesson academic teache...,teacher planner company lesson academic teache...,jatt,4
...,...,...,...,...,...
2902950,premium aviator sunglasses hd polarized bright...,premium aviator sunglasses color options craft...,frame size lens height mm lens width mm nosebr...,generic,1040
2902951,social distance stickers set sticker slip resi...,set prints social distancing sticker self adhe...,covid safety sticker set maintain crowd contro...,generic,15199
2902952,torrto face shield pack adjustable elastic str...,complete face protection torrto face shield ef...,microns pack pcscomplete face protection torrt...,torrto,1044933
2902953,typec mm oppo r pro typec mm audio jack adapte...,still want use favorite earphonesheadphones le...,indian connectors made indian sockets wall cha...,shopbell,14790


In [None]:
# Assign tokens to words (this was not used due to large size in this notebook)
# We use character level models instead of words.
# NOTE(review): `vectorize_layer` is not created in any cell visible before this
# point — it presumably comes from a removed cell; confirm before re-running.
# NOTE(review): Keras documents adapt() as recomputing the layer state from the
# data it is given; if so, each call below discards the previous vocabulary
# instead of extending it — verify for the installed TensorFlow version.

# Adapt on DESCRIPTION in two halves.
vectorize_layer.adapt(df["DESCRIPTION"].values[:len(df)//2])
vectorize_layer.adapt(df["DESCRIPTION"].values[len(df)//2:])

In [None]:
# Adapt the word vectorizer on TITLE, first half of the rows and then the second half.
vectorize_layer.adapt(df["TITLE"].values[:len(df)//2])
vectorize_layer.adapt(df["TITLE"].values[len(df)//2:])

In [None]:
# Adapt the word vectorizer on BULLET_POINTS, first half of the rows and then the second half.
vectorize_layer.adapt(df["BULLET_POINTS"].values[:len(df)//2])
vectorize_layer.adapt(df["BULLET_POINTS"].values[len(df)//2:])

In [None]:
# Persist the vectorizer's config and weights. The context manager closes (and
# therefore flushes) the file before it is copied elsewhere; the original left
# the handle from open() unclosed.
with open("tv_layer_all.pkl", "wb") as fh:
  pickle.dump({'config': vectorize_layer.get_config(),
               'weights': vectorize_layer.get_weights()},
              fh)

In [None]:
!cp "/content/tv_layer_all.pkl" "/content/drive/MyDrive/HackerEarth/tv_layer_all.pkl"

## Load data vectorizer

In [None]:
!cp "/content/drive/MyDrive/HackerEarth/tv_layer_all.pkl" "/content/tv_layer_all.pkl"

def get_char_vectorizer():
  """Build the character-level TextVectorization layer.

  Returns:
    A TextVectorization layer in integer output mode with an output sequence
    length of 4096, adapted on the lowercase ASCII letters, the digits and
    the space character.
  """
  alphabet = list(string.ascii_lowercase + string.digits + ' ')
  char_layer = TextVectorization(output_mode='int', output_sequence_length=4096)
  char_layer.adapt(alphabet)
  return char_layer

def get_word_vectorizer():
  """Rebuild the word-level TextVectorization layer stored in tv_layer_all.pkl.

  The pickle is expected to hold a dict with the layer's 'config' and
  'weights' entries.

  Returns:
    A TextVectorization layer created from the stored config with the stored
    weights applied.
  """
  # NOTE: pickle.load can execute arbitrary code; only load a file you produced yourself.
  # The context manager closes the handle, which the original open() call never did.
  with open("tv_layer_all.pkl", "rb") as fh:
    from_disk = pickle.load(fh)
  vectorize_layer = TextVectorization.from_config(from_disk['config'])
  vectorize_layer.set_weights(from_disk['weights'])
  return vectorize_layer

## Model Creation

In [None]:
# Fully custom model, each column has custom embedding layer whose
# features are processed and concatenated at the end.

max_features = 40    # size of the embedding vocabulary (token ids must be < 40)
embedding_dim = 128  # embedding width for the text inputs; the brand input uses 1/8 of it

def _conv_tower(features):
  """Apply three valid-padded Conv1D(128, 7, relu) layers (strides 1, 3, 3), then global max pooling."""
  for stride in (1, 3, 3):
    features = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=stride)(features)
  return layers.GlobalMaxPooling1D()(features)

def get_char_model(n):
  """Build and compile the four-branch character-level classifier.

  Inputs, in order: title, description and bullet points (4096 token ids
  each) and brand (128 token ids). Every input gets its own embedding,
  dropout and convolution tower; the pooled features are concatenated and fed
  through a 2048-unit dense layer into an `n`-way softmax.

  Args:
    n: number of output classes.

  Returns:
    A compiled `tf.keras.Model` (categorical cross-entropy, Adam, accuracy).
  """
  # (sequence length, embedding width) for title, description, bullets, brand.
  branch_specs = [
      (4096, embedding_dim),
      (4096, embedding_dim),
      (4096, embedding_dim),
      (128, embedding_dim // 8),
  ]

  # Layers are created stage by stage (all inputs, all embeddings, all dropouts,
  # then one tower per branch), the same creation order as the hand-unrolled
  # original, so Keras' auto-generated layer names stay the same.
  inputs = [keras.Input(shape=(length,), dtype="int64") for length, _ in branch_specs]
  embedded = [layers.Embedding(max_features, width)(inp)
              for inp, (_, width) in zip(inputs, branch_specs)]
  dropped = [layers.Dropout(0.5)(emb) for emb in embedded]
  pooled = [_conv_tower(branch) for branch in dropped]

  concat = layers.Concatenate()(pooled)
  dense = layers.Dense(2048, activation="relu")(concat)
  dense = layers.Dropout(0.5)(dense)

  predictions = layers.Dense(n, activation="softmax", name="predictions")(dense)

  model = tf.keras.Model(inputs=inputs, outputs=[predictions])
  model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
  return model

In [None]:
model = get_char_model(1500)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 128)]        0                                            
______________________________________________________________________________________________

In [None]:
# split train and validation data

def get_train_test(df, n, test_size=0.0001, random_state=None):
  """Keep the `n` most frequent labels and split the rows into train and validation.

  Args:
    df: DataFrame with a BROWSE_NODE_ID label column.
    n: number of most frequent labels to keep.
    test_size: fraction (or count) of rows held out, passed to train_test_split.
      Defaults to the previously hard-coded 0.0001.
    random_state: optional seed for a reproducible split; None keeps the
      previous unseeded behaviour.

  Returns:
    (train, test, le): the two frames with fresh 0..k-1 indexes and a
    LabelEncoder fitted on the labels of all kept rows (train and test together).
  """
  top_labels = df["BROWSE_NODE_ID"].value_counts().nlargest(n).index
  # reset_index returns a new frame here instead of mutating a filtered slice in place.
  new_df = df[df["BROWSE_NODE_ID"].isin(top_labels)].reset_index(drop=True)

  train, test = train_test_split(new_df, test_size=test_size, random_state=random_state)
  train = train.reset_index(drop=True)
  test = test.reset_index(drop=True)

  le = preprocessing.LabelEncoder()
  le.fit(new_df["BROWSE_NODE_ID"].values)

  return train, test, le

train, test, le = get_train_test(df, 1500)

## Make data generators

In [None]:
# Custom data generator that serves the data in batches to avoid out-of-memory errors

class DataGeneratorWord(Sequence):
    """Keras Sequence producing character-level batches from a DataFrame.

    Every row of `df` must unpack into
    (title, description, bullet points, brand, label), in that column order.
    Each text field is optionally blanked at random, spaced out character by
    character and vectorized; labels are one-hot encoded via `le.transform`.

    Args:
      df: DataFrame with the five columns above.
      to_fit: when True `__getitem__` returns `(inputs, y)`, otherwise only `inputs`.
      le: label encoder whose `transform` maps a label to its column in `y`.
      batch_size: number of rows per batch.
      shuffle: reshuffle the rows at the end of every epoch.
      n: width of the one-hot label matrix.
      augmentation_rate: probability with which a text field is replaced by "".
    """

    def __init__(self, df, to_fit, le, batch_size=64, shuffle=False, n=1500, augmentation_rate=0.2):
        self.df = df
        self.le = le
        self.n = n
        self.to_fit = to_fit
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.epoch = 0
        # Performs the initial shuffle (when enabled) and sets epoch to 1.
        self.on_epoch_end()
        self.vectorizer = get_char_vectorizer()
        self.augmentation_rate = augmentation_rate

    def __len__(self):
        """Number of full batches; trailing rows that do not fill a batch are not served."""
        return int(len(self.df) // self.batch_size)

    def add_spaces(self, sentence):
        """Insert a space between consecutive characters ("ab" -> "a b")."""
        return " ".join(sentence)

    def augment(self, text):
        """Return "" with probability `augmentation_rate`, otherwise `text` unchanged."""
        if np.random.random() < self.augmentation_rate:
            text = ""
        return text

    def __getitem__(self, index):
        """Build batch number `index`.

        Returns:
          `[titles, descs, bullets, brands]` as vectorized tensors (brands cut
          to the first 128 positions), followed by the one-hot label matrix
          when `to_fit` is True.
        """
        start = index * self.batch_size
        # Positional slicing: selecting by index label (.loc) returns the same
        # rows no matter how the frame was reordered, which made shuffling a no-op.
        rows = self.df.iloc[start:start + self.batch_size].values

        titles, descs, bullets, brands, labels = [], [], [], [], []
        for title, desc, bullet, brand, label in rows:
            titles.append(self.add_spaces(self.augment(title)))
            descs.append(self.add_spaces(self.augment(desc)))
            bullets.append(self.add_spaces(self.augment(bullet)))
            brands.append(self.add_spaces(self.augment(brand)))
            labels.append(label)

        inputs = [self.vectorizer(titles), self.vectorizer(descs),
                  self.vectorizer(bullets), self.vectorizer(brands)[:, :128]]
        if not self.to_fit:
            return inputs

        # Encode all labels of the batch in one call instead of one call per row.
        y = np.zeros([len(labels), self.n])
        y[np.arange(len(labels)), self.le.transform(labels)] = 1
        return inputs, y

    def on_epoch_end(self):
        """Reshuffle the rows (when enabled) and advance the epoch counter."""
        if self.shuffle:
            order = np.random.permutation(len(self.df))
            self.df = self.df.iloc[order].reset_index(drop=True)
        self.epoch += 1

In [None]:
# Assign weight labels according to class frequency distribution 

from sklearn.utils.class_weight import compute_class_weight

# `classes` and `y` are passed by keyword: scikit-learn made them keyword-only,
# so the positional form fails on current releases.
class_wts = compute_class_weight('balanced',
                                 classes=np.unique(le.classes_),
                                 y=train["BROWSE_NODE_ID"])

# Map each encoded class index to its weight.
# NOTE(review): weight_dict is not passed to any training call visible in this
# notebook (no class_weight= argument) — confirm whether it should be.
weight_dict = dict(zip(le.transform(le.classes_), class_wts))
print(weight_dict)

{0: 0.5689990984899707, 1: 1.0357534358974358, 2: 5.054352352352352, 3: 0.05007286863217604, 4: 0.024111098377407863, 5: 0.45244605734767024, 6: 6.351318238993711, 7: 2.613508281573499, 8: 0.08480371508708286, 9: 5.783846506300114, 10: 2.4934804938271604, 11: 1.0893846817691477, 12: 5.98967734282325, 13: 1.2117345812335014, 14: 0.18849807742561692, 15: 0.9923934748427673, 16: 6.983814661134163, 17: 5.482408251900108, 18: 3.869193869731801, 19: 3.4070836707152496, 20: 0.4912724265421288, 21: 0.8238371675640398, 22: 1.1761700442580947, 23: 0.5306113913408995, 24: 0.11395391559467388, 25: 4.950292156862745, 26: 1.8136846264367816, 27: 6.705575033200531, 28: 0.9192241034043328, 29: 0.15015606506676182, 30: 0.7143885116015846, 31: 6.652566534914361, 32: 4.795154795821462, 33: 1.2039337148307105, 34: 4.906995140913509, 35: 0.376195648934585, 36: 0.22519391668896618, 37: 1.158361550814407, 38: 0.594944974667138, 39: 0.7232915055149692, 40: 0.30039252781248144, 41: 5.1787671794871795, 42: 0.92

In [None]:
train_gen = DataGeneratorWord(train, True, le, shuffle=False)
# Validation batches must carry labels (to_fit=True), otherwise Keras has
# nothing to compute val_loss / val_accuracy against. Random blanking of fields
# is disabled so the validation scores are deterministic.
val_gen = DataGeneratorWord(test, True, le, shuffle=False, augmentation_rate=0.0)

In [None]:
# Test data generators

# Fetch the first batch from each generator as a smoke test.
X, y = train_gen[0]
val_gen[0]

[<tf.Tensor: shape=(64, 4096), dtype=int64, numpy=
 array([[ 7, 14, 19, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [27, 27,  9, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [25, 23, 10, ...,  0,  0,  0],
        [ 8, 20, 26, ...,  0,  0,  0]])>,
 <tf.Tensor: shape=(64, 4096), dtype=int64, numpy=
 array([[ 0,  0,  0, ...,  0,  0,  0],
        [ 9, 20, 16, ...,  0,  0,  0],
        [25, 27, 10, ...,  0,  0,  0],
        ...,
        [16, 27, 15, ...,  0,  0,  0],
        [12, 26, 10, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]])>,
 <tf.Tensor: shape=(64, 4096), dtype=int64, numpy=
 array([[ 0,  0,  0, ...,  0,  0,  0],
        [17,  7, 10, ...,  0,  0,  0],
        [20, 27,  8, ...,  0,  0,  0],
        ...,
        [22, 13,  7, ...,  0,  0,  0],
        [14, 13,  8, ...,  0,  0,  0],
        [23,  4,  8, ...,  0,  0,  0]])>,
 <tf.Tensor: shape=(64, 128), dtype=int64, numpy=
 array([[ 7, 14, 19, ...,  0,  0,  0],
        

## Model training

In [None]:
#optimizer = Nadam(lr=1e-6, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)
# One log directory per run, named after the start timestamp. The original
# string had no "{}" placeholder, so .format(time()) silently did nothing and
# every run logged into the same directory.
tensorboard = TensorBoard(log_dir="./drive/My Drive/HackerEarth/Models/logs/{}".format(time()), histogram_freq=1, write_graph=True)
# NOTE(review): early_stopping is created but not included in callbacks_list
# below — confirm whether that is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# A checkpoint is written after every epoch (save_best_only=False); the file
# name embeds the epoch number and the validation loss.
filepath = "./drive/My Drive/HackerEarth/Models/" + "model-{epoch:03d}-{val_loss:.5f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=False)
callbacks_list = [checkpoint, tensorboard]

In [None]:
NUM_EPOCHS = 50

# Model.fit accepts keras.utils.Sequence objects directly; fit_generator is
# deprecated in TensorFlow 2.x and absent from newer Keras releases.
history = model.fit(train_gen, epochs=NUM_EPOCHS,
                    shuffle=True, callbacks=callbacks_list,
                    validation_data=val_gen)

## Reload Training

In [None]:
from tensorflow.keras.models import load_model

# Resume from a checkpoint stored on Drive.
# NOTE(review): the file name 'model-001-0.43.hdf5' does not follow the
# checkpoint pattern used above ("model-{epoch:03d}-{val_loss:.5f}.hdf5", five
# decimals) — presumably renamed by hand; confirm the path exists.
model = load_model('/content/drive/MyDrive/HackerEarth/Models/model-001-0.43.hdf5')
# NOTE(review): load_model on a full-model HDF5 file normally restores the
# compile settings as well; compiling again presumably starts from a fresh
# "adam" optimizer state — confirm that this is wanted when resuming.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
NUM_EPOCHS = 2

# Model.fit accepts keras.utils.Sequence objects directly; fit_generator is
# deprecated in TensorFlow 2.x and absent from newer Keras releases.
history = model.fit(train_gen, epochs=NUM_EPOCHS,
                    shuffle=True, callbacks=callbacks_list,
                    validation_data=val_gen)



Epoch 1/2

## Testing

In [None]:
!cp "/content/drive/MyDrive/HackerEarth/test_clean.csv" "test_clean.csv"

In [None]:
# Load the cleaned test set, discard the saved index column and turn missing
# text into empty strings.
test_set = (
    pd.read_csv("test_clean.csv")
    .drop(columns=["Unnamed: 0"])
    .fillna("")
)

In [None]:
test_set

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,command small kitchen hooks white decorate dam...,sale unit pack,includes hooks small indoor strips hook holds ...,command
1,2,oneal jump hardware jag unisexadult glove blac...,synthetic leather palm doublelayer thumb reinf...,silicone printing better grip long lasting fle...,oneal
2,3,nfl detroit lions portable party fridge quart,boelter brands lets celebrate favorite popcult...,runs volt dc power volt ac power plugs home ou...,boelter brands
3,4,panasonic single line kxtsmx corded phone white,features station phonebook corded phone alphan...,panasonic landline phones doesnt come manufact...,panasonic
4,5,zero baby girls cotton innerwear bloomer drawe...,zero baby girl panties set cotton breathable s...,zero baby girl panties pack cotton baby girlss...,zero
...,...,...,...,...,...
110770,110771,aahna e mall oneblade hybrid trimmer shaver ed...,one hyper advanced smart rechargeable razor pr...,unique one blade style trim shave keeping skin...,generic
110771,110772,grin health n anti pollution reusable washable...,size guide kg l kg xl kg new n anti pollution ...,protection filtration rate percent small pm us...,grin health
110772,110773,asian army pink ultra reusable respirator clot...,asian hyperprotect masks engineered layer trip...,reusable environment friendly masks washable r...,asian
110773,110774,im safe ply nonwoven disposable surgical face ...,ply disposable face mask manufactured using si...,ply mask genuine ply mask gsm spun bonded nonw...,intermarket


In [None]:
class TestDataGeneratorWord(Sequence):
    """Batch generator for inference on the unlabeled test set.

    Every row of `df` must unpack into
    (product id, title, description, bullet points, brand), in that column
    order. Text fields are spaced out character by character and vectorized.

    Args:
      df: DataFrame with the five columns above.
      batch_size: number of rows per batch.
      n: number of classes (stored, not used by the generator itself).
    """

    def __init__(self, df, batch_size=75, n=1500):
        self.df = df
        self.n = n
        self.batch_size = batch_size
        self.epoch = 0
        self.on_epoch_end()
        self.vectorizer = get_char_vectorizer()

    def __len__(self):
        """Number of batches, counting a final partial one.

        Ceiling division: with floor division the last `len(df) % batch_size`
        rows would never be predicted and would be missing from the submission.
        """
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def add_spaces(self, sentence):
        """Insert a space between consecutive characters ("ab" -> "a b")."""
        return " ".join(sentence)

    def __getitem__(self, index):
        """Build batch number `index`.

        Returns:
          `(ids, inputs)`: the product ids of the batch and the list
          `[titles, descs, bullets, brands]` of vectorized tensors (brands cut
          to the first 128 positions).
        """
        start = index * self.batch_size
        # Positional slice: works for any index and simply comes back shorter
        # for the final partial batch.
        rows = self.df.iloc[start:start + self.batch_size].values

        titles, descs, bullets, brands, ids = [], [], [], [], []
        for product_id, title, desc, bullet, brand in rows:
            titles.append(self.add_spaces(title))
            descs.append(self.add_spaces(desc))
            bullets.append(self.add_spaces(bullet))
            brands.append(self.add_spaces(brand))
            ids.append(product_id)

        return ids, [self.vectorizer(titles), self.vectorizer(descs),
                     self.vectorizer(bullets), self.vectorizer(brands)[:, :128]]

    def on_epoch_end(self):
        """Advance the epoch counter."""
        self.epoch += 1

In [None]:
test_gen = TestDataGeneratorWord(test_set)

In [None]:
# Predict batch by batch and collect (PRODUCT_ID, BROWSE_NODE_ID) pairs.
submission = {"PRODUCT_ID":[],
              "BROWSE_NODE_ID":[]}

n_batches = len(test_gen)
for idx in range(n_batches):
  if idx % 500 == 0:
    print(idx / n_batches * 100, "Done")
  # `batch_inputs` rather than `input`, which would shadow the Python builtin.
  ids, batch_inputs = test_gen[idx]
  res = model.predict(batch_inputs)
  # argmax picks the most probable class index; the encoder maps it back to the node id.
  decoded = le.inverse_transform(res.argmax(axis=1))
  submission["PRODUCT_ID"] += ids
  submission["BROWSE_NODE_ID"] += decoded.tolist()

In [None]:
submission_df = pd.DataFrame(submission)
submission_df

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,110753,4501
1,110754,709
2,110755,7462
3,110756,1287
4,110757,532
5,110758,840
6,110759,1922
7,110760,4
8,110761,800
9,110762,800


In [None]:
submission_df.to_csv("submission.csv", index=False)