# Embedding Layer Training Example v1

## Loading Data

In [1]:
import numpy as np
import pandas as pd 

In [2]:
# Read the spell table from the local dataset folder.
# (When running on Kaggle the file is at "/kaggle/input/dndspells/dnd-spells.csv".)
df_spells = pd.read_csv("dataset/dnd-spells.csv")
print(f"Spells {len(df_spells)}")
df_spells.head()

Spells 554


Unnamed: 0,name,classes,level,school,cast_time,range,duration,verbal,somatic,material,material_cost,description
0,Acid Splash,"Artificer, Sorcerer, Wizard",0,Conjuration,1 Action,60 Feet,Instantaneous,1,1,0,,You hurl a bubble of acid. Choose one creature...
1,Blade Ward,"Bard, Sorcerer, Warlock, Wizard",0,Abjuration,1 Action,Self,1 round,1,1,0,,You extend your hand and trace a sigil of ward...
2,Booming Blade,"Artificer, Sorcerer, Warlock, Wizard",0,Evocation,1 Action,Self (5-foot radius),1 round,0,1,1,a melee weapon worth at least 1 sp,You brandish the weapon used in the spell’s ca...
3,Chill Touch,"Sorcerer, Warlock, Wizard",0,Necromancy,1 Action,120 Feet,1 round,1,1,0,,"You create a ghostly, skeletal hand in the spa..."
4,Control Flames,"Druid, Sorcerer, Wizard",0,Transmutation,1 Action,60 Feet,Instantaneous or 1 hour,0,1,0,,You choose nonmagical flame that you can see w...


## Data Exploration

### Maximum number of tokens per row

In [4]:
!pip install spacy

Collecting spacy

ERROR: Could not install packages due to an OSError: HTTPSConnectionPool(host='files.pythonhosted.org', port=443): Max retries exceeded with url: /packages/23/1c/9f44f3258abd158cd45cf856242923629430d27651ddea9b8b53cdcc02d7/spacy-3.4.1-cp39-cp39-win_amd64.whl (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1129)')))








In [5]:
import spacy
# English spaCy pipeline used to tokenize the spell descriptions.
npl = spacy.load('en_core_web_sm')

def count_words(x):
    """Return the number of spaCy tokens in the text ``x``.

    The text is run through the ``npl`` pipeline and the length of the
    resulting document (its token count) is returned.
    """
    return len(npl(x))

# Only the 'description' column is needed, so apply the counter to that column
# directly instead of iterating row-wise over the whole frame with axis=1.
df_spells["count_tokens"] = df_spells["description"].apply(count_words)
df_spells["count_tokens"].head()

ModuleNotFoundError: No module named 'spacy'

In [None]:
# Summary statistics of the spell table.
df_spells.describe()

### Number of words for the vocabulary

In [None]:
import tensorflow as tf

# Vectorizer configured to lower-case the text, strip punctuation and emit
# integer token ids in sequences of fixed length 800.
text_vectorization_layer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=800,
)

In [None]:
# Build the layer's vocabulary from the spell descriptions.
text_vectorization_layer.adapt(df_spells['description'])

In [None]:
# Report how many terms the adapted vectorization layer holds.
vocabulary_terms = text_vectorization_layer.get_vocabulary()
print(f"We have {len(vocabulary_terms)} terms as vocabulary")

## Creating input X
Each description is encoded with Keras `one_hot`, which assigns an integer to every word (token). Because the descriptions contain different numbers of tokens, the shorter sequences are then padded with zeros so that every row has the same length.

In [None]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Sizes used by the encoding step.
vocabulary_size = 4504
max_length = 800

df_descrip = df_spells['description']

# Encode every description as a list of integers, then pad at the end with
# zeros up to `max_length`.
# NOTE(review): per the Keras docs `one_hot` assigns indices by hashing, so
# distinct words may end up sharing an index - confirm this is acceptable.
input_x = pad_sequences(
    [one_hot(text, vocabulary_size) for text in df_descrip],
    maxlen=max_length,
    padding='post',
)
input_x[:2]

## Creating input Y
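The label (the `school` column) is converted into a form the model can understand. `LabelEncoder` first maps each school name to an integer; `tf.keras.utils.to_categorical` then turns those integers into one-hot vectors, so that the model does not treat the integer codes as magnitudes (a larger code would otherwise look like a "bigger" school).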

In [None]:
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Fit the encoder on the school names (`fit` returns the fitted encoder)
# and show the classes it learned.
label_encoder = LabelEncoder().fit(df_spells['school'])
print('Labels: ', label_encoder.classes_)

def label_encode(labels,encoder):
    """One-hot encode ``labels`` with an already fitted ``encoder``.

    ``encoder.transform`` turns the labels into integer class ids, which are
    then expanded into one-hot rows with one column per entry of
    ``encoder.classes_``.
    """
    class_ids = encoder.transform(labels)
    n_classes = len(encoder.classes_)
    return tf.keras.utils.to_categorical(class_ids, n_classes)


def label_decode(labels,decoder):
    """Map rows of per-class scores back to their original label values.

    The index of the largest entry in each row of ``labels`` is taken as the
    class id and passed to ``decoder.inverse_transform``.
    """
    return decoder.inverse_transform(np.argmax(labels, axis=1))

In [None]:
# Encode the school of every spell with the fitted encoder and preview the first five rows.
input_y = label_encode(df_spells['school'],label_encoder)
input_y[:5]

## Modeling

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Embedding,Dense

#### Important:
The vocabulary contains 4504 words, and the maximum number of words in a description is 800.

In [None]:
# L2-only weight penalty (the L1 factor is 0) shared by the hidden Dense layers.
regularizer = tf.keras.regularizers.l1_l2(l1=0, l2=0.001)

# Embedding -> Flatten -> three regularized ReLU layers -> softmax over the schools.
# The sizes reuse `vocabulary_size`, `max_length` and the fitted `label_encoder`
# from the input-building cells instead of repeating the literals 4504 / 800 / 8,
# so the model cannot drift out of sync with the encoded data.
model = Sequential()
embedding_layer = Embedding(input_dim=vocabulary_size,output_dim=64,input_length=max_length)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(640,kernel_regularizer=regularizer,activation='relu'))
model.add(Dense(64,kernel_regularizer=regularizer,activation='relu'))
model.add(Dense(64,kernel_regularizer=regularizer,activation='relu'))
model.add(Dense(len(label_encoder.classes_),activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

In [None]:
# `validation_split` holds out the *last* 20% of the rows, taken before any
# shuffling (`shuffle=True` only reorders the training part each epoch).
# The CSV preview starts with the level-0 spells in alphabetical order, so the
# file looks sorted; permute the rows once (fixed seed for reproducibility) so
# the validation set is a random sample rather than the tail of the file.
shuffle_order = np.random.default_rng(42).permutation(len(input_x))
history = model.fit(input_x[shuffle_order],input_y[shuffle_order],batch_size=32, epochs=15,verbose=0,validation_split=0.2, shuffle=True)

In [None]:
# Shape of the embedding layer's weight matrix (first entry of get_weights()).
print(embedding_layer.get_weights()[0].shape)

In [None]:
# get_weights() returns a list whose first element is the weight matrix, so
# slicing the list itself would return the whole matrix. Index the matrix
# first, then take its first 20 rows (embedding vectors).
embedding_layer.get_weights()[0][:20]

## Visualization

In [None]:
import matplotlib.pyplot as plt

def training_plot(metrics, history):
    """Plot the training (dashed) and validation curves, one subplot per metric.

    Args:
        metrics: Sequence of metric names such as ``['loss', 'accuracy']``.
            Each name must be a key of ``history.history``.
        history: Object exposing a ``history`` dict that maps metric names to
            per-epoch values (e.g. the value returned by ``model.fit``).

    The matching ``'val_' + metric`` curve is drawn only when it is present,
    so histories recorded without validation data can be plotted too.
    """
    if not metrics:
        # Nothing to draw; a figure with zero columns cannot be created.
        return
    # squeeze=False always yields a 2-D array of axes. With the default, a
    # single metric returns a bare Axes object that cannot be indexed.
    f, axes = plt.subplots(1, len(metrics), figsize=(5 * len(metrics), 5), squeeze=False)
    for ax, metric in zip(axes[0], metrics):
        ax.plot(history.history[metric], ls='dashed')
        legend_labels = [metric]
        val_metric = 'val_' + metric
        if val_metric in history.history:
            ax.plot(history.history[val_metric])
            legend_labels.append(val_metric)
        ax.set_xlabel("Epochs")
        ax.set_ylabel(metric)
        ax.legend(legend_labels)

In [None]:
# Plot the loss and accuracy curves recorded in `history`.
training_plot(['loss', 'accuracy'], history)

## Some thoughts

Obviously, it needs some improvement. However, the objective was to create a basic example of how to train an embedding layer that can be reused later.