In [None]:
# Imports used to load and inspect the Shakespeare dataset
import numpy as np
import pandas as pd


In [6]:
# Load the training sequences. np.load on an .npz archive returns a lazy NpzFile
# that keeps a file handle open, so the arrays are read inside a `with` block to
# guarantee the archive is closed afterwards (`data` is closed after this block).
with np.load("shakespeare_training_sequences.npz") as data:
    X_train = data["X_train"]
    y_train = data["y_train"]

# Reconstruct the training sentences from token ids using the JSON vocabulary.
# NOTE(review): the lookups below rely on pandas turning the JSON object keys
# into integer index labels (read_json's default axis conversion) - confirm.
id_to_word = pd.read_json("shakespeare_id_to_word.json", typ="series").to_dict()

# Id 0 is skipped when rebuilding the inputs (presumably the padding id - TODO confirm).
X_words = [
    " ".join(id_to_word[token_id] for token_id in seq if token_id != 0)
    for seq in X_train
]
y_words = [id_to_word[token_id] for token_id in y_train]

# One row per training example: the reconstructed input text and its target word.
df = pd.DataFrame({"input": X_words, "target": y_words})

In [None]:
# Preview only the first rows instead of rendering the whole frame,
# matching the .head() previews used by the other cells in this notebook.
df.head()

Unnamed: 0,input,target
0,<SOS>,the
1,<SOS> the,complete
2,<SOS> the complete,works
3,<SOS> the complete works,of
4,<SOS> the complete works of,william


In [14]:
# Keep only the rows whose target is the <EOS> marker and report their share.
df_eos = df.loc[df["target"].eq("<EOS>")]
print(f"{len(df_eos)} of {len(df)} rows ({len(df_eos) / len(df) * 100:.2f}%) have <EOS> as target")
df_eos.head()

142769 of 1100120 rows (12.98%) have <EOS> as target


Unnamed: 0,input,target
6,<SOS> the complete works of william shakespeare,<EOS>
10,<SOS> by william shakespeare,<EOS>
12,<SOS> content,<EOS>
15,<SOS> the sonnets,<EOS>
21,<SOS> all well that end well,<EOS>


In [15]:
# Keep only the rows whose input text starts with the <SOS> marker and report their share.
df_sos = df.loc[df["input"].str.startswith("<SOS>")]
print(f"{len(df_sos)} of {len(df)} rows ({len(df_sos) / len(df) * 100:.2f}%) start with <SOS>")
df_sos.head()

1100120 of 1100120 rows (100.00%) start with <SOS>


Unnamed: 0,input,target
0,<SOS>,the
1,<SOS> the,complete
2,<SOS> the complete,works
3,<SOS> the complete works,of
4,<SOS> the complete works of,william


In [18]:
# Count the rows made up of nothing but <SOS> followed by <EOS>,
# i.e. the input is exactly "<SOS>" and the target is "<EOS>" (length of 2).
df_sos_eos = df.loc[df["input"].eq("<SOS>") & df["target"].eq("<EOS>")]
print(f"{len(df_sos_eos)} of {len(df)} rows ({len(df_sos_eos) / len(df) * 100:.2f}%) have only <SOS> and <EOS>")
df_sos_eos.head()

161 of 1100120 rows (0.01%) have only <SOS> and <EOS>


Unnamed: 0,input,target
270,<SOS>,<EOS>
389,<SOS>,<EOS>
518,<SOS>,<EOS>
649,<SOS>,<EOS>
765,<SOS>,<EOS>
