# Project 4
## Students:
 > Steven Koprowicz, 
 > Matthew Walters 

In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import re

In [14]:
print(tf.__version__)# you may want to upgrade to 2.10.0 

2.12.0


## Task 1

In [183]:
class TransformerModel():
    def __init__(self, vocab_size, embed_dim=256, num_heads=2, num_blocks=1, ff_dim=256, maxlen=64, rate=0.1):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        self.ff_dim = ff_dim
        self.maxlen = maxlen
        self.rate = rate

    def TransformerBlock(self, inputs):
        #create the transformer block as discribed in the writeup, 
        # use the Keras functional API (https://keras.io/guides/functional_api/)
        # Use the rate variable for the dropout layers
        
        # MultiHeadAttention layer, 
        # specifiy 'use_causal_mask=True' (https://keras.io/api/layers/attention_layers/multi_head_attention/)
        mha = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.embed_dim, use_causal_mask=True)
         
        # drop-out layer
        first_dropout = layers.Dropout(self.rate)(mha)
        
        # sum inputs and the output of this drop-out layer
        first_sum = layers.Add()([inputs, first_dropout])

        # LayerNormalization layer, 
        # specifiy 'epsilon=1e-6' (https://keras.io/api/layers/normalization_layers/layer_normalization/)
        first_ln = layers.LayerNormalization(epsilon=1e-6)(first_sum)
        
        # dense
        first_dense = layers.Dense(self.ff_dim, activation="relu")(first_ln)

        # dense
        second_dense = layers.Dense(self.ff_dim, activation="relu")(first_dense)

        # drop-out layer
        second_dropout = layers.Dropout(self.rate)(second_dense)

        # sum the output of the first Layer Normalization layer and this drop-out layer
        second_sum = layers.Add()([first_ln, second_dropout])

        # LayerNormalization again
        second_ln = layers.LayerNormalization(epsilon=1e-6)(second_sum)

        output_dense = layers.Dense(self.ff_dim, activation="softmax")(second_ln)

        return output_dense
    
    def EmbeddingLayer(self, inputs):
        #create the embedding layer
        #create (1) an embedding for the tokens and (2) an embedding for the positions
        #you can use https://keras.io/api/layers/core_layers/embedding/ Embedding class
        #you can use tf.range to encode positions
        positions = tf.range(len(inputs))
        tokens = layers.Embedding(inputs, self.embed_dim)
        posits = layers.Embedding(positions, self.embed_dim)
        
        #add (1) and (2) and return the layer
        embedding_output = layers.Add()([tokens, posits])
        return embedding_output
    
    def create_model(self):
        #combine the EmbeddingLayer and num_blocks TransformerBlocks to create the model, 
        # use the Keras functional API (https://keras.io/guides/functional_api/)
        #use the SparseCategoricalCrossentropy loss function
        # (https://keras.io/api/losses/probabilistic_losses/#sparsecategoricalcrossentropy-class)
        inputs = keras.Input(shape=(784,))
        embedLayer = self.EmbeddingLayer(inputs)
        xBlock = self.TransformerBlock(embedLayer)
        for i in range(self.num_blocks-1):
            xBlock = self.TransformerBlock(xBlock)
        self.model = xBlock
        self.model.compile(loss=keras.losses.SparseCategoricalCrossentropy(), optimizer='adam')
        return self.model

## Task 2

In [180]:
class DataSet():
    def __init__(self, filename, len):
        #load the text from the file
        self.filename=filename
        self.len = len
        with open(filename, "r") as f:
            self.data = f.readlines()
        
    def prep_text(self):
        #remove all punctuation, set to lowercase, remove duplicate spaces and other whitespace (keep newlines)
        self.data = [re.sub("--"," ",line) for line in self.data]
        self.data = [re.sub("\n"," \n$",line) for line in self.data]
        self.data = [re.sub("[^A-Za-z0-9\\s]","",line) for line in self.data]
        self.data = [re.sub("((^|(?<=[^\\S\n]))[^\\S\n]++(?!$)|[^\\S\n]+(?= \\n?$))","",line) for line in self.data]
        self.data = [re.sub("\\s++$"," \n",line) for line in self.data if (re.search("[A-Za-z0-9]",line) is not None)]
        return self.data
        
    def tokenize_text(self):
        #seperate into words, create a vocab and convert the text to a list of numbers using the vocab such that each unique word is represented by its own number number
        vocab_lines = [re.split(" +",line) for line in self.data]
        self.words = []
        for vl in vocab_lines:
            self.words+=vl
        vocab_list = np.unique(self.words)
        self.vocabulary = {word:number for (word,number) in zip(vocab_list,range(len(vocab_list)))}
        self.integers = [self.vocabulary[word] for word in self.words]
        return self.integers

    def create_dataset(self):
        #split the tokenized data into sequences of length len, return the sequences and vocab
        self.prep_text()
        self.tokenize_text()
        self.x = []
        self.y = []
        if self.len==0 or self.len>len(self.integers):
           return (self.x, self.y, self.vocabulary)
        # this loop makes (#tokens - len + 1) sequences (1 is removed because of the offset)
        for i in range(len(self.integers)-self.len+1):
            self.x.append(self.integers[i:(i+self.len)])
        self.y = self.x[1:]
        self.x = self.x[:-1]
        # this loop makes floor(#tokens / len) sequences
        # for i in range((len(self.integers)-1)//self.len):
        #     self.x.append(self.integers[(self.len*i):(self.len*(i+1))])
        #     self.y.append(self.integers[(self.len*i+1):(self.len*(i+1)+1)])
        return (self.x, self.y, self.vocabulary)


## Task 3

In [181]:
class GenerateText():
    def __init__(self, model, vocab):
        self.model = model
        # vocab is a dictionary and thus already is our map from integers to human words
        self.words_to_ints = vocab
        self.ints_to_words = {v: k for k, v in vocab.items()}
        self.vocabulary_list = vocab.keys()

    def generate_text(self, start_string, num_generate=100):
        #generate text using the model and vocab, start with the start_string and generate num_generate words
        text_out = []
        text_out.append(start_string)
        start_string = start_string
        start_string = re.sub("--"," ",start_string)
        start_string = re.sub("\n"," \n ",start_string)
        start_string = re.sub("[^A-Za-z0-9\\s]","",start_string)
        start_string = re.sub("[^\\S\n]++"," ",start_string)
        start_string = re.sub("([^\\S\n]++$|^[^\\S\n]++)","",start_string)
        output_string = start_string
        
        start_text_seq = re.split("[^\\S\n]+",start_string)
        output_int_seq = [self.words_to_ints[word] if word in self.vocabulary_list else -1 for word in start_text_seq]
        
        for i in range(num_generate):
            next_int_word = self.model.predict(output_int_seq)
            output_string = output_string + " " + self.ints_to_words[next_int_word]

        return(output_string)

    def generate_random_text(self, num_generate=100):
        text_out = []
        starter = self.vocabulary_list[np.random.randint(low=0,high=len(self.vocabulary_list))]
        text_out = generate_text(starter, num_generate)
        return text_out

## Task 4: Model Traning and Testing

In [184]:
#Train the model while periodically generating text to show progress
def train_model(model, vocab, x, y, epochs=50):
    return model    

ds = DataSet("beatles.txt",13)
x,y,vocab_map = ds.create_dataset()
tmodel_object = TransformerModel(len(vocab_map.keys()))
model = tmodel_object.create_model()

print(model)

generator  = GenerateText(model,vocab_map)
generator.generate_text("hungry eyes")

TypeError: object of type 'int' has no len()


# Report

## Introduction
##### In this project, we set out to construct a transformer model to generate text. The model is trained on the lyrics of Beatles songs. The song lyrics have been stripped of all punctuation, but keep individual spaces between words, and newlines are kept in the text so that the model might generate newlines in the text that it creates. In the end, the model should be able to generate rudimentary lyrics of fictitious Beatles music.

## Results

## Conclusion

## How to Run Code

We used only the libraries provided at the top of this file, and our tf version is 
2.11.0