# Final Assignment 

## Instructions
* An extra 20 points for those who implement ALL the functions themselves, without using TensorFlow's tf.keras.preprocessing and TextVectorization.
* Use pyarrow to save the embedding map.
* The "find most similar" function should be efficient. Waiting 20 seconds for a result is too long; aim for under 2 seconds (use the "timeit" magic to verify).
* Output cells should not be too big. DO NOT DUMP A LOT OF DATA IN THE OUTPUT. NOTEBOOKS THAT DO NOT FOLLOW THIS INSTRUCTION WILL NOT BE CHECKED AND THEIR GRADE WILL BE SET TO ZERO.

### Read dataset 

In [1]:
import zipfile
import pandas as pd

# Location of the zipped review dataset (a relative path).
zip_path = 'data/review_230k.zip'

def extract_parquet_from_zip(zip_path):
    """
    Load the first ``.parquet`` member of a zip archive into a DataFrame.

    The names of all archive members are printed before loading.
    param: zip_path: path (or binary file-like object) of the zip archive
    returns: the DataFrame read from the first member whose name ends with '.parquet'
    raises: FileNotFoundError: if the archive contains no '.parquet' member
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        file_list = zip_ref.namelist()
        print(f"Files in zip: {file_list}")

        # Take the first match lazily; None means "nothing found", which lets
        # us raise a descriptive error instead of a bare IndexError.
        parquet_file_name = next(
            (name for name in file_list if name.endswith('.parquet')), None
        )
        if parquet_file_name is None:
            raise FileNotFoundError(
                f"No .parquet file found in zip archive: {zip_path!r}"
            )

        with zip_ref.open(parquet_file_name) as file:
            return pd.read_parquet(file)

# Load the reviews into a DataFrame once; the next cells build on `data`.
data = extract_parquet_from_zip(zip_path)



Files in zip: ['review_230k.parquet']


In [2]:
def create_dataset(data: pd.DataFrame) -> pd.DataFrame:
    """
    Create the dataset in your preferable format.

    Currently a pass-through: the input is returned unchanged (the same
    object, not a copy).
    param: data: the DataFrame to turn into a dataset
    returns: the object that was passed in as ``data``
    """
    return data

In [3]:
dataset = create_dataset(data)

# Show the dataset; the captured output below is abbreviated with "..." rows.
print(dataset)

                                                title  \
0             Truly is "Jewel of the Upper Wets Side"   
1                             My home away from home!   
2                                          Great Stay   
3                                  Modern Convenience   
4       Its the best of the Andaz Brand in the US....   
...                                               ...   
230334                        Treated us like royalty   
230335        Fine time but some room for improvement   
230336                Great venue for corporate event   
230337                             Almost in the Loop   
230338                                    Comfortable   

                                                     text  
0       Stayed in a king suite for 11 nights and yes i...  
1       On every visit to NYC, the Hotel Beacon is the...  
2       This is a great property in Midtown. We two di...  
3       The Andaz is a nice hotel in a central locatio...  
4       I have 

In [4]:
# NOTE(review): despite its name, this holds the *text* of the row at
# position 1 (column 'text'), and no visible cell reads it afterwards.
first_title_value = dataset['text'].iloc[1]
# Peek at one title and one full review text to see what the raw data looks like.
print(dataset['title'].iloc[2])
print(dataset['text'].iloc[0])

Great Stay
Stayed in a king suite for 11 nights and yes it cots us a bit but we were happy with the standard of room, the location and the friendliness of the staff. Our room was on the 20th floor overlooking Broadway and the madhouse of the Fairway Market. Room was quite with no noise evident from the hallway or adjoining rooms. It was great to be able to open windows when we craved fresh rather than heated air. The beds, including the fold out sofa bed, were comfortable and the rooms were cleaned well. Wi-fi access worked like a dream with only one connectivity issue on our first night and this was promptly responded to with a call from the service provider to ensure that all was well. The location close to the 72nd Street subway station is great and the complimentary umbrellas on the drizzly days were greatly appreciated. It is fabulous to have the kitchen with cooking facilities and the access to a whole range of fresh foods directly across the road at Fairway.
This is the second t

### Clean and standardize the data

In [5]:

import string

def prepare_dataset(dataset):
    """
    Clean and tokenize the review texts before word encoding.

    Each text is lower-cased and split into sentences on '.'; punctuation and
    digits are removed and every sentence is split into words on whitespace.
    Sentences that contain no words after cleaning are dropped.

    Side effects (kept for backward compatibility): missing values in
    dataset['text'] are replaced by '' and a 'tokenized_sentences' column
    holding each row's list of tokenized sentences is added to ``dataset``.

    param: dataset: DataFrame with a 'text' column of strings
    returns: a flat list of sentences over all rows, each sentence being a
             list of word strings
    """
    # Build the translation table once instead of once per sentence.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)

    def clean_text(text):
        sentence_word_arrays = []
        for sentence in text.lower().split('.'):
            words = sentence.translate(strip_table).split()
            # Fragments such as " " or "!!" clean down to nothing; skip them
            # so callers never receive empty sentences.
            if words:
                sentence_word_arrays.append(words)
        return sentence_word_arrays

    dataset['text'] = dataset['text'].fillna('')
    dataset['tokenized_sentences'] = dataset['text'].apply(clean_text)

    # Flatten the per-row sentence lists into one corpus-wide list.
    return [
        sentence
        for sentence_list in dataset['tokenized_sentences']
        for sentence in sentence_list
    ]

all_tokenized_sentences = prepare_dataset(dataset)

# Preview only the first 10 tokenized sentences to keep the output cell small.
for sentence in all_tokenized_sentences[:10]:
    print(sentence) 


['stayed', 'in', 'a', 'king', 'suite', 'for', 'nights', 'and', 'yes', 'it', 'cots', 'us', 'a', 'bit', 'but', 'we', 'were', 'happy', 'with', 'the', 'standard', 'of', 'room', 'the', 'location', 'and', 'the', 'friendliness', 'of', 'the', 'staff']
['our', 'room', 'was', 'on', 'the', 'th', 'floor', 'overlooking', 'broadway', 'and', 'the', 'madhouse', 'of', 'the', 'fairway', 'market']
['room', 'was', 'quite', 'with', 'no', 'noise', 'evident', 'from', 'the', 'hallway', 'or', 'adjoining', 'rooms']
['it', 'was', 'great', 'to', 'be', 'able', 'to', 'open', 'windows', 'when', 'we', 'craved', 'fresh', 'rather', 'than', 'heated', 'air']
['the', 'beds', 'including', 'the', 'fold', 'out', 'sofa', 'bed', 'were', 'comfortable', 'and', 'the', 'rooms', 'were', 'cleaned', 'well']
['wifi', 'access', 'worked', 'like', 'a', 'dream', 'with', 'only', 'one', 'connectivity', 'issue', 'on', 'our', 'first', 'night', 'and', 'this', 'was', 'promptly', 'responded', 'to', 'with', 'a', 'call', 'from', 'the', 'service', 

### Create A vocabulary 

In [6]:
from collections import Counter
# Count every token of the corpus in one pass.
word_count_map = Counter(
    word for sentence in all_tokenized_sentences for word in sentence
)
# Ids follow first-seen order (iterating a Counter yields keys in insertion order).
# NOTE(review): ids are therefore not frequency ranks - confirm that is
# acceptable before using rank-based sampling helpers with this vocabulary.
vocabulary = {word: idx for idx, word in enumerate(word_count_map)}


### Word encoding 

In [7]:
def encode_dataset(dataset, vocabulary: dict[str, int]):
    """
    Map every word of every sentence to its integer id.

    Words that are missing from the vocabulary are skipped, so an encoded
    sentence can be shorter than its source sentence.
    param: dataset: iterable of sentences, each an iterable of word strings
    param: vocabulary: mapping from word to integer id
    returns: list with one encoded sentence (list of ids) per input sentence
    """
    return [
        [vocabulary[token] for token in words if token in vocabulary]
        for words in dataset
    ]

# Encode the whole tokenized corpus with the vocabulary built above.
encoded_dataset = encode_dataset(all_tokenized_sentences, vocabulary)

# Preview only the first 10 encoded sentences to keep the output cell small.
for sentence in encoded_dataset[:10]:
    print(sentence)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 18, 22, 7, 18, 23, 20, 18, 24]
[25, 21, 26, 27, 18, 28, 29, 30, 31, 7, 18, 32, 20, 18, 33, 34]
[21, 26, 35, 17, 36, 37, 38, 39, 18, 40, 41, 42, 43]
[9, 26, 44, 45, 46, 47, 45, 48, 49, 50, 14, 51, 52, 53, 54, 55, 56]
[18, 57, 58, 18, 59, 60, 61, 62, 15, 63, 7, 18, 43, 15, 64, 65]
[66, 67, 68, 69, 2, 70, 17, 71, 72, 73, 74, 27, 25, 75, 76, 7, 77, 26, 78, 79, 45, 17, 2, 80, 39, 18, 81, 82, 45, 83, 84, 85, 26, 65]
[18, 22, 86, 45, 18, 87, 88, 89, 90, 91, 44, 7, 18, 92, 93, 27, 18, 94, 95, 15, 96, 97]
[9, 91, 98, 45, 99, 18, 100, 17, 101, 102, 7, 18, 67, 45, 2, 103, 104, 20, 52, 105, 106, 107, 18, 108, 109, 33]
[77, 91, 18, 110, 111, 84, 112, 20, 18, 113, 99, 0, 109, 18, 114, 7, 9, 115, 116, 46, 25, 117, 20, 118, 5, 119, 120]
[27, 121, 122, 45, 123, 18, 117, 114, 91, 18, 124, 14, 125, 45, 126]


Example using TensorFlow

In [8]:

from tensorflow.keras import layers

# Template only: `create_vocabulary`, `text_ds` and `AUTOTUNE` are not defined in
# the cells shown here, and the `...` placeholders must be filled in before running.
# You can also build the vocabulary with the layer's "adapt" method instead of passing one in.
# NOTE(review): this rebinds `vocabulary`, which the custom vocabulary cell set to a word -> id dict.
vocabulary = create_vocabulary(dataset) 

vectorize_layer = layers.TextVectorization(
    max_tokens=...,
    output_mode='int',
    output_sequence_length=...,
    vocabulary=vocabulary
)

# The vocabulary read back from the layer adds the padding and [UNK] tokens.
vocabulary = vectorize_layer.get_vocabulary()

# Vectorize in batches of 1024, then unbatch back to single examples.
encoded_text_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()
# NOTE(review): this rebinds `encoded_dataset` from the custom encoding cell.
encoded_dataset = encoded_text_ds.as_numpy_iterator()


NameError: name 'create_vocabulary' is not defined

### Generate Positive and Negative Pairs 

In [None]:
def generate_training_data(encoded_dataset, number_of_neg_samples: int,
                           window_size: int = 2, vocabulary_size=None, seed=None):
    """
    Generate positive and negative pairs (skip-gram with negative sampling)

    For every word (the target) of every sentence, each word at most
    ``window_size`` positions away forms a positive (target, context) pair.
    Each positive pair is extended with ``number_of_neg_samples`` distinct
    word ids drawn uniformly at random from the vocabulary, never equal to
    the positive context word.

    param: encoded_dataset:  the encoded dataset (iterable of sequences of word ids)
    param: number_of_neg_samples: Number of negative samples per positive pair
    param: window_size: number of neighbours on each side that count as context
    param: vocabulary_size: negatives are drawn from ids 0..vocabulary_size-1;
           defaults to the largest id found in the dataset plus one
    param: seed: optional seed that makes the negative sampling reproducible
    returns: (targets, contexts, labels) - three parallel lists with one entry
             per positive pair: the target id, the list
             [positive context, negative_1, ..., negative_k], and the label
             list [1, 0, ..., 0] of the same length
    raises: ValueError: on a negative sample count, a window smaller than 1,
            or a vocabulary too small to supply the requested negatives
    """
    import random

    if number_of_neg_samples < 0:
        raise ValueError("number_of_neg_samples must be non-negative")
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # Materialize once so generators / iterators can be walked twice.
    sequences = [list(sequence) for sequence in encoded_dataset]
    if vocabulary_size is None:
        vocabulary_size = 1 + max(
            (token for sequence in sequences for token in sequence), default=-1
        )

    # Only complain about the vocabulary when negatives will really be drawn.
    has_pairs = any(len(sequence) > 1 for sequence in sequences)
    if has_pairs and number_of_neg_samples > max(vocabulary_size - 1, 0):
        raise ValueError(
            "vocabulary_size is too small to draw the requested number of "
            "distinct negative samples"
        )

    rng = random.Random(seed)
    targets, contexts, labels = [], [], []

    for sequence in sequences:
        length = len(sequence)
        for position, target_word in enumerate(sequence):
            start = max(0, position - window_size)
            stop = min(length, position + window_size + 1)
            for context_position in range(start, stop):
                if context_position == position:
                    continue
                context_word = sequence[context_position]

                # Rejection sampling: keep drawing until we hold enough ids
                # that are distinct and differ from the true context word.
                taken = {context_word}
                negatives = []
                while len(negatives) < number_of_neg_samples:
                    candidate = rng.randrange(vocabulary_size)
                    if candidate not in taken:
                        taken.add(candidate)
                        negatives.append(candidate)

                targets.append(target_word)
                contexts.append([context_word] + negatives)
                labels.append([1] + [0] * number_of_neg_samples)

    return targets, contexts, labels

Example using TensorFlow

In [None]:
# Make sure you understand what this does before you use it.
# Template only: `tf`, `sequence`, `VOCABULARY_SIZE`, `NUMBER_OF_NEGATIVE_SAMPLES`,
# `targets`, `contexts` and `labels` are not defined in the cells shown here.
# NOTE(review): the Keras docs describe the sampling table as indexed by word
# frequency rank - confirm the vocabulary ids are ordered that way before relying on it.
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(len(vocabulary))


# Generate positive skip-gram pairs for a sequence
# (negative_samples=0, so no negative pairs are requested here).
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      sequence,
      vocabulary_size=VOCABULARY_SIZE,
      sampling_table=sampling_table,
      window_size=...,
      negative_samples=0)


# Iterate over each positive skip-gram pair to produce training examples
# with a positive context word and negative samples.
for target_word, context_word in positive_skip_grams:
    
    # The positive context word is passed to the sampler as the true class.
    context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)
    negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
      true_classes=context_class,
      num_true=1,
      num_sampled=NUMBER_OF_NEGATIVE_SAMPLES,
      unique=True,
      range_max=VOCABULARY_SIZE,
      seed=123,
      name="negative_sampling")

    # Build context and label vectors (for one target word):
    # the positive context comes first, followed by the negative candidates.
    context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
    label = tf.constant([1] + [0] * NUMBER_OF_NEGATIVE_SAMPLES, dtype="int64")

    # Append each element from the training example to global lists.
    targets.append(target_word)
    contexts.append(context)
    labels.append(label)



### Define the model 

In [None]:
# Define your model here

### Training

Example only; there are other ways to train, such as a custom loop using tf.GradientTape.

In [None]:
# Template only: `model`, `tf`, `WINDOW_SIZE` and `NEGATIVE_SAMPLES` are not defined
# in the cells shown here, and the `...` placeholders must be filled in before running.
model.compile(optimizer=..., loss=..., metrics=[...])

# Write training logs for TensorBoard into the "model_logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="model_logs")

# Checkpoint the model, keeping only the best one according to `monitor`
# (still a placeholder here - e.g. the validation loss).
# NOTE(review): this file name hard-codes "w5_ns15" while the final save below
# derives its name from WINDOW_SIZE / NEGATIVE_SAMPLES - keep the two in sync.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(filepath="word2vec_model_w5_ns15_ckpt.h5", monitor=..., save_best_only=True)

# restore_best_weights - Whether to restore model weights from
# the epoch with the best value of the monitored quantity.
# Early stopping monitors 'accuracy' with a patience of 10 epochs.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor='accuracy')

# NOTE(review): in the earlier cells `dataset` is the reviews DataFrame - confirm
# that the training dataset is what gets passed here.
history = model.fit(dataset, epochs=..., callbacks=[tensorboard_callback, checkpoint_cb, early_stopping_cb])

# NOTE(review): the sampling example names this constant NUMBER_OF_NEGATIVE_SAMPLES,
# not NEGATIVE_SAMPLES - pick one name.
model.save(f"word2vec_model_w{WINDOW_SIZE}_ns{NEGATIVE_SAMPLES}.h5", include_optimizer=True)

### Save embeddings 

In [None]:

def save_embeddings(model, vocabulary: list[str], path: str):
    """ 
    Save the embeddings 
    param: model: the trained tf model 
    param: vocabulary: list of tokens 
    param: path: the path to save the embeddings map 
    raises: NotImplementedError: always, until the saving logic is written

    hint: take the weights using model.get_layer(...).get_weights()
    """
    # Built-in generics are used in the signature because `typing.List` is not
    # imported in the visible cells, so the `def` itself raised NameError.
    # Fail loudly rather than return without having saved anything.
    raise NotImplementedError("save_embeddings is not implemented yet")

### Find Most similar

Note that this must be efficient 

In [None]:
def find_most_similar(word: str, k: int = 10) -> list[tuple[str, float]]:
    """ 
    Find most similar tokens to word 
    param: word: the word to find most similar words to 
    param: k: number of most similar words 
    returns: once implemented, a list of (token, similarity score) tuples
    raises: NotImplementedError: always, until the lookup is written
    """
    # Built-in generics are used in the signature because `typing.List` and
    # `typing.Tuple` are not imported in the visible cells, so the `def`
    # itself raised NameError.
    # Fail loudly rather than return None where a list is promised.
    raise NotImplementedError("find_most_similar is not implemented yet")

    

### Dimensionality Reduction

Visualize some clusters (pick a subset of words to show labels for).