# Set up

In [3]:
!git clone https://github.com/NLP-Reichman/assignment_1.git
!mv assignment_1/data data
!rm assignment_1/ -r

Cloning into 'assignment_1'...
remote: Enumerating objects: 150, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 150 (delta 35), reused 22 (delta 22), pack-reused 106[K
Receiving objects: 100% (150/150), 6.79 MiB | 13.54 MiB/s, done.
Resolving deltas: 100% (64/64), done.


# Introduction
In this assignment you will be creating tools for learning and testing language models. The corpora that you will be working with are lists of tweets in 8 different languages that use the Latin script. The data is provided either formatted as CSV or as JSON, for your convenience. The end goal is to write a set of tools that can detect the language of a given tweet.
The relevant files are under the data folder:

- en.csv (or the equivalent JSON file)
- es.csv (or the equivalent JSON file)
- fr.csv (or the equivalent JSON file)
- in.csv (or the equivalent JSON file)
- it.csv (or the equivalent JSON file)
- nl.csv (or the equivalent JSON file)
- pt.csv (or the equivalent JSON file)
- tl.csv (or the equivalent JSON file)

In [4]:
import json
from google.colab import files
import pandas as pd

# Implementation

## Part 1
Implement the function *preprocess* that iterates over all the data files and creates a single vocabulary, containing all the tokens in the data. Our token definition is a single UTF-8 encoded character. So, the vocabulary list is a simple Python list of all the characters that you see at least once in the data.

Note - do NOT lowercase the sentences in whi HW.

In [5]:
def preprocess() -> list[str]:
  '''
  Return a list of characters, representing the shared vocabulary of all languages
  '''

  import glob

  csv_files = glob.glob('data/*.csv')

  # Initialize an empty DataFrame to contain all the data
  all_csv_df = pd.DataFrame()

  # concat all *.csv
  for filename in csv_files:
    #print(f" filename = {filename}\n")
    df = pd.read_csv(filename)
    all_csv_df = pd.concat([all_csv_df, df], ignore_index=True)

  # get all the tweets
  all_tweets = all_csv_df['tweet_text'].values

  # convert the tweets from strings into a joint list of characters
  all_chars = []
  for t in range(len(all_tweets)):
    tweet = list(all_tweets[t])
    for letter in tweet:
      all_chars.append(letter)

  # add <start> & <end> keys
  s_and_e = ['<s>', '<e>']
  for t in range(len(s_and_e)):
    all_chars.append(s_and_e[t])

  # get the unique
  vocabulary = set(all_chars)

  return set(vocabulary)

In [6]:
len(preprocess())

1804

## Part 2
Implement the function *lm* that generates a language model from a textual corpus. The function should return a dictionary (representing a model) where the keys are all the relevant *n*-1 sequences, and the values are dictionaries with the *n*_th tokens and their corresponding probabilities to occur. For example, for a trigram model (tokens are characters), it should look something like:

{ "ab":{"c":0.5, "b":0.25, "d":0.25}, "ca":{"a":0.2, "b":0.7, "d":0.1} }

which means for example that after the sequence "ab", there is a 0.5 chance that "c" will appear, 0.25 for "b" to appear and 0.25 for "d" to appear.

Note - You should think how to add the add_one smoothing information to the dictionary and implement it.

In [7]:
def lm(lang: str, n: int, smoothed: bool = False) -> dict[str, dict[str, float]]:
    '''
    Return a language model for the given lang and n-gram (n), with an option for smoothing.
    :param lang: the language of the model
    :param n: the n_gram value
    :param smoothed: boolean indicating whether to apply smoothing
    :return: a dictionary where the keys are n-1 grams and the values are dictionaries
    '''
    from collections import Counter, defaultdict

    # get all tweets of the chosen language
    language = lang
    # print(f" lm language = {language}\n")
    file_path_and_name = ''.join(['data/', language, '.csv'])
    df = pd.read_csv(file_path_and_name)
    tweets = df['tweet_text'].values
    V = len(preprocess()) # vocabulary length

    # convert the tweets from strings into corpus
    corpus = []
    for t in range(len(tweets)):
      # pad every tweet with n-1 start tokens and 1 end token
      for s in range(n):
        corpus.append('<s>')

      tweet = list(tweets[t])
      for letter in tweet:
        corpus.append(letter)

      corpus.append('<e>')

    # Count the frequency of each and every n-gram in the corpus
    n_gram_counter = defaultdict(Counter) # init counters

    for c in range(n-1, len(corpus)):
        context = ''.join(corpus[c-n+1:c])
        token = corpus[c]
        n_gram_counter[context][token] += 1

    probs = defaultdict(Counter)
    for context, tokens in n_gram_counter.items():
      context_counter = sum(tokens.values())

      for token, count in tokens.items():
        if smoothed == False:
          probs[context][token] = count / context_counter
        else: # perform smoothing
          probs[context][token] = (count + 1) / (context_counter + V)

    # create a list of the unique dictionary keys taken from the corpus.
    # Note: the <e> key must not be included in the output dictionary
    all_context = []
    for c in range(n-1, len(corpus)):
      context_list = corpus[c-n+1:c]
      end_token_in_context = sum([int(char == '<e>') for char in context_list]) > 0

      if end_token_in_context == False:
        all_context.append(''.join(context_list))

    context_keys = list(set(all_context))

    # create the output dictionary
    n_gram_dict = {}
    for k in range(len(context_keys)):
      n_gram_dict.update({context_keys[k]: dict(probs[context_keys[k]])})

    # add the unknown token
    if smoothed == True:
      n_gram_dict.update({'<unk>': 1/V})

    return dict(n_gram_dict)

In [8]:
{
        'english_2_gram_length': len(lm('en', 2, True)),
        'english_3_gram_length': len(lm('en', 3, True)),
        'french_3_gram_length': len(lm('fr', 3, True)),
        'spanish_3_gram_length': len(lm('es', 3, True)),
    }

{'english_2_gram_length': 748,
 'english_3_gram_length': 8239,
 'french_3_gram_length': 8286,
 'spanish_3_gram_length': 8469}

## Part 3
Implement the function *eval* that returns the perplexity of a model (dictionary) running over the data file of the given target language.

In [9]:
def eval(model: dict, target_lang: str, n: int) -> float:
  '''
  Return the perplexity value calculated over applying the model on the text file
  of the target_lang language.
  :param model: the language model
  :param target_lang: the target language
  :param n: The n-gram of the model
  :return: the perplexity value
  '''
  import numpy as np

  # Get all tweets of the chosen language
  language = target_lang
  # print(f" target language = {language}\n")
  file_path_and_name = ''.join(['data/', language, '.csv'])
  df = pd.read_csv(file_path_and_name)
  tweets = df['tweet_text'].values

  # convert the tweets from strings into corpus
  corpus = []
  for t in range(len(tweets)):
    # Pad every tweet with n-1 start tokens and 1 end token
    for s in range(n):
      corpus.append('<s>')

    tweet = list(tweets[t])
    for letter in tweet:
      corpus.append(letter)

    corpus.append('<e>')

  # number of unique elements in the corpus
  V = len(preprocess()) # vocabulary length
  prob_unknwn = 1/V

  # Initialize variables
  sum_log_prob = 0.0
  n_tokens = len(corpus)

  # Calculate cross-entropy
  for c in range(n-1, n_tokens):
      context = ''.join(corpus[c-n+1:c])
      token = corpus[c]
      # extract probability and add up its log2 value
      # if it's an unknown token then use default probability (prob_unknwn)
      try:
        prob = model[context][token]
      except:
        prob = prob_unknwn

      sum_log_prob -= np.log2(prob)

  # Compute perplexity
  cross_entropy = sum_log_prob / n_tokens
  perplexity = np.power(2, cross_entropy)

  return perplexity

In [10]:
{
        'en_en': eval(lm('en', 3, True), 'en', 3),
        'en_fr': eval(lm('en', 3, True), 'fr', 3),
        'en_tl': eval(lm('en', 3, True), 'tl', 3),
        'en_nl': eval(lm('en', 3, True), 'nl', 3),
    }

{'en_en': 28.578718604332522,
 'en_fr': 59.52748418449823,
 'en_tl': 72.93649027000441,
 'en_nl': 63.101789298831456}

## Part 4
Implement the *match* function that calls *eval* using a specific value of *n* for every possible language pair among the languages we have data for. You should call *eval* for every language pair four times, with each call assign a different value for *n* (1-4). Each language pair is composed of the source language and the target language. Before you make the call, you need to call the *lm* function to create the language model for the source language. Then you can call *eval* with the language model and the target language. The function should return a pandas DataFrame with the following four columns: *source_lang*, *target_lang*, *n*, *perplexity*. The values for the first two columns are the two-letter language codes. The value for *n* is the *n* you use for generating the specific perplexity values which you should store in the forth column.

In [11]:
def match() -> pd.DataFrame:
  '''
  Return a DataFrame containing one line per every language pair and n_gram.
  Each line will contain the perplexity calculated when applying the language model
  of the source language on the text of the target language.
  :return: a DataFrame containing the perplexity values
  '''
  n_gram_max = 4
  languages = ['en', 'es', 'fr', 'in', 'it', 'nl', 'pt', 'tl']
  n_languages = len(languages)
  match_df = pd.DataFrame(columns=['source', 'target', 'n', 'perplexity'])
  counter = 0

  for n_gram in range(1, n_gram_max+1):
    print(f"match for n-gram = {n_gram}\n")
    for source in languages:
      language_model = lm(source, n_gram)

      for target in languages:
        perplexity = eval(language_model, target, n_gram)
        match_df.loc[counter] = [source, target, n_gram, perplexity]
        counter += 1

  return match_df

In [67]:
match_df = match()
match_df

match for n-gram = 1

match for n-gram = 2

match for n-gram = 3

match for n-gram = 4



Unnamed: 0,source,target,n,perplexity
0,en,en,1,38.577504
1,en,es,1,39.703161
2,en,fr,1,41.192687
3,en,in,1,41.236615
4,en,it,1,39.629555
...,...,...,...,...
251,tl,in,4,51.458478
252,tl,it,4,48.662568
253,tl,nl,4,73.317033
254,tl,pt,4,57.697890


## Part 5
Implement the *generate* function which takes a language code, *n*, the prompt (the starting text), the number of tokens to generate, and *r*, which is the random seed for any randomized action you plan to take in your implementation. The function should start generating tokens, one by one, using the language model of the given source language and *n*. The prompt should be used as a starting point for aligning on the probabilities to be used for generating the next token.

Note - The generation of the next token should be from the LM's distribution.

Note #2 - if you use an <END> token in your vocabulary, then once the <END> token is generated by the model, you should stop generating text.

In [12]:
def generate(lang: str, n: int, prompt: str, number_of_tokens: int, r: int) -> str:
  '''
  Generate text in the given language using the given parameters.
  :param lang: the language of the model
  :param n: the n_gram value
  :param prompt: the prompt to start the generation
  :param number_of_tokens: the number of tokens to generate
  :param r: the random seed to use
  '''
  import random
  random.seed(r)

  # give up to 10 chances to generate a token different from <e>
  e_iterations = 10

  # generate the model
  model = lm(lang, n, smoothed=False)

  # tokenize the prompt
  tokenized_prompt = []
  prompt_head = list(prompt)
  for letter in prompt_head:
    tokenized_prompt.append(letter)
  #print(f"tokenized_prompt init = {tokenized_prompt}\n")

  for t in range(number_of_tokens):
    if n == 1:
      # it's a uni-gram -> there is a single context
      context = list(model.keys())[0]

    elif len(tokenized_prompt) >= n-1:
      # there is enough tokens to proceed
      #print(f"tokenized_prompt n-1 = {tokenized_prompt[-(n-1):]}, type={type(tokenized_prompt[-(n-1):])}, len={len(tokenized_prompt[-(n-1):])}\n")
      context = ''.join(tokenized_prompt[-(n-1):])

    else:
      # we are short!
      # print a warning and pad it with starting spaces
      context = ''.join([' ']*(n-1-len(tokenized_prompt)) + tokenized_prompt)
      print(f"too short context = {context} -> padding with spaces\n")

    # check if the context key is included in the model, otherwise pick uniformly from the model
    if context not in model:
      # context is not included in our model -> pick randomly
      context = ''.join(random.choices(list(model.keys()), k=1))
      #print(f"pick key = {context}, type={type(context)}, len={len(context)}\n")

    # get the options from our model according to the context
    next_token_options = model[context]

    # pick a token and make sure it's not an <e> token (avoid getting <e> key since it's not included in the model)
    for e in range(e_iterations):
      next_token = ''.join(random.choices(list(next_token_options.keys()), weights=next_token_options.values(), k=1))
      if next_token != '<e>':
        break

    if next_token == '<e>':
        next_token = ' '

    # add the chosen token
    tokenized_prompt.append(next_token)

  # "glue" it all together
  generated_text = ''.join(tokenized_prompt)

  return generated_text

In [13]:
{
        'english_1_gram': generate('en', 1, "I", 20, 5),
        'english_2_gram': generate('en', 2, "I am", 20, 5),
        'english_3_gram': generate('en', 3, "I am", 20, 5),
        'english_4_gram': generate('en', 4, "I Love", 20, 5),
        'spanish_2_gram': generate('es', 2, "Soy", 20, 5),
        'spanish_3_gram': generate('es', 3, "Soy", 20, 5),
        'french_2_gram': generate('fr', 2, "Je suis", 20, 5),
        'french_3_gram': generate('fr', 3, "Je suis", 20, 5),
    }

{'english_1_gram': 'ItpgLpITeLhF eBstRlo2',
 'english_2_gram': 'I amoulpeginShmee bie ae',
 'english_3_gram': 'I amit: Lynmkm ways. htt',
 'english_4_gram': 'I Love gifts @OndMade a no',
 'spanish_2_gram': 'Soycalíodenyegucosie ew',
 'spanish_3_gram': 'Soy orbershagang https:',
 'french_2_gram': 'Je suis:/opapropades tprisl',
 'french_3_gram': 'Je suis tunes #ACTURSICALU '}

## Part 6
Play with your generate function, try to generate different texts in different language and various values of *n*. No need to submit anything of that.

# Testing

Copy the content of the **tests.py** file from the repo and paste below. This will create the results.json file and download it to your machine.

In [14]:
####################
# PLACE TESTS HERE #

####################
def test_preprocess():
    return {
        'vocab_length': len(preprocess()),
    }

def test_lm():
    return {
        'english_2_gram_length': len(lm('en', 2, True)),
        'english_3_gram_length': len(lm('en', 3, True)),
        'french_3_gram_length': len(lm('fr', 3, True)),
        'spanish_3_gram_length': len(lm('es', 3, True)),
    }

def test_eval():
    return {
        'en_en': eval(lm('en', 3, True), 'en', 3),
        'en_fr': eval(lm('en', 3, True), 'fr', 3),
        'en_tl': eval(lm('en', 3, True), 'tl', 3),
        'en_nl': eval(lm('en', 3, True), 'nl', 3),
    }

def test_match():
    df = match()
    return {
        'en_en_3': df[(df['source'] == 'en') & (df['target'] == 'en') & (df['n'] == 3)]['perplexity'].values[0],
        'en_tl_3': df[(df['source'] == 'en') & (df['target'] == 'tl') & (df['n'] == 3)]['perplexity'].values[0],
        'en_nl_3': df[(df['source'] == 'en') & (df['target'] == 'nl') & (df['n'] == 3)]['perplexity'].values[0],
    }

def test_generate():
    return {
        'english_1_gram': generate('en', 1, "I", 20, 5),
        'english_2_gram': generate('en', 2, "I am", 20, 5),
        'english_3_gram': generate('en', 3, "I am", 20, 5),
        'english_4_gram': generate('en', 4, "I Love", 20, 5),
        'spanish_2_gram': generate('es', 2, "Soy", 20, 5),
        'spanish_3_gram': generate('es', 3, "Soy", 20, 5),
        'french_2_gram': generate('fr', 2, "Je suis", 20, 5),
        'french_3_gram': generate('fr', 3, "Je suis", 20, 5),
    }

TESTS = [test_preprocess, test_lm, test_eval, test_match, test_generate]

# Run tests and save results
res = {}
for test in TESTS:
    try:
        cur_res = test()
        res.update({test.__name__: cur_res})
    except Exception as e:
        res.update({test.__name__: repr(e)})

with open('results.json', 'w') as f:
    json.dump(res, f, indent=2)

# Download the results.json file
files.download('results.json')

match for n-gram = 1

match for n-gram = 2

match for n-gram = 3

match for n-gram = 4



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# Show the local files, results.json should be there now and
# also downloaded to your local machine
!ls -l

total 12
drwxr-xr-x 2 root root 4096 Apr 20 20:15 data
-rw-r--r-- 1 root root  920 Apr 20 20:30 results.json
drwxr-xr-x 1 root root 4096 Apr 18 13:25 sample_data


## check results.json

In [17]:
def test_preprocess(results):
    if results["vocab_length"] != 1804:
        return f"Vocab length is {results['vocab_length']}, expected 1804"
    return 1

def test_lm(results):
    if results["english_2_gram_length"] != 748:
        return f"English 2-gram length is {results['english_2_gram_length']}, expected 748"
    if results["english_3_gram_length"] != 8239:
        return f"English 3-gram length is {results['english_3_gram_length']}, expected 8239"
    if results["french_3_gram_length"] != 8286:
        return f"French 3-gram length is {results['french_3_gram_length']}, expected 8286"
    if results["spanish_3_gram_length"] != 8469:
        return f"Spanish 3-gram length is {results['spanish_3_gram_length']}, expected 8469"
    return 1

def relative_difference(expected, actual):
    """Calculate the relative difference between expected and actual values."""
    return abs(expected - actual) / expected

def test_eval(results):
    perplexity_en_on_en = float(results["en_en"])
    perplexity_en_on_fr = float(results["en_fr"])
    perplexity_en_on_tl = float(results["en_tl"])
    perplexity_en_on_nl = float(results["en_nl"])

    perplexities = [
        perplexity_en_on_en,
        perplexity_en_on_fr,
        perplexity_en_on_tl,
        perplexity_en_on_nl
    ]

    if min(perplexities) != perplexity_en_on_en:
        return f"English model should perform best on English text. Results: {results}"

    if not (perplexity_en_on_en <= perplexity_en_on_fr <= max(perplexity_en_on_tl, perplexity_en_on_nl)):
        return f"Expected increasing perplexity from English to other languages. Results: {results}"

    return 1


def test_match(results):
    perplexity_en_on_en = int(results["en_en_3"])
    perplexity_en_on_tl = int(results["en_tl_3"])
    perplexity_en_on_nl = int(results["en_nl_3"])

    perplexities = [
        perplexity_en_on_en,
        perplexity_en_on_tl,
        perplexity_en_on_nl
    ]

    if min(perplexities) != perplexity_en_on_en:
        return f"English model should perform best on English text. Results: {results}"

    if not (perplexity_en_on_en <= max(perplexity_en_on_tl, perplexity_en_on_nl)):
        return f"Expected increasing perplexity from English to other languages. Results: {results}"

    return 1


def test_generate(results):
    if not results["english_2_gram"].startswith("I am"):
        return f"English 2-gram does not start with 'I am', but with {results['english_2_gram']}"
    if not results["french_3_gram"].startswith("Je suis"):
        return f"French 3-gram does not start with 'Je suis', but with {results['french_3_gram']}"
    return 1


# Read results.json
with open('results.json', 'r') as f:
    results = json.load(f)

# Initialize the result variable
result = None

# Switch between the tests
result1 = test_preprocess(results["test_preprocess"])
print(result1)
result2 = test_lm(results["test_lm"])
print(result2)
result3 = test_eval(results["test_eval"])
print(result3)
result4 = test_match(results["test_match"])
print(result4)
result5 = test_generate(results["test_generate"])
print(result5)




1
1
1
1
1
