In [1]:
# %%capture
# !pip install "git+https://github.com/google-deepmind/ai-foundations.git@main"

In [2]:
# Packages used.
import random # For sampling from probability distributions.
from collections import Counter, defaultdict # For counting n-grams.

import textwrap # For automatically adding linebreaks to long texts.
import pandas as pd # For construction and visualizing tables.

# Custom functions for providing feedback on your solutions.
from ai_foundations.feedback.course_1 import ngrams

# Dataset loading and Tokenization

## Dataset loading

In [3]:
# Remote JSON file holding the Africa Galore dataset.
AFRICA_GALORE_URL = (
    "https://storage.googleapis.com/dm-educational/assets/ai_foundations/africa_galore.json"
)

# Download the file and parse it into a DataFrame.
africa_galore = pd.read_json(AFRICA_GALORE_URL)

# The "description" column holds the paragraphs used as the dataset.
dataset = africa_galore["description"]
print(f"The dataset consists of {dataset.shape[0]} paragraphs.")

The dataset consists of 232 paragraphs.


In [4]:
# Preview the first ten paragraphs of the dataset.
for paragraph in dataset[:10]:
    # textwrap.fill re-flows the paragraph, inserting line breaks so that no
    # line is longer than textwrap's default width.
    print(textwrap.fill(paragraph))
    print("-----")

The Lagos air was thick with humidity, but the energy in the club was
electric. The band launched into a hypnotic Afrobeat groove, the drums
pounding out a complex polyrhythm, the horns blaring a soaring melody,
and the bass laying down a deep, funky foundation. A woman named Imani
moved effortlessly to the music, her body swaying in time with the
rhythm. The music seemed to flow through her, a powerful current of
energy and joy. All around her, people were dancing, singing, and
clapping, caught up in the infectious rhythm. The music was more than
just entertainment; it was a celebration of life, a connection to
their shared heritage, a vibrant expression of the soul of Lagos.
-----
The warm evening air in Accra was filled with the lilting melodies of
Highlife music. At a small bar tucked away on a side street, a band
played, the guitars weaving intricate patterns, the horns adding a
bright, joyful counterpoint. Kwame, a man with a wistful smile, sat at
a table nursing a beer, lost in 

## Tokenization

In [5]:
def space_tokenize(text: str) -> list[str]:
    """Splits a string into tokens on the space character.

    Only the literal space character (" ") acts as a separator; other
    whitespace such as tabs or newlines stays inside the tokens. Empty strings
    produced by leading, trailing, or repeated spaces are discarded, so the
    result never contains empty tokens.

    Args:
        text: The input text.

    Returns:
        A list of tokens. Returns empty list if text is empty or all spaces.
    """
    # str.split(" ") yields '' for every extra space (and [''] for an empty
    # string), so the empty pieces are filtered out to honor the contract above.
    return [token for token in text.split(" ") if token]

In [6]:
space_tokenize("Kanga, a colorful printed cloth is more than just a fabric.")

['Kanga,',
 'a',
 'colorful',
 'printed',
 'cloth',
 'is',
 'more',
 'than',
 'just',
 'a',
 'fabric.']

In [7]:
space_tokenize(dataset[0])

['The',
 'Lagos',
 'air',
 'was',
 'thick',
 'with',
 'humidity,',
 'but',
 'the',
 'energy',
 'in',
 'the',
 'club',
 'was',
 'electric.',
 'The',
 'band',
 'launched',
 'into',
 'a',
 'hypnotic',
 'Afrobeat',
 'groove,',
 'the',
 'drums',
 'pounding',
 'out',
 'a',
 'complex',
 'polyrhythm,',
 'the',
 'horns',
 'blaring',
 'a',
 'soaring',
 'melody,',
 'and',
 'the',
 'bass',
 'laying',
 'down',
 'a',
 'deep,',
 'funky',
 'foundation.',
 'A',
 'woman',
 'named',
 'Imani',
 'moved',
 'effortlessly',
 'to',
 'the',
 'music,',
 'her',
 'body',
 'swaying',
 'in',
 'time',
 'with',
 'the',
 'rhythm.',
 'The',
 'music',
 'seemed',
 'to',
 'flow',
 'through',
 'her,',
 'a',
 'powerful',
 'current',
 'of',
 'energy',
 'and',
 'joy.',
 'All',
 'around',
 'her,',
 'people',
 'were',
 'dancing,',
 'singing,',
 'and',
 'clapping,',
 'caught',
 'up',
 'in',
 'the',
 'infectious',
 'rhythm.',
 'The',
 'music',
 'was',
 'more',
 'than',
 'just',
 'entertainment;',
 'it',
 'was',
 'a',
 'celebration

## `Coding Activity 1` : From list of tokens to n-grams

In [8]:
# Scratch check of how list.extend() behaves when it is given a string.
test=[]
print(test)
# extend() iterates over its argument, so a string is added one character at
# a time; the one-character string "5" therefore adds a single element.
test.extend("5")
print(test)

[]
['5']


In [9]:
# Accumulators for the n-grams of all paragraphs; they are filled by the loop
# over `dataset` further down in this cell.
all_unigrams = []
all_bigrams = []
all_trigrams = []


def generate_ngrams(text: str, n: int) -> list[tuple[str, ...]]:
    """Generates n-grams from a given text.

    The text is tokenized with `space_tokenize`, and a window of `n` tokens is
    slid over the token list one position at a time.

    Args:
        text: The input text string.
        n: The size of the n-grams (e.g., 2 for bigrams, 3 for trigrams).
            Must be at least 1.

    Returns:
        A list of n-grams in order of appearance, each represented as a tuple
        of `n` tokens. The list is empty if the text has fewer than `n` tokens.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    # Without this guard n=0 would produce a list of empty tuples and negative
    # n would produce meaningless slices instead of an error.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}.")

    # Tokenize text.
    tokens = space_tokenize(text)

    # A full window of n tokens can start at len(tokens) - n + 1 positions;
    # range() is empty when that number is zero or negative.
    return [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]


# N-grams are generated per paragraph, so no n-gram spans two paragraphs.
for paragraph in dataset:
    # Calling `generate_ngrams` with n=1 constructs a list of unigrams.
    all_unigrams.extend(generate_ngrams(paragraph, n=1))
    # Calling `generate_ngrams` with n=2 constructs a list of bigrams (2-grams).
    all_bigrams.extend(generate_ngrams(paragraph, n=2))
    # Calling `generate_ngrams` with n=3 constructs a list of trigrams (3-grams).
    all_trigrams.extend(generate_ngrams(paragraph, n=3))

# Show the first ten n-grams of each size as a quick sanity check.
print("First 10 Unigrams:", all_unigrams[:10])
print()
print("First 10 Bigrams:", all_bigrams[:10])
print()
print("First 10 Trigrams:", all_trigrams[:10])
print()


First 10 Unigrams: [('The',), ('Lagos',), ('air',), ('was',), ('thick',), ('with',), ('humidity,',), ('but',), ('the',), ('energy',)]

First 10 Bigrams: [('The', 'Lagos'), ('Lagos', 'air'), ('air', 'was'), ('was', 'thick'), ('thick', 'with'), ('with', 'humidity,'), ('humidity,', 'but'), ('but', 'the'), ('the', 'energy'), ('energy', 'in')]

First 10 Trigrams: [('The', 'Lagos', 'air'), ('Lagos', 'air', 'was'), ('air', 'was', 'thick'), ('was', 'thick', 'with'), ('thick', 'with', 'humidity,'), ('with', 'humidity,', 'but'), ('humidity,', 'but', 'the'), ('but', 'the', 'energy'), ('the', 'energy', 'in'), ('energy', 'in', 'the')]



In [9]:
# Counter object example.

# A Counter returns 0 for keys it has not seen yet, so each word's tally can be
# incremented without initializing it first.
counter = Counter()
for word in ['red','blue','red','green','blue','blue']:
    counter[word] +=1

# Display the final tallies.
counter

Counter({'red': 2, 'blue': 3, 'green': 1})

In [10]:
# Use the Python Counter data type to count how often each bigram occurs.
# Here the keys are the bigram tuples themselves. NOTE: a later cell rebinds
# `bigram_counts` to a dictionary that maps a context string to a Counter.
bigram_counts = Counter(all_bigrams)

# Print the ten most common bigrams.
print("Most common bigram : ")
for bigram, count in bigram_counts.most_common(10):
    print(f"  ({bigram}, {count})")

# Use the Python Counter data type to count how often each trigram occurs.
# NOTE: `trigram_counts` is likewise rebound by a later cell.
trigram_counts = Counter(all_trigrams)

# Print the ten most common trigrams.
print("\n\nMost common trigrams : ")
for trigram, count in trigram_counts.most_common(10):
    print(f"  ({trigram}, {count})")


Most common bigram : 
  (('is', 'a'), 144)
  (('of', 'the'), 100)
  (('and', 'the'), 69)
  (('in', 'the'), 61)
  (('with', 'a'), 60)
  (('in', 'a'), 55)
  (('and', 'a'), 50)
  (('to', 'the'), 42)
  (('was', 'a'), 39)
  (('It', 'is'), 33)


Most common trigrams : 
  (('went', 'looking', 'for'), 32)
  (('a', 'symbol', 'of'), 18)
  (('was', 'hungry', 'so'), 18)
  (('The', 'result', 'is'), 17)
  (('looking', 'for', 'a'), 17)
  (('she', 'went', 'looking'), 16)
  (('he', 'went', 'looking'), 16)
  (('result', 'is', 'a'), 15)
  (('so', 'he', 'went'), 14)
  (('so', 'she', 'went'), 14)


## `Coding Activity 2` : Counting n-grams

~~~
{
  "Table Mountain": Counter({"is": 2}),
  "Mountain is": Counter({"tall": 1, "beautiful": 1})   
}
~~~

In [11]:
# defaultdict example: group the values of (key, value) pairs by key.
s = [('yellow', 1), ('blue', 2), ('yellow', 3), ('blue', 4), ('red', 1)]
# defaultdict(list) creates an empty list the first time a key is accessed,
# so append() works without checking whether the key already exists.
d = defaultdict(list)
for k,v in s:
    d[k].append(v)

# Display the grouped items sorted by key.
sorted(d.items())

[('blue', [2, 4]), ('red', [1]), ('yellow', [1, 3])]

In [12]:
def get_ngram_counts(dataset: list[str], n: int) -> dict[str, Counter]:
    """Tallies which tokens follow each (n-1)-token context in a dataset.

    Every text in `dataset` is broken into n-grams with `generate_ngrams`.
    Each n-gram is split into its leading n-1 tokens, which are joined by
    single spaces to form the `context` string, and its final token, the
    `next_token`.

    Args:
        dataset: The list of text strings (paragraphs or sentences).
        n: The size of the n-grams to generate (e.g., 2 for bigrams, 3 for
            trigrams).

    Returns:
        A plain dictionary mapping each context string to a Counter `counts`,
        where `counts[next_token]` is the number of times `next_token`
        followed that context. Contexts that never occur are absent.
    """
    # defaultdict(Counter) creates an empty Counter the first time a context
    # is looked up, and a Counter treats unseen tokens as 0, so counts can be
    # incremented without any explicit initialization. See
    # https://docs.python.org/3/library/collections.html#collections.Counter and
    # https://docs.python.org/3/library/collections.html#collections.defaultdict
    counts_by_context = defaultdict(Counter)

    for text in dataset:
        for gram in generate_ngrams(text, n):
            # E.g. ('the', 'quick', 'brown') is split into the context
            # "the quick" and the next token 'brown'.
            leading_tokens, next_token = gram[:-1], gram[-1]
            counts_by_context[" ".join(leading_tokens)][next_token] += 1

    # Hand back a regular dict rather than the defaultdict.
    return dict(counts_by_context)


# Example usage of the function with bigrams (n=2), where every context is a
# single token.
example_data = [
    "This is an example sentence.",
    "Another example sentence.",
    "Split a sentence.",
]
ngram_counts = get_ngram_counts(example_data, 2)

# Print the bigram counts dictionary for the dataset consisting of the
# three example sentences.
print("Bigram counts dictionary:\n")
print("{")
for context, counter in ngram_counts.items():
    print(f"  '{context}': {counter},")
print("}")

Bigram counts dictionary:

{
  'This': Counter({'is': 1}),
  'is': Counter({'an': 1}),
  'an': Counter({'example': 1}),
  'example': Counter({'sentence.': 2}),
  'Another': Counter({'example': 1}),
  'Split': Counter({'a': 1}),
  'a': Counter({'sentence.': 1}),
}


In [13]:
# Example usage of the function with trigrams (n=3), where every context is a
# string of two tokens.
example_data = [
    "Table Mountain is tall.",
    "Table Mountain is beautiful.",
]
ngram_counts = get_ngram_counts(example_data, 3)

# Print the trigram counts dictionary for the dataset consisting of the
# two example sentences.
print("Trigram counts dictionary:\n")
print("{")
for context, counter in ngram_counts.items():
    print(f"  '{context}': {counter},")
print("}")

Bigram counts dictionary:

{
  'Table Mountain': Counter({'is': 2}),
  'Mountain is': Counter({'tall.': 1, 'beautiful.': 1}),
}


In [14]:
# @title Run this cell to test your implementation.
ngrams.test_ngram_counts(get_ngram_counts, generate_ngrams)

✅ Nice! Your implementation looks correct.


In [15]:
# Count, for every single-token context, how often each token follows it.
# NOTE: this rebinds `bigram_counts`, which an earlier cell bound to a flat
# Counter keyed by bigram tuples; from here on it maps context -> Counter.
bigram_counts = get_ngram_counts(dataset, n=2)

# Use the pandas library to display the counts in a table.
# Each context becomes one row (orient="index") and each next token one column.
bigram_counts_matrix = {
    context: dict(counts) for context, counts in bigram_counts.items()
}
# Context/next-token pairs that never occur come out as NaN; fillna(0) turns
# them into explicit zero counts.
bigram_data_frame = pd.DataFrame.from_dict(
    bigram_counts_matrix, orient="index").fillna(0)

display(bigram_data_frame)

# Count the zero cells of the table and report them as a share of all cells.
zero_count = (bigram_data_frame == 0).sum().sum()
print(
    f"Number of bigrams with a count of 0: {zero_count:,}"
    f" ({zero_count/bigram_data_frame.size * 100:.2f}%)"
)

Unnamed: 0,Lagos,band,music,warm,Highlife,bustling,Dakar,Mbalax,Kinshasa,Soukous,...,"kudu,","mph),",Ostriches,Antarctic,plumage,surface.,(Spheniscus,demersus).,breed,Bay
The,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
of,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the,0.0,1.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a,0.0,1.0,0.0,6.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
with,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
water's,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Penguin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
(Spheniscus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
penguins,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Number of bigrams with a count of 0: 26,606,550 (99.95%)


In [16]:
# Count, for every two-token context, how often each token follows it.
# NOTE: this rebinds `trigram_counts`, which an earlier cell bound to a flat
# Counter keyed by trigram tuples; from here on it maps context -> Counter.
trigram_counts = get_ngram_counts(dataset, n=3)

# Use the pandas library to display the counts in a table.
# Each context becomes one row (orient="index") and each next token one column.
trigram_counts_matrix = {
    context: dict(counts) for context, counts in trigram_counts.items()
}
# Context/next-token pairs that never occur come out as NaN; fillna(0) turns
# them into explicit zero counts.
trigram_data_frame = pd.DataFrame.from_dict(
    trigram_counts_matrix, orient="index").fillna(0)

display(trigram_data_frame)

# Count the zero cells of the table and report them as a share of all cells.
zero_count = (trigram_data_frame == 0).sum().sum()
print(
    f"Number of trigrams with a count of 0: {zero_count:,}"
    f" ({zero_count/trigram_data_frame.size * 100:.2f}%)"
)

Unnamed: 0,air,was,thick,thin,always,"quiet,",filled,alive,with,"humidity,",...,plumage,water's,surface.,penguin,(Spheniscus,demersus).,penguins,breed,Algoa,Bay
The Lagos,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
in the,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
and the,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
warm evening,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vegetables. The,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Penguin (Spheniscus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
demersus). These,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
These penguins,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
down to,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Number of trigrams with a count of 0: 68,942,324 (99.98%)


In [17]:
context = "a staple"
trigram_counts[context]

Counter({'food': 1,
         'in': 6,
         'dish': 2,
         'throughout': 1,
         'of': 1,
         'at': 1,
         'beverage': 1})

In [18]:

context = "a staple"
# Compute the bigram count for "a staple" with sum().
# Adding up the counts of every token that follows "a staple" gives the number
# of times the bigram occurs with a following token. This equals the full
# bigram count as long as "a staple" never ends a paragraph, because n-grams
# are generated per paragraph.
bigram_count_a_staple = sum(trigram_counts[context].values())

print(
    'Bigram count of "a staple" computed indirectly from trigram counts: ',
    bigram_count_a_staple,
)

# Extract the bigram count for "a staple" from bigram_counts.
# In the bigram counts the context is "a" and the next token is "staple".
print('Bigram count of "a staple" computed directly: ',
      bigram_counts["a"]["staple"])
     

Bigram count of "a staple" computed indirectly from trigram counts:  13
Bigram count of "a staple" computed directly:  13


## `Coding Activity 3` : Computing the n-gram probabilities


~~~
{
  "Table Mountain": {"is": 1.0},
  "Mountain is": {"tall": 0.5, "beautiful": 0.5}   
}
~~~

In [19]:
def build_ngram_model(
    dataset: list[str],
    n: int
) -> dict[str, dict[str, float]]:
    """Builds an n-gram language model from raw text.

    The n-gram counts of the dataset are obtained from `get_ngram_counts` and
    normalized per context, so that the values stored for one context are the
    conditional probabilities P(next token | context).

    Args:
        dataset: A list of text strings representing the dataset.
        n: The size of the n-grams (e.g., 2 for a bigram model).

    Returns:
        A dictionary representing the n-gram language model. Keys are
        (n-1)-token contexts; each value is a dictionary mapping every next
        token observed after that context to its conditional probability.
    """
    model = {}

    for context, follower_counts in get_ngram_counts(dataset, n).items():
        # Count(A): how often `context` was followed by any token at all.
        total = sum(follower_counts.values())

        # P(B | A) = Count(A B) / Count(A) for every observed next token B.
        model[context] = {
            token: count / total for token, count in follower_counts.items()
        }

    return model

# Test the method above by building a simple trigram model.
# Both sentences share "Table Mountain is", so "is" always follows
# "Table Mountain" while "Mountain is" has two possible next tokens.
test_dataset = ["Table Mountain is tall.", "Table Mountain is beautiful."]
test_trigram_model = build_ngram_model(test_dataset, n=3)
test_trigram_model

{'Table Mountain': {'is': 1.0},
 'Mountain is': {'tall.': 0.5, 'beautiful.': 0.5}}

In [36]:
# @title Run this cell to test your implementation.
ngrams.test_build_ngram_model(build_ngram_model, get_ngram_counts)

✅ Nice! Your implementation looks correct.


In [37]:
trigram_model = build_ngram_model(dataset, n=3)

In [39]:
print(f"P(B | \"as it\") = {trigram_model['as it']}")
print(f"P(B | \"as they\") = {trigram_model['as they']}")

P(B | "as it") = {'is': 0.6666666666666666, 'receives': 0.3333333333333333}
P(B | "as they") = {'were': 1.0}


In [40]:
context = "The name"
trigram_model[context]

{'means': 0.6666666666666666, "'Etosha'": 0.3333333333333333}

In [41]:
# Looking up a context that the model has no entry for raises a KeyError.
# The error is the point of this cell, so it is caught and printed rather than
# left uncaught, which keeps "Restart & Run All" from stopping here.
context = "Their name"
try:
    next_token_probabilities = trigram_model[context]
except KeyError as error:
    print(f"KeyError: {error}")
else:
    print(next_token_probabilities)

KeyError: 'Their name'