In [None]:
# %%capture
# !pip install "git+https://github.com/google-deepmind/ai-foundations.git@main"

In [2]:
# Packages used.
import random # For sampling from probability distributions.
from collections import Counter, defaultdict # For counting n-grams.

import textwrap # For automatically adding linebreaks to long texts.
import pandas as pd # For construction and visualizing tables.

# Custom functions for providing feedback on your solutions.
from ai_foundations.feedback.course_1 import ngrams

# Dataset loading and Tokenization

## Dataset loading

In [3]:
# Location of the Africa Galore dataset (a JSON file).
AFRICA_GALORE_URL = (
    "https://storage.googleapis.com/dm-educational/assets/ai_foundations/africa_galore.json"
)

# Read the JSON file into a DataFrame and keep only the "description" column,
# which holds the paragraphs used throughout this notebook.
africa_galore = pd.read_json(AFRICA_GALORE_URL)
dataset = africa_galore["description"]

print(f"The dataset consists of {dataset.shape[0]} paragraphs.")

The dataset consists of 232 paragraphs.


In [4]:
# Preview the first 10 paragraphs. `textwrap.fill` inserts line breaks so
# that long paragraphs stay readable; a dashed line separates the paragraphs.
for paragraph in dataset[:10]:
    print(textwrap.fill(paragraph), "-----", sep="\n")

The Lagos air was thick with humidity, but the energy in the club was
electric. The band launched into a hypnotic Afrobeat groove, the drums
pounding out a complex polyrhythm, the horns blaring a soaring melody,
and the bass laying down a deep, funky foundation. A woman named Imani
moved effortlessly to the music, her body swaying in time with the
rhythm. The music seemed to flow through her, a powerful current of
energy and joy. All around her, people were dancing, singing, and
clapping, caught up in the infectious rhythm. The music was more than
just entertainment; it was a celebration of life, a connection to
their shared heritage, a vibrant expression of the soul of Lagos.
-----
The warm evening air in Accra was filled with the lilting melodies of
Highlife music. At a small bar tucked away on a side street, a band
played, the guitars weaving intricate patterns, the horns adding a
bright, joyful counterpoint. Kwame, a man with a wistful smile, sat at
a table nursing a beer, lost in 

## Tokenization

In [5]:
def space_tokenize(text: str) -> list[str]:
    """Split a string into tokens on space characters.

    Only the space character (" ") acts as a separator. Other whitespace
    (tabs, newlines) and punctuation stay attached to their token. Runs of
    consecutive spaces, as well as leading and trailing spaces, do not
    produce empty tokens.

    Args:
        text: The input text.

    Returns:
        A list of tokens. Returns empty list if text is empty or all spaces.
    """
    # `str.split(" ")` yields an empty string for every extra space (and a
    # single empty string for empty input), so those are filtered out to
    # honour the documented contract.
    return [token for token in text.split(" ") if token]

In [6]:
# Tokenize a sample sentence. Only spaces separate tokens, so punctuation
# stays attached to the neighbouring word (see "Kanga," and "fabric." below).
space_tokenize("Kanga, a colorful printed cloth is more than just a fabric.")

['Kanga,',
 'a',
 'colorful',
 'printed',
 'cloth',
 'is',
 'more',
 'than',
 'just',
 'a',
 'fabric.']

In [7]:
# Tokenize the dataset paragraph stored under index 0.
space_tokenize(dataset[0])

['The',
 'Lagos',
 'air',
 'was',
 'thick',
 'with',
 'humidity,',
 'but',
 'the',
 'energy',
 'in',
 'the',
 'club',
 'was',
 'electric.',
 'The',
 'band',
 'launched',
 'into',
 'a',
 'hypnotic',
 'Afrobeat',
 'groove,',
 'the',
 'drums',
 'pounding',
 'out',
 'a',
 'complex',
 'polyrhythm,',
 'the',
 'horns',
 'blaring',
 'a',
 'soaring',
 'melody,',
 'and',
 'the',
 'bass',
 'laying',
 'down',
 'a',
 'deep,',
 'funky',
 'foundation.',
 'A',
 'woman',
 'named',
 'Imani',
 'moved',
 'effortlessly',
 'to',
 'the',
 'music,',
 'her',
 'body',
 'swaying',
 'in',
 'time',
 'with',
 'the',
 'rhythm.',
 'The',
 'music',
 'seemed',
 'to',
 'flow',
 'through',
 'her,',
 'a',
 'powerful',
 'current',
 'of',
 'energy',
 'and',
 'joy.',
 'All',
 'around',
 'her,',
 'people',
 'were',
 'dancing,',
 'singing,',
 'and',
 'clapping,',
 'caught',
 'up',
 'in',
 'the',
 'infectious',
 'rhythm.',
 'The',
 'music',
 'was',
 'more',
 'than',
 'just',
 'entertainment;',
 'it',
 'was',
 'a',
 'celebration

## `Coding Activity 1`

In [26]:
# Corpus-wide n-gram accumulators. They are reset to empty lists here so that
# re-running this cell starts from a clean state before they are filled below.
all_unigrams, all_bigrams, all_trigrams = [], [], []


def generate_ngrams(text: str, n: int) -> list[tuple[str, ...]]:
    """Generates n-grams from a given text.

    The text is tokenized with `space_tokenize`, then a window of `n`
    consecutive tokens is slid over the tokens one position at a time.

    Args:
        text: The input text string.
        n: The size of the n-grams (e.g., 2 for bigrams, 3 for trigrams).
            Must be at least 1.

    Returns:
        A list of n-grams in order of appearance, each represented as a
        tuple of `n` tokens. The list is empty if the text has fewer than
        `n` tokens.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    # A window size below 1 has no meaningful n-grams; without this check
    # n=0 would yield empty tuples and negative n would yield odd slices.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}.")

    # Tokenize text.
    tokens = space_tokenize(text)

    # There are len(tokens) - n + 1 window positions; `range` is empty when
    # that number is not positive, which gives an empty result.
    return [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]


# Pair every n-gram size with the corpus-wide list that collects it:
# n=1 gives unigrams, n=2 gives bigrams (2-grams), n=3 gives trigrams (3-grams).
ngram_targets = ((1, all_unigrams), (2, all_bigrams), (3, all_trigrams))

for paragraph in dataset:
    for ngram_size, target in ngram_targets:
        # Append this paragraph's n-grams of the given size to its list.
        target.extend(generate_ngrams(paragraph, n=ngram_size))

# Show the first 10 entries of each list.
for label, collected in (
    ("First 10 Unigrams:", all_unigrams),
    ("First 10 Bigrams:", all_bigrams),
    ("First 10 Trigrams:", all_trigrams),
):
    print(label, collected[:10])


First 10 Unigrams: [('The',), ('Lagos',), ('air',), ('was',), ('thick',), ('with',), ('humidity,',), ('but',), ('the',), ('energy',)]
First 10 Bigrams: [('The', 'Lagos'), ('Lagos', 'air'), ('air', 'was'), ('was', 'thick'), ('thick', 'with'), ('with', 'humidity,'), ('humidity,', 'but'), ('but', 'the'), ('the', 'energy'), ('energy', 'in')]
First 10 Trigrams: [('The', 'Lagos', 'air'), ('Lagos', 'air', 'was'), ('air', 'was', 'thick'), ('was', 'thick', 'with'), ('thick', 'with', 'humidity,'), ('with', 'humidity,', 'but'), ('humidity,', 'but', 'the'), ('but', 'the', 'energy'), ('the', 'energy', 'in'), ('energy', 'in', 'the')]


In [30]:
# Counter object example: tally how often each word occurs.
sample_words = ["red", "blue", "red", "green", "blue", "blue"]

counter = Counter()
for word in sample_words:
    # A Counter reports 0 for keys it has not seen yet, so a new word can be
    # incremented without initialising its entry first.
    counter[word] += 1

counter

Counter({'red': 2, 'blue': 3, 'green': 1})