In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\iagoc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iagoc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iagoc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [39]:
from nltk.corpus import stopwords

In [3]:
import re

In [33]:
import spacy


In [70]:
# For the next text, perform the following actions
text = "The president of the U.S.A., Donald Trump, is 1.9m high and 78 years old. Forbes Magazine has assessed his wealth, currently estimating it at $5.5 billion as of mid-February 2025."

# (1 point) 1 - Use NLTK to split the sentences 
print("Splitted sentences: ", nltk.word_tokenize(text))

# (2 points) 2 - Convert with regex the acronym U.S.A. to USA, the number 1.9m to 190 centimeters or any other number of a height like that (e.g. 1.75m to 175 centimeters), and "$5.5 billion" to five point five billion.
import re

# Replace a dot immediately preceding a letter/digit with just that letter/digit.
text2 = re.sub(r'\.(?=[A-Za-z])', '', text)
text3 = re.sub(r'\.(?=[,\s])', '', text2)
#text4 = re.sub(r'\.+[1-9]+m', r'\1 0 centimeters', text2)
text4 = re.sub(r'(\d+)\.(\d{2})m\b', r'\g<1>\g<2> centimeters', text3)
text5 = re.sub(r'(\d+)\.(\d)m\b', r'\g<1>\g<2>0 centimeters', text4)
digit_words = {'0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
               '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'}
# Step 1: Remove the $ and extra space, so "$5.5 billion" becomes "5.5 billion"
text6 = re.sub(r'\$(\d+(?:\.\d+)?)\s*(billion|million|thousand)', r'\1 \2', text5, flags=re.IGNORECASE)

# Step 2: Convert digits and the period in the number to words.
# Replace each digit with its word form.
for digit, word in digit_words.items():
    text6 = re.sub(digit, word, text6)
# Replace the decimal point with " point "
text6 = re.sub(r'\.', ' point ', text6)

print("\nThe converted text is: ", text6)


# (1 point) 3 - Convert to lowercase except the proper nouns that must keep the original case. For the multiword proper names convert them to an unique word joining the two word with underscoere (Juan Fernández -> Juan_Fernández).
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)
normalized_tokens = [token.lower() for token in tokens]
clean_text = ' '.join(normalized_tokens)
print("\nLower case is: ", clean_text)
# (1 point) 4 - Tokenize the text (use the tool you prefer). 
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)
print("\nTokens are: ", tokens)
# (1 point) 5 - Remove the stopwords (use the tool you prefer). 
stop_words = set(stopwords.words('english'))
 
word_tokens = word_tokenize(text)
# converts the words in word_tokens to lower case and then checks whether 
#they are present in stop_words or not
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
#with no lower case conversion
filtered_sentence = []
 
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
clean_text = ' '.join(filtered_sentence)
print("\nText without stopwords: ", clean_text)
# (1 point) 6 - Create bigrams with pure python.
# Tokenize the text (text6) using TreebankWordTokenizer
tokens = tokenizer.tokenize(text6)

# Create a list to store the bigrams
bigram_list = []

# Build bigrams from the token list
for i in range(len(tokens) - 1):
    bigram = (tokens[i], tokens[i + 1])
    bigram_list.append(bigram)

# Now bigram_list contains all the bigrams
print("\nBigrams:", bigram_list)

# (2 point) 7 - Create a language model that predict the next word using bigrams. Please explain in the code how you made the calculations.
# Go through the list of tokens and count how many times each word is followed by another.

# Import needed tools
from nltk.tokenize import TreebankWordTokenizer
from collections import defaultdict, Counter

# Our sample text (this could be any text you want to model)
text6 = ("The president president president is is after president after president")

# Step 1: Tokenize the text into individual words
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(text6)
print("Tokens:", tokens)

# Step 2: Build a model of word pairs (bigrams).
# We use a dictionary where each key is a word, and the value is a counter (a tally)
# of words that come immediately after it.
bigram_model = defaultdict(Counter)

# Go through the list of tokens and count how many times each word is followed by another.
for i in range(len(tokens) - 1):
    current_word = tokens[i]
    next_word = tokens[i + 1]
    bigram_model[current_word][next_word] += 1

# For clarity, print the bigram model
print("\nBigram model (each word and the words that follow it with counts):")
for word, next_words in bigram_model.items():
    print(f"{word} -> {dict(next_words)}")

# Step 3: Create a simple function to predict the next word.
# Given a word, this function looks up the most common word that comes after it.
def predict_next_word(word):
    # Check if the word exists in our model
    if word in bigram_model:
        # Get the next words and their counts, and choose the one with the highest count
        most_common = bigram_model[word].most_common(1)
        if most_common:
            return most_common[0][0]
    return None

# Let's test our model by predicting the next word after some sample words.
sample_words = ["USA,", "Donald", "is", "Forbes"]
for word in sample_words:
    prediction = predict_next_word(word)
    print(f"\nPredicted word after '{word}': {prediction}")

# Explanation:
# 1. We first split the text into tokens (words).
# 2. We then create pairs of words (bigrams) and count how often each pair occurs.
#    For example, if "Donald" is often followed by "Trump", the model will record that.
# 3. Finally, the function 'predict_next_word' looks at the counts for a given word
#    and picks the most frequent next word.


Splitted sentences:  ['The', 'president', 'of', 'the', 'U.S.A.', ',', 'Donald', 'Trump', ',', 'is', '1.9m', 'high', 'and', '78', 'years', 'old', '.', 'Forbes', 'Magazine', 'has', 'assessed', 'his', 'wealth', ',', 'currently', 'estimating', 'it', 'at', '$', '5.5', 'billion', 'as', 'of', 'mid-February', '2025', '.']

The converted text is:  The president of the USA, Donald Trump, is oneninezero centimeters high and seveneight years old Forbes Magazine has assessed his wealth, currently estimating it at five point five billion as of mid-February twozerotwofive point 

Lower case is:  the president of the u.s.a. , donald trump , is 1.9m high and 78 years old. forbes magazine has assessed his wealth , currently estimating it at $ 5.5 billion as of mid-february 2025 .

Tokens are:  ['The', 'president', 'of', 'the', 'U.S.A.', ',', 'Donald', 'Trump', ',', 'is', '1.9m', 'high', 'and', '78', 'years', 'old.', 'Forbes', 'Magazine', 'has', 'assessed', 'his', 'wealth', ',', 'currently', 'estimating'