In [63]:
# Input text for the exercise; the numbered tasks listed below are applied to it.
text = "The president of the U.S.A., Donald Trump, is 1.9m high and 78 years old. Forbes Magazine has assessed his wealth, currently estimating it at $5.5 billion as of mid-February 2025."

# (1 point) 1 - Use NLTK to split the sentences 

# (2 points) 2 - Convert with regex the acronym U.S.A. to USA, the number 1.9m to 190 centimeters or any other number of a height like that (e.g. 1.75m to 175 centimeters), and "$5.5 billion" to five point five billion. 

# (1 point) 3 - Convert to lowercase except the proper nouns, which must keep their original case. For multiword proper names, join the words into a single token with an underscore (Juan Fernández -> Juan_Fernández).

# (1 point) 4 - Tokenize the text (use the tool you prefer). 

# (1 point) 5 - Remove the stopwords (use the tool you prefer). 

# (1 point) 6 - Create bigrams with pure python.

# (2 points) 7 - Create a language model that predicts the next word using bigrams. Please explain in the code how you made the calculations.




### Imports

In [64]:
import nltk
import re
from num2words import num2words
from collections import defaultdict

### (1 point) 1 - Use NLTK to split the sentences 

In [65]:
# Split the raw text into sentences with NLTK's sentence tokenizer.
# NOTE(review): no nltk.download(...) call for the tokenizer data is visible in
# this notebook — presumably it is already installed; confirm on a fresh kernel.
sentences = nltk.sent_tokenize(text)

print(sentences)

['The president of the U.S.A., Donald Trump, is 1.9m high and 78 years old.', 'Forbes Magazine has assessed his wealth, currently estimating it at $5.5 billion as of mid-February 2025.']


### (2 points) 2 - Convert with regex the acronym U.S.A. to USA, the number 1.9m to 190 centimeters or any other number of a height like that (e.g. 1.75m to 175 centimeters), and "$5.5 billion" to five point five billion. 


In [66]:
# Collapse the dotted acronym "U.S.A." into "USA".
text = re.sub(r'U\.S\.A\.', 'USA', text)

# Convert a height written in metres (e.g. "1.9m", "1.75m") to whole centimeters.
# round() is used instead of int(): binary floats make 1.15 * 100 evaluate to
# 114.99999999999999, which int() would truncate to 114 instead of 115.
# The \b anchors keep the pattern from firing inside longer tokens such as
# "1.5million".
text = re.sub(r'\b(\d+\.\d+)m\b', lambda x: f"{round(float(x.group(1)) * 100)} centimeters", text)

def decimal_to_words(match):
    """Spell out a matched decimal "billion" amount in words.

    Args:
        match: A regex match object whose group 1 is a decimal number
            written with a dot, such as "5.5".

    Returns:
        The amount in words followed by "billion", e.g.
        "five point five billion".
    """
    number = match.group(1)  # Extract the decimal number (e.g., "5.5")
    integer_part, decimal_part = number.split(".")  # Split into whole and decimal parts
    # Read the decimals digit by digit. Converting the whole decimal part with
    # int() would drop leading zeros ("5.05" -> "five point five") and read
    # "25" as "twenty-five" instead of "two five".
    decimal_words = " ".join(num2words(int(digit)) for digit in decimal_part)
    return f"{num2words(int(integer_part))} point {decimal_words} billion"

# Replace every "$<decimal> billion" amount with the words returned by
# decimal_to_words. The pattern requires a decimal point and exactly one
# whitespace character before "billion", so e.g. "$5 billion" is left untouched.
text = re.sub(r'\$(\d+\.\d+)\sbillion', decimal_to_words, text)

print(text)

The president of the USA, Donald Trump, is 190 centimeters high and 78 years old. Forbes Magazine has assessed his wealth, currently estimating it at five point five billion as of mid-February 2025.


### (1 point) 3 - Convert to lowercase except the proper nouns, which must keep their original case. For multiword proper names, join the words into a single token with an underscore (Juan Fernández -> Juan_Fernández).


In [67]:
def process_text(text, proper_nouns=None):
    """Lowercase the text while keeping proper nouns in their original case.

    Multiword proper nouns are first joined into a single token with
    underscores ("Donald Trump" -> "Donald_Trump"). The text is then split on
    whitespace and every word is case-folded unless it must keep its case.

    A word keeps its case when it contains an underscore, is entirely
    upper-case (acronyms such as "USA"), or contains one of the proper nouns
    as a whole word. The last check is done with a word-boundary regex so that
    a proper noun followed by punctuation ("mid-February,") is still
    recognised.

    Args:
        text: The text to normalise.
        proper_nouns: Optional iterable of proper nouns to protect. Defaults
            to the proper nouns of the exercise text.

    Returns:
        The processed text, with words separated by single spaces.
    """
    if proper_nouns is None:
        proper_nouns = ["USA", "Donald Trump", "Forbes Magazine", "mid-February"]

    joined_nouns = []
    for proper in proper_nouns:
        if not proper:
            continue  # an empty entry would match at every word boundary
        joined = proper.replace(" ", "_")
        joined_nouns.append(joined)
        # A callable replacement inserts the text literally, so backslashes in
        # a proper noun are never interpreted as regex group references.
        text = re.sub(r'\b' + re.escape(proper) + r'\b', lambda _m, joined=joined: joined, text)

    protected = None
    if joined_nouns:
        protected = re.compile(r'\b(?:' + '|'.join(re.escape(noun) for noun in joined_nouns) + r')\b')

    processed_words = []
    for word in text.split():
        keep_case = (
            "_" in word
            or word.isupper()
            or (protected is not None and protected.search(word) is not None)
        )
        processed_words.append(word if keep_case else word.casefold())

    return " ".join(processed_words)

# Apply the case normalisation to the text produced by the regex conversions.
processed_text = process_text(text)
print(processed_text)


the president of the USA, Donald_Trump, is 190 centimeters high and 78 years old. Forbes_Magazine has assessed his wealth, currently estimating it at five point five billion as of mid-February 2025.


### (1 point) 4 - Tokenize the text (use the tool you prefer). 


In [68]:
# Tokenize the processed text with NLTK, both into word-level tokens
# (punctuation marks become their own tokens, as the printed output shows)
# and into sentences.
processed_words = nltk.word_tokenize(processed_text)
processed_sentences = nltk.sent_tokenize(processed_text)
print(processed_sentences)
print(processed_words)

['the president of the USA, Donald_Trump, is 190 centimeters high and 78 years old.', 'Forbes_Magazine has assessed his wealth, currently estimating it at five point five billion as of mid-February 2025.']
['the', 'president', 'of', 'the', 'USA', ',', 'Donald_Trump', ',', 'is', '190', 'centimeters', 'high', 'and', '78', 'years', 'old', '.', 'Forbes_Magazine', 'has', 'assessed', 'his', 'wealth', ',', 'currently', 'estimating', 'it', 'at', 'five', 'point', 'five', 'billion', 'as', 'of', 'mid-February', '2025', '.']


### (1 point) 5 - Remove the stopwords (use the tool you prefer). 


In [69]:
# nltk.download('stopwords')

def removestopwords(tokens, stop_words=None, language="english"):
    """Return the tokens that are not stopwords.

    The comparison is case-insensitive: each token is lower-cased before it
    is looked up, and the original casing of kept tokens is preserved.

    Args:
        tokens: Iterable of word tokens.
        stop_words: Optional iterable of stopwords to use instead of the NLTK
            list. They are lower-cased before comparison.
        language: Name of the NLTK stopword list to load when `stop_words`
            is not given. Defaults to "english".

    Returns:
        A list with the tokens that are not stopwords, in their original order.
    """
    if stop_words is None:
        # Requires the NLTK "stopwords" corpus (nltk.download('stopwords')).
        stop_words = nltk.corpus.stopwords.words(language)
    stop_set = {stop_word.lower() for stop_word in stop_words}
    return [word for word in tokens if word.lower() not in stop_set]

# Drop the stopwords from the word tokens; punctuation tokens are kept, as the
# printed output shows.
filtered_words = removestopwords(processed_words)

print(filtered_words)

['president', 'USA', ',', 'Donald_Trump', ',', '190', 'centimeters', 'high', '78', 'years', 'old', '.', 'Forbes_Magazine', 'assessed', 'wealth', ',', 'currently', 'estimating', 'five', 'point', 'five', 'billion', 'mid-February', '2025', '.']


### (1 point) 6 - Create bigrams with pure python.


In [70]:
def generate_bigrams(tokens):
    """Build the list of consecutive token pairs (bigrams) in pure Python.

    Args:
        tokens: A sequence of tokens (it must support slicing).

    Returns:
        A list of `(token, next_token)` tuples in order of appearance; the
        list is empty when there are fewer than two tokens.
    """
    # Pairing the sequence with itself shifted by one yields every adjacent pair.
    return list(zip(tokens, tokens[1:]))

# Build the bigrams of the tokenized text.
# NOTE(review): this uses processed_words (stopwords still present), not the
# filtered_words produced by the stopword-removal step — confirm that is intended.
bigrams = generate_bigrams(processed_words)

print(bigrams)

[('the', 'president'), ('president', 'of'), ('of', 'the'), ('the', 'USA'), ('USA', ','), (',', 'Donald_Trump'), ('Donald_Trump', ','), (',', 'is'), ('is', '190'), ('190', 'centimeters'), ('centimeters', 'high'), ('high', 'and'), ('and', '78'), ('78', 'years'), ('years', 'old'), ('old', '.'), ('.', 'Forbes_Magazine'), ('Forbes_Magazine', 'has'), ('has', 'assessed'), ('assessed', 'his'), ('his', 'wealth'), ('wealth', ','), (',', 'currently'), ('currently', 'estimating'), ('estimating', 'it'), ('it', 'at'), ('at', 'five'), ('five', 'point'), ('point', 'five'), ('five', 'billion'), ('billion', 'as'), ('as', 'of'), ('of', 'mid-February'), ('mid-February', '2025'), ('2025', '.')]


### (2 points) 7 - Create a language model that predicts the next word using bigrams. Please explain in the code how you made the calculations.


In [72]:
def train_bigram_model(tokens):
    """Collect the counts a bigram language model is estimated from.

    Two tallies are built from the bigrams of `tokens`:

    * how many times each pair `(w1, w2)` occurs, and
    * how many times each word occurs as the first element `w1` of a pair.

    Dividing the first by the second gives the estimate
    P(w2 | w1) = count(w1, w2) / count(w1).

    Args:
        tokens: Sequence of tokens to train on.

    Returns:
        A tuple `(bigram_counts, word_counts)` of `defaultdict(int)` objects.
    """
    pair_freq = defaultdict(int)
    context_freq = defaultdict(int)

    for pair in generate_bigrams(tokens):
        pair_freq[pair] += 1
        # Only the first word of the pair is counted: it is the context the
        # next word is conditioned on.
        context_freq[pair[0]] += 1

    return pair_freq, context_freq

def predict_next_word(word, bigram_counts, word_counts):
    """Predict the most likely word to follow `word` under the bigram model.

    For every bigram `(word, w2)` seen in training, the conditional
    probability is computed as

        P(w2 | word) = count(word, w2) / count(word)

    and the `w2` with the highest probability is returned. When several
    candidates tie, the one whose bigram was counted first wins.

    Args:
        word: The current word (matched exactly, case-sensitive).
        bigram_counts: Mapping from `(w1, w2)` pairs to their counts.
        word_counts: Mapping from `w1` to how often it starts a bigram.

    Returns:
        The predicted next word, or None when `word` never starts a bigram.
    """
    candidates = {}
    for (first, second), pair_count in bigram_counts.items():
        if first != word:
            continue
        # Relative frequency of this continuation among all continuations of `word`.
        candidates[second] = pair_count / word_counts[first]

    if not candidates:
        return None  # `word` was never seen as the first element of a bigram

    best_word, _best_probability = max(candidates.items(), key=lambda item: item[1])
    return best_word

# Train the model: count the bigrams of the tokenized text and how often each
# word appears as the first element of a bigram.
bigram_counts, word_counts = train_bigram_model(processed_words)

# Predict the next word after a given word. Surrounding whitespace is stripped
# so that input such as "the " still matches a token; the lookup itself is
# case-sensitive.
current_word = input("Enter a word to predict the next word: ").strip()
predicted_word = predict_next_word(current_word, bigram_counts, word_counts)

# None is the "no prediction" signal, so test for it explicitly rather than
# relying on truthiness.
if predicted_word is not None:
    print(f"The predicted next word after '{current_word}' is: {predicted_word}")
else:
    print(f"No prediction found for the word '{current_word}'.")

The predicted next word after 'Donald_Trump' is: ,
