In [1]:
import random
import nltk
from nltk import bigrams, FreqDist, ConditionalFreqDist
from itertools import islice
import os
import string
import time
import tkinter as tk
from tkinter import messagebox

In [2]:
# Tokenizer data required by nltk.word_tokenize.
# "punkt" is the legacy package; NLTK 3.8.2 and later load "punkt_tab"
# instead, so both are fetched to keep the notebook runnable on old and
# new NLTK releases. Packages that are already present are not re-downloaded.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Wall-clock reference point; the end of this cell prints the seconds elapsed since here.
start_time = time.time()

# Folder holding the corpus files, resolved relative to the working directory.
input_data_dir = "data"
# Translation table that sends every ASCII punctuation code point to a space.
punctuation = dict.fromkeys(map(ord, string.punctuation), ord(" "))

def is_hidden(filepath):
    """Return True when the last component of ``filepath`` starts with a dot."""
    _parent, leaf = os.path.split(filepath)
    return leaf.startswith(".")

# --- Load the corpus --------------------------------------------------------
# Read every visible regular file in the data directory, replacing punctuation
# with spaces as each file is read.
#  * sorted(): os.listdir returns entries in arbitrary order; a fixed order
#    makes the resulting model reproducible between runs and machines.
#  * isfile(): sub-directories would make open() fail.
#  * encoding: assumes the corpus is UTF-8 (TODO confirm for your data);
#    undecodable bytes are replaced rather than aborting the whole load.
corpus_parts = []
for filename in sorted(os.listdir(input_data_dir)):
    filepath = os.path.join(input_data_dir, filename)
    if is_hidden(filepath) or not os.path.isfile(filepath):
        continue
    with open(filepath, encoding="utf-8", errors="replace") as infile:
        corpus_parts.append(infile.read().translate(punctuation))

# Join with a newline so the last word of one file cannot fuse with the first
# word of the next file into a single bogus token.
text_data = "\n".join(corpus_parts)

# --- Count bigrams ----------------------------------------------------------
words = nltk.word_tokenize(text_data.lower())
bi_grams = list(bigrams(words))
bi_gram_freq_dist = FreqDist(bi_grams)

# Number of most frequent followers kept for each first word.
topk = 3

# Group (frequency, follower) pairs under the word they follow.
followers_by_first_word = {}
for (first_word, second_word), freq in bi_gram_freq_dist.items():
    followers_by_first_word.setdefault(first_word, []).append((freq, second_word))

# Keep only the topk followers per word. Sorting the (freq, word) tuples in
# descending order means ties in frequency are broken by the word itself,
# also in descending order.
top_bigrams_per_first_word = {
    first_word: [second_word for _, second_word in sorted(candidates, reverse=True)[:topk]]
    for first_word, candidates in followers_by_first_word.items()
}

# --- Build the lookup used for generation -----------------------------------
# Each surviving (first, second) pair is inserted exactly once, so every kept
# follower has count 1 and generation picks uniformly among them.
filtered_bi_grams = [
    (first_word, second_word)
    for first_word, kept_followers in top_bigrams_per_first_word.items()
    for second_word in kept_followers
]
bi_gram_freq = ConditionalFreqDist(filtered_bi_grams)

# Report how long loading and model construction took, in seconds.
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)

14.590470314025879


In [4]:
def generate_sentence(word, num_words):
    """Print a chain of at most ``num_words`` words starting from ``word``.

    The seed is lower-cased first. Each printed word is followed by a single
    space; the next word is drawn at random from the keys stored under the
    current word in ``bi_gram_freq``. The chain ends early when the current
    word has no recorded followers, and a newline is always printed last.
    """
    current = word.lower()
    for _step in range(num_words):
        print(current, end=" ")
        successors = list(bi_gram_freq[current])
        if not successors:
            # Dead end: nothing was recorded after this word.
            break
        current = random.choice(successors)
    print()

In [6]:
def generate_sentence():
    """Button callback: build a word chain from the GUI inputs and display it.

    Reads the seed word from ``entry_word`` and the requested length from
    ``entry_num_words``, walks ``bi_gram_freq`` choosing a random follower at
    each step, and writes the capitalised result to ``output_label``.

    An error dialog is shown, and the output label is left untouched, when
    the seed word is blank or the length is not a positive integer.

    NOTE: this definition shares its name with the console helper
    ``generate_sentence(word, num_words)`` defined earlier in the notebook
    and replaces it once this cell has run.
    """
    # Surrounding whitespace would never match a (lower-cased) corpus token.
    input_word = entry_word.get().strip().lower()
    if not input_word:
        messagebox.showerror("Error", "Please enter an input word.")
        return

    try:
        num_words = int(entry_num_words.get())
    except ValueError:
        num_words = 0
    # Zero or negative counts would silently produce an empty sentence.
    if num_words < 1:
        messagebox.showerror("Error", "Please enter a valid number of words.")
        return

    chain = []
    for _ in range(num_words):
        chain.append(input_word)
        # .get() instead of indexing: ConditionalFreqDist is defaultdict-like,
        # so indexing an unknown word would add an empty entry to the model.
        followers = bi_gram_freq.get(input_word)
        if not followers:
            break
        input_word = random.choice(list(followers))

    # Joining avoids the trailing space that per-word concatenation leaves.
    output_label.config(text=" ".join(chain).capitalize())

# --- Application window ---
root = tk.Tk()
root.wm_title("PEKKA")

# --- Input area: a grid with two prompt rows and the trigger button ---
input_frame = tk.Frame(master=root)
input_frame.pack(pady=10)

# Row 0: seed word prompt; keyboard focus starts in this entry
label_word = tk.Label(master=input_frame, text="Input word:")
label_word.grid(column=0, row=0)
entry_word = tk.Entry(master=input_frame)
entry_word.grid(column=1, row=0)
entry_word.focus_set()

# Row 1: requested sentence length
label_num_words = tk.Label(master=input_frame, text="Number of words:")
label_num_words.grid(column=0, row=1)
entry_num_words = tk.Entry(master=input_frame)
entry_num_words.grid(column=1, row=1)

# Row 2: button spanning both columns; clicking it calls generate_sentence
button_generate = tk.Button(
    master=input_frame,
    text="Generate",
    command=generate_sentence,
)
button_generate.grid(row=2, column=0, columnspan=2)

# --- Output area ---
output_frame = tk.Frame(master=root)
output_frame.pack(pady=10)

# Label that receives the generated sentence; text wraps at 400 screen units
output_label = tk.Label(master=output_frame, text="", wraplength=400)
output_label.pack()

# Enter the Tk event loop; this call does not return until the window closes.
root.mainloop()
