<a href="https://colab.research.google.com/github/LunaSeline/AHMD/blob/main/Prompt_completion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install cmake (if not already installed)
!apt-get install -y cmake

# Install transformers (should be pre-installed, but this ensures it)
!pip install transformers
!pip install gtts

# Clone KenLM (if not already cloned) and build its binaries.
!git clone https://github.com/kpu/kenlm.git || echo "KenLM repository already exists."
!cd kenlm && mkdir -p build && cd build && cmake .. && make -j4
!pip install https://github.com/kpu/kenlm/archive/master.zip

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gtts)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gtts
  Attempting uninstall: click
    Found existing installation: click 8.2.0
    Uninstalling click-8.2.0:
      Successfully uninstalled click-8.2.0
Successfully installed click-8.1.8 gtts-2.5.4
fatal: destination path 'kenlm' already exists and is not an empty directory.
KenLM repository already exists.
-- Could NOT find Eigen3 (missi

In [4]:
import kenlm
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from gtts import gTTS
from io import BytesIO
from IPython.display import Audio, display, HTML

# Load KenLM model
kenlm_model = kenlm.Model('adl_model.klm')
file_path = 'adl_corpus.txt'

def load_corpus(file_path):
    """Load corpus from a text file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file.readlines()]

def kenlm_complete(prompt, max_words=10, corpus_file="adl_corpus.txt", temperature=1.0):
    # Load corpus sentences from the file
    corpus_sentences = load_corpus(corpus_file)

    # Build a vocabulary from the corpus
    vocab = set()
    for sentence in corpus_sentences:
        for word in sentence.split():
            vocab.add(word)
    vocab = list(vocab)

    completion = prompt.strip()
    for _ in range(max_words):
        # Get scores for all words
        candidates = []
        for word in vocab:
            candidate = completion + ' ' + word
            score = kenlm_model.score(candidate, bos=False, eos=False)
            candidates.append((word, score))

        # Sort by score and apply temperature sampling
        candidates.sort(key=lambda x: x[1], reverse=True)

        # Take top candidates based on temperature
        top_n = max(1, int(len(candidates) * 0.1))  # Take top 10% of candidates

        # Apply temperature by adding randomness to selection
        if temperature > 0:
            # Higher temperature = more randomness
            weights = [pow(2, c[1] * temperature) for c in candidates[:top_n]]
            total = sum(weights)
            probabilities = [w/total for w in weights]
            best_word = random.choices(
                [c[0] for c in candidates[:top_n]],
                weights=probabilities,
                k=1
            )[0]
        else:
            # Temperature of 0 means just take the best word
            best_word = candidates[0][0]

        if best_word is None:
            break

        # Avoid repeating words too much
        if best_word in completion.split()[-3:]:
            break

        completion += ' ' + best_word

        # If punctuation is detected, consider stopping early
        if best_word.endswith(('.', '?', '!')):
            if random.random() < 0.7:  # 70% chance to stop after punctuation
                break

    return completion

# Initialize GPT-2 model using DistilGPT2
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
distilgpt_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
generator = pipeline("text-generation", model=distilgpt_model, tokenizer=tokenizer)

def gpt2_complete(prompt, max_new_tokens=20, temperature=0.8, num_return_sequences=1):
    """Complete text using GPT-2 with various temperatures."""
    result = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        num_return_sequences=num_return_sequences,
        return_full_text=False  # Return only the completion
    )
    return [item['generated_text'] for item in result]

# Initialize translation model
model_name = "facebook/mbart-large-50-many-to-many-mmt"
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
mbart_model = MBartForConditionalGeneration.from_pretrained(model_name)
mbart_tokenizer.src_lang = "en_XX"

def translate_to_hindi(text):
    """Translate text to Hindi using mBART."""
    inputs = mbart_tokenizer(text, return_tensors="pt")
    generated_tokens = mbart_model.generate(
        **inputs,
        forced_bos_token_id=mbart_tokenizer.lang_code_to_id["hi_IN"],
        max_length=200
    )
    translation = mbart_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    if isinstance(translation, list):
        translation = translation[0]
    return translation

def text_to_speech(text, lang='en'):
    """Generate speech from text and return Audio object."""
    tts = gTTS(text=text, lang=lang)
    audio_buffer = BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    return Audio(audio_buffer.read(), autoplay=False)

def generate_multiple_completions(prompt, num_completions=5):
    """Generate multiple different completions for the same prompt."""
    results = []

    # Get KenLM completions with different temperatures
    for temp in [0.5, 1.0, 1.5]:
        kenlm_result = kenlm_complete(prompt, max_words=10, temperature=temp)
        if kenlm_result.strip() != prompt.strip():
            kenlm_score = kenlm_model.score(kenlm_result, bos=False, eos=False)
            results.append({
                'text': kenlm_result,
                'method': f'KenLM (temp={temp})',
                'score': kenlm_score
            })

    # Get GPT-2 completions with different temperatures
    for temp in [0.7, 1.0, 1.3]:
        gpt_results = gpt2_complete(prompt, temperature=temp, num_return_sequences=2)
        for i, gpt_result in enumerate(gpt_results):
            results.append({
                'text': prompt + " " + gpt_result,
                'method': f'GPT-2 (temp={temp}, seq={i+1})',
                'score': None
            })

    # Add translations and audio
    for result in results:
        result['translation'] = translate_to_hindi(result['text'])

    return results

def display_completions(results):
    """Display all completions with their translations and audio."""
    print(f"Generated {len(results)} different completions:\n")

    display(HTML("<h2>Multiple Text Completions</h2>"))

    for i, result in enumerate(results):
        print(f"\n--- Completion #{i+1} ({result['method']}) ---")
        print(f"English: {result['text']}")
        print(f"Hindi translation: {result['translation']}")

        # Display translated audio
        display(HTML(f"<p><b>Hindi Audio (Completion #{i+1}):</b></p>"))
        hindi_audio = text_to_speech(f"क्या आप ये बोलना चाहते है: {result['translation']}", lang='hi')
        display(hindi_audio)

        # Score info if available
        if result['score'] is not None:
            print(f"KenLM Score: {result['score']:.2f}")

        print("-" * 50)

# Test the function
test_prompt = "I want to eat"
results = generate_multiple_completions(test_prompt)
display_completions(results)

# Bonus - allow users to try different prompts
from IPython.display import clear_output

def interactive_completion():
    prompt = input("Enter a prompt to complete (or 'quit' to exit): ")
    if prompt.lower() == 'quit':
        return

    clear_output()
    print(f"Generating completions for: '{prompt}'")
    results = generate_multiple_completions(prompt)
    display_completions(results)

    # Recursively call for next input
    interactive_completion()

# Uncomment to enable interactive mode
# interactive_completion()

Device set to use cpu


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated 9 different completions:




--- Completion #1 (KenLM (temp=0.5)) ---
English: I want to eat of need sad feeling take soon.
Hindi translation: मैं जल्दी ही दुःख की भावना लेने की जरूरत से खाना चाहता हूं।


KenLM Score: -22.96
--------------------------------------------------

--- Completion #2 (KenLM (temp=1.0)) ---
English: I want to eat take 9 mop without a visit
Hindi translation: मैं बिना किसी मुलाकात के 9 माप ले जाना चाहता हूँ


KenLM Score: -23.53
--------------------------------------------------

--- Completion #3 (KenLM (temp=1.5)) ---
English: I want to eat of a you set to video attend should track a
Hindi translation: मैं एक आप वीडियो भाग लेने के लिए सेट करने के लिए एक ट्रैक करना चाहता हूँ एक


KenLM Score: -31.03
--------------------------------------------------

--- Completion #4 (GPT-2 (temp=0.7, seq=1)) ---
English: I want to eat  more food but we have to make it more food.”



If you're
Hindi translation: मैं अधिक भोजन खाना चाहता हूँ लेकिन हम इसे अधिक भोजन बनाना है। "यदि आप


--------------------------------------------------

--- Completion #5 (GPT-2 (temp=0.7, seq=2)) ---
English: I want to eat  more than you eat, but I can't afford to have to eat more than I can afford to
Hindi translation: मैं तुम खाते से अधिक खाना चाहता हूँ, लेकिन मैं करने के लिए मैं करने के लिए करने के लिए सक्षम नहीं हो सकता है से अधिक खाना चाहता हूँ


--------------------------------------------------

--- Completion #6 (GPT-2 (temp=1.0, seq=1)) ---
English: I want to eat  food, but this would be a problem for me, of food which I've eaten twice in the
Hindi translation: मैं भोजन खाना चाहता हूँ, लेकिन यह मेरे लिए एक समस्या होगी, भोजन जो मैंने दो बार खाया है


--------------------------------------------------

--- Completion #7 (GPT-2 (temp=1.0, seq=2)) ---
English: I want to eat  more," she explained.




"I think if the food thing does it makes
Hindi translation: मैं अधिक खाना चाहता हूँ," उसने स्पष्ट किया. "मैं सोचता हूँ कि यदि भोजन बात करता है यह करता है


--------------------------------------------------

--- Completion #8 (GPT-2 (temp=1.3, seq=1)) ---
English: I want to eat  with him, he needs to get to him, and then that's when you start asking."

Hindi translation: मैं उसके साथ खाना चाहता हूँ, उसे उसके पास जाना चाहिए, और तभी तुम पूछना शुरू करोगे। "


--------------------------------------------------

--- Completion #9 (GPT-2 (temp=1.3, seq=2)) ---
English: I want to eat  this as easy as it seems today because it‭¿ can´t handle it. We
Hindi translation: मैं इसे आज की तरह आसानी से खाना चाहता हूँ क्योंकि यह इसे संभाल नहीं कर सकता है. हम


--------------------------------------------------


In [None]:
import kenlm
import math
import random
import re
import time  # For time.sleep()
from transformers import pipeline, MBartForConditionalGeneration, MBart50TokenizerFast
from gtts import gTTS
from io import BytesIO
from IPython.display import Audio, display

#############################################
# Step 0: Load KenLM and Build Vocabulary
#############################################

# Load your KenLM model.
kenlm_model = kenlm.Model('adl_model.klm')

# Build the vocabulary from your corpus file.
file_path = 'adl_corpus.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    corpus_sentences = [line.strip() for line in file.readlines()]

#############################################
# Step 1: KenLM Sampling Function
#############################################

def kenlm_sampling(prompt, max_words=10, temperature=1.0, top_k=5):
    """
    Generate a raw completion by word-by-word sampling using KenLM scores.
    """
    # Build vocabulary from corpus.
    vocab = set()
    for sentence in corpus_sentences:
        for word in sentence.split():
            vocab.add(word)
    vocab = list(vocab)

    output = prompt.strip()
    for _ in range(max_words):
        scores = []
        # Score each candidate word.
        for word in vocab:
            candidate = output + ' ' + word
            score_val = kenlm_model.score(candidate, bos=False, eos=False)
            scores.append(score_val)

        # Convert scores to probabilities via softmax (with temperature).
        max_score = max(scores)
        adjusted_scores = [(s - max_score) / temperature for s in scores]
        exp_scores = [math.exp(s) for s in adjusted_scores]
        sum_exp = sum(exp_scores)
        probs = [s / sum_exp for s in exp_scores]

        # Limit to top_k words.
        top_indices = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:top_k]
        top_words = [vocab[i] for i in top_indices]
        top_probs = [probs[i] for i in top_indices]

        # Sample one word from top candidates.
        chosen = random.choices(top_words, weights=top_probs, k=1)[0]
        output += ' ' + chosen

        # Stop early if punctuation is appended.
        if chosen.endswith(('.', '?', '!')):
            break
    return output

#############################################
# New Step: Remove Repeated Words
#############################################

def remove_repeated_words(text):
    """
    Remove consecutive duplicate words from the text.
    For example, transforms "my my my" into "my".
    """
    # The regex matches one word followed by at least one repetition of that word.
    # The \b indicates word boundary and the re.IGNORECASE handles case-insensitive matching.
    pattern = re.compile(r'\b(\w+)(\s+\1\b)+', re.IGNORECASE)
    return pattern.sub(r'\1', text)

#############################################
# Step 2: Neural Rewriter (Paraphraser)
#############################################

paraphraser = pipeline("text2text-generation",
                       model="Vamsi/T5_Paraphrase_Paws",
                       tokenizer="Vamsi/T5_Paraphrase_Paws")

def neural_rewrite(text, max_length=50, num_beams=5):
    """
    Rewrite the given text using a neural paraphraser.
    """
    rewriting_prompt = f"paraphrase: {text} </s>"
    rewritten = paraphraser(rewriting_prompt, max_length=max_length, num_beams=num_beams)
    return rewritten[0]['generated_text']

#############################################
# Step 3: Grammar Correction
#############################################

grammar_corrector = pipeline("text2text-generation",
                             model="prithivida/grammar_error_correcter_v1",
                             tokenizer="prithivida/grammar_error_correcter_v1")

def grammar_correct(text, max_length=60):
    """
    Correct the grammar of the given text.
    """
    corrected = grammar_corrector(text, max_length=max_length)
    return corrected[0]['generated_text']

#############################################
# Step 4: Ensure Single Sentence
#############################################

def ensure_single_sentence(text):
    """
    Reduce multi-sentence text down to the first sentence.
    """
    sentences = re.split(r'[.!?]', text)
    for sent in sentences:
        sent = sent.strip()
        if sent:
            return sent + '.'
    return text.strip()

#############################################
# Step 5: Generate Multiple Options
#############################################

def generate_multiple_options(prompt, n_options=3, max_words=10, temperature=1.0, top_k=5,
                              paraphrase_max_length=50, paraphrase_num_beams=5, correct_max_length=60):
    """
    Generate candidate outputs by combining KenLM sampling, neural rewrite, grammar correction,
    and then post-process to remove repetitions and ensure a single sentence.
    """
    options = []
    for _ in range(n_options):
        raw_text = kenlm_sampling(prompt, max_words=max_words, temperature=temperature, top_k=top_k)
        rewritten_text = neural_rewrite(raw_text, max_length=paraphrase_max_length, num_beams=paraphrase_num_beams)
        corrected_text = grammar_correct(rewritten_text, max_length=correct_max_length)
        # Remove repeated words.
        cleaned_text = remove_repeated_words(corrected_text)
        single_sentence = ensure_single_sentence(cleaned_text)
        options.append(single_sentence)
    return options

#############################################
# Step 6: Translation Setup for MBart
#############################################

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer_mbart = MBart50TokenizerFast.from_pretrained(model_name)
model_mbart = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer_mbart.src_lang = "en_XX"

#############################################
# Step 7: Interactive Completion with Audio for Every Option
#############################################

def interactive_completion(prompt, n_options=3, max_words=10, temperature=1.0, top_k=5,
                           paraphrase_max_length=50, paraphrase_num_beams=5, correct_max_length=60):
    """
    Generate candidate outputs (each cleaned and reduced to a single sentence), display the text,
    produce TTS for each candidate and its translation (each followed by a 2-second pause),
    and let the user select one or request more options.
    """
    while True:
        candidates = generate_multiple_options(prompt, n_options=n_options, max_words=max_words,
                                                 temperature=temperature, top_k=top_k,
                                                 paraphrase_max_length=paraphrase_max_length,
                                                 paraphrase_num_beams=paraphrase_num_beams,
                                                 correct_max_length=correct_max_length)
        print("\nCandidate Options (single sentence outputs):\n")
        for idx, option in enumerate(candidates):
            print(f"Option {idx+1}: {option}\n")

            # Produce TTS for the candidate.
            tts_option = gTTS(text="Did you mean: " + option, lang='hi')
            audio_buf_option = BytesIO()
            tts_option.write_to_fp(audio_buf_option)
            audio_buf_option.seek(0)
            print(f"Playing Speech for Option {idx+1}:")
            display(Audio(audio_buf_option.read(), autoplay=True))
            time.sleep(2)  # 2-second gap

            # Translate candidate using MBart.
            inputs_option = tokenizer_mbart(option, return_tensors="pt")
            generated_tokens_option = model_mbart.generate(**inputs_option,
                                                           forced_bos_token_id=tokenizer_mbart.lang_code_to_id["hi_IN"])
            translation_option = tokenizer_mbart.batch_decode(generated_tokens_option, skip_special_tokens=True)
            if isinstance(translation_option, list):
                translation_option = " ".join(translation_option)
            print(f"Translated Option {idx+1}: {translation_option}\n")

            # Produce TTS for the translation.
            tts_translation_option = gTTS(text="क्या आप ये बोलना चाहते हैं: " + translation_option, lang='hi')
            audio_buf_translation = BytesIO()
            tts_translation_option.write_to_fp(audio_buf_translation)
            audio_buf_translation.seek(0)
            print(f"Playing Translated Speech for Option {idx+1}:")
            display(Audio(audio_buf_translation.read(), autoplay=True))
            time.sleep(2)  # 2-second gap

        user_input = input("Enter the option number you prefer (e.g., 1) or type 'n' for more options: ").strip().lower()
        if user_input == 'n':
            print("Generating additional options...\n")
            continue
        try:
            choice = int(user_input)
            if 1 <= choice <= len(candidates):
                return candidates[choice-1]
            else:
                print("Invalid option number, please try again.\n")
        except ValueError:
            print("Invalid input, please enter a valid number or 'n'.\n")

#############################################
# Step 8: Main Execution and Final Audio Output
#############################################

base_prompt = "I want to"
final_output = interactive_completion(base_prompt, n_options=3, max_words=10, temperature=1.0, top_k=5,
                                      paraphrase_max_length=50, paraphrase_num_beams=5, correct_max_length=60)
print("\nFinal accepted output:")
print(final_output)

# Produce TTS for the final accepted output.
tts_final = gTTS(text="Did you mean: " + final_output, lang='hi')
audio_buf_final = BytesIO()
tts_final.write_to_fp(audio_buf_final)
audio_buf_final.seek(0)
print("\nPlaying audio for the final accepted output:")
display(Audio(audio_buf_final.read(), autoplay=True))
time.sleep(2)  # 2-second gap

# Translate the final accepted text.
inputs_final = tokenizer_mbart(final_output, return_tensors="pt")
generated_tokens_final = model_mbart.generate(**inputs_final, forced_bos_token_id=tokenizer_mbart.lang_code_to_id["hi_IN"])
translation_final = tokenizer_mbart.batch_decode(generated_tokens_final, skip_special_tokens=True)
if isinstance(translation_final, list):
    translation_final = " ".join(translation_final)
print("\nTranslated final text:")
print(translation_final)

tts_trans_final = gTTS(text="क्या आप ये बोलना चाहते हैं: " + translation_final, lang='hi')
audio_buf_trans_final = BytesIO()
tts_trans_final.write_to_fp(audio_buf_trans_final)
audio_buf_trans_final.seek(0)
print("\nPlaying translated audio for the final accepted output:")
display(Audio(audio_buf_trans_final.read(), autoplay=True))


Device set to use cpu
Device set to use cpu



Candidate Options (single sentence outputs):

Option 1: I want to text for a bit.

Playing Speech for Option 1:


Translated Option 1: मैं कुछ समय के लिए पाठ करना चाहता हूँ।

Playing Translated Speech for Option 1:


Option 2: Do I want to go to the post office.

Playing Speech for Option 2:


Translated Option 2: क्या मैं डाकघर जाना चाहता हूँ।

Playing Translated Speech for Option 2:


Option 3: I want to get ready for bed.

Playing Speech for Option 3:


Translated Option 3: मैं बिस्तर के लिए तैयार होना चाहता हूँ।

Playing Translated Speech for Option 3:


Enter the option number you prefer (e.g., 1) or type 'n' for more options: 3

Final accepted output:
I want to get ready for bed.

Playing audio for the final accepted output:



Translated final text:
मैं बिस्तर के लिए तैयार होना चाहता हूँ।

Playing translated audio for the final accepted output:
