<a href="https://colab.research.google.com/github/Kannan147/demo1/blob/main/Untitled24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
from collections import defaultdict

# Sample parallel corpus (English → French)
corpus = [
    (["the", "house"], ["la", "maison"]),
    (["the", "book"], ["le", "livre"])
]

# Initialize translation probabilities (Uniform distribution)
translation_probs = defaultdict(lambda: defaultdict(lambda: 1.0))
# Initialize transition probabilities (Assume uniform jumps initially)
transition_probs = defaultdict(lambda: defaultdict(lambda: 1.0))

def initialize_probabilities():
    """ Initialize translation and transition probabilities uniformly """
    for eng_sent, fr_sent in corpus:
        eng_vocab = set(eng_sent)
        fr_vocab = set(fr_sent)

        # Uniform translation probabilities
        for e in eng_vocab:
            for f in fr_vocab:
                translation_probs[e][f] = 1.0 / len(fr_vocab)

        # Uniform transition probabilities
        for i in range(len(fr_sent)):
            for j in range(len(eng_sent)):
                transition_probs[i][j] = 1.0 / len(eng_sent)

def expectation_step():
    """ Compute expected counts using current probabilities """
    count_e_f = defaultdict(lambda: defaultdict(float))
    total_f = defaultdict(float)
    count_a = defaultdict(lambda: defaultdict(float))
    total_a = defaultdict(float)

    for eng_sent, fr_sent in corpus:
        len_e = len(eng_sent)
        len_f = len(fr_sent)

        # Compute alignment probabilities for each French word
        alignment_probs = np.zeros((len_f, len_e))
        for j, f in enumerate(fr_sent):
            normalization_factor = 0
            for i, e in enumerate(eng_sent):
                alignment_probs[j][i] = transition_probs[j][i] * translation_probs[e][f]
                normalization_factor += alignment_probs[j][i]

            # Normalize alignment probabilities
            for i, e in enumerate(eng_sent):
                alignment_probs[j][i] /= normalization_factor
                count_e_f[e][f] += alignment_probs[j][i]
                total_f[f] += alignment_probs[j][i]
                count_a[j][i] += alignment_probs[j][i]
                total_a[j] += alignment_probs[j][i]

    return count_e_f, total_f, count_a, total_a

def maximization_step(count_e_f, total_f, count_a, total_a):
    """ Update translation and transition probabilities """
    for e in translation_probs:
        for f in translation_probs[e]:
            translation_probs[e][f] = count_e_f[e][f] / total_f[f] if total_f[f] > 0 else 0

    for j in transition_probs:
        for i in transition_probs[j]:
            transition_probs[j][i] = count_a[j][i] / total_a[j] if total_a[j] > 0 else 0

def train_hmm_mt(iterations=5):
    """ Run the EM algorithm for a given number of iterations """
    initialize_probabilities()
    for _ in range(iterations):
        count_e_f, total_f, count_a, total_a = expectation_step()
        maximization_step(count_e_f, total_f, count_a, total_a)

def viterbi_align(eng_sent, fr_sent):
    """ Perform alignment using the Viterbi algorithm """
    len_e = len(eng_sent)
    len_f = len(fr_sent)
    best_alignment = []

    for j, f in enumerate(fr_sent):
        best_prob = 0
        best_index = 0
        for i, e in enumerate(eng_sent):
            prob = transition_probs[j][i] * translation_probs[e][f]
            if prob > best_prob:
                best_prob = prob
                best_index = i
        best_alignment.append((best_index, j))  # (English index, French index)

    return best_alignment

# Train HMM-based MT model
train_hmm_mt()

# Test alignment using Viterbi decoding
eng_test = ["the", "house"]
fr_test = ["la", "maison"]
alignment = viterbi_align(eng_test, fr_test)

# Print results
print("Word Alignment (English → French):")
for eng_index, fr_index in alignment:
    print(f"{eng_test[eng_index]} → {fr_test[fr_index]}")



Word Alignment (English → French):
the → la
the → maison


In [6]:
import numpy as np
from collections import defaultdict

# Example parallel corpus (English -> French)
corpus = [
    (["the", "house"], ["la", "maison"]),
    (["the", "book"], ["le", "livre"]),
    (["a", "house"], ["une", "maison"]),
    (["a", "book"], ["un", "livre"]),
    (["green", "house"], ["casa", "verde"]),
    (["the", "house"], ["la", "casa"])
]

# Step 1: Initialize translation probabilities uniformly
def initialize_translation_probabilities(corpus):
    translation_probs = defaultdict(lambda: defaultdict(lambda: 1.0))
    for eng_sentence, fr_sentence in corpus:
        for e in eng_sentence:
            for f in fr_sentence:
                translation_probs[e][f] = 1.0 / len(fr_sentence)  # Uniform probability
    return translation_probs

# Step 2 & 3: Expectation-Maximization Algorithm
def train_ibm_model_1(corpus, num_iterations=10):
    # Initialize translation probabilities
    t = initialize_translation_probabilities(corpus)

    for iteration in range(num_iterations):
        count = defaultdict(lambda: defaultdict(float))  # Expected counts
        total = defaultdict(float)  # Total counts for normalization

        # E-Step: Compute expected counts
        for eng_sentence, fr_sentence in corpus:
            for f in fr_sentence:
                # Compute denominator for normalization
                total_s = sum(t[e][f] for e in eng_sentence)
                for e in eng_sentence:
                    count[e][f] += t[e][f] / total_s
                    total[e] += t[e][f] / total_s


        # M-Step: Normalize to get updated probabilities

        for e in count:
            for f in count[e]:
                t[e][f] = count[e][f] / total[e]  # Update translation probability
                # print(e,f,t[e][f])

        # Display progress
        print(f"Iteration {iteration + 1} complete.")

    return t

# Train the IBM Model 1
translation_probs = train_ibm_model_1(corpus, num_iterations=10)

# Print learned translation probabilities
for e in translation_probs:
    print(f"\nTranslations for '{e}':")
    for f, prob in translation_probs[e].items():
        print(f"  P({f} | {e}) = {prob:.4f}")
import numpy as np
from collections import defaultdict

# Example parallel corpus (English -> French)
corpus = [
    (["the", "house"], ["la", "maison"]),
    (["the", "book"], ["le", "livre"]),
    (["a", "house"], ["une", "maison"]),
    (["a", "book"], ["un", "livre"]),
    (["green", "house"], ["casa", "verde"]),
    (["the", "house"], ["la", "casa"])
]

# Step 1: Initialize translation probabilities uniformly
def initialize_translation_probabilities(corpus):
    translation_probs = defaultdict(lambda: defaultdict(lambda: 1.0))
    for eng_sentence, fr_sentence in corpus:
        for e in eng_sentence:
            for f in fr_sentence:
                translation_probs[e][f] = 1.0 / len(fr_sentence)  # Uniform probability
    return translation_probs

# Step 2 & 3: Expectation-Maximization Algorithm
def train_ibm_model_1(corpus, num_iterations=10):
    # Initialize translation probabilities
    t = initialize_translation_probabilities(corpus)

    for iteration in range(num_iterations):
        count = defaultdict(lambda: defaultdict(float))  # Expected counts
        total = defaultdict(float)  # Total counts for normalization

        # E-Step: Compute expected counts
        for eng_sentence, fr_sentence in corpus:
            for f in fr_sentence:
                # Compute denominator for normalization
                total_s = sum(t[e][f] for e in eng_sentence)
                for e in eng_sentence:
                    count[e][f] += t[e][f] / total_s
                    total[e] += t[e][f] / total_s


        # M-Step: Normalize to get updated probabilities

        for e in count:
            for f in count[e]:
                t[e][f] = count[e][f] / total[e]  # Update translation probability
                # print(e,f,t[e][f])

        # Display progress
        print(f"Iteration {iteration + 1} complete.")

    return t

# Train the IBM Model 1
translation_probs = train_ibm_model_1(corpus, num_iterations=10)

# Print learned translation probabilities
for e in translation_probs:
    print(f"\nTranslations for '{e}':")
    for f, prob in translation_probs[e].items():
        print(f"  P({f} | {e}) = {prob:.4f}")
#Share excel including BLEU
#Share the 2 pieces of code
#has context menu

Iteration 1 complete.
Iteration 2 complete.
Iteration 3 complete.
Iteration 4 complete.
Iteration 5 complete.
Iteration 6 complete.
Iteration 7 complete.
Iteration 8 complete.
Iteration 9 complete.
Iteration 10 complete.

Translations for 'the':
  P(la | the) = 0.7352
  P(maison | the) = 0.0171
  P(le | the) = 0.2004
  P(livre | the) = 0.0010
  P(casa | the) = 0.0464

Translations for 'house':
  P(la | house) = 0.0395
  P(maison | house) = 0.5336
  P(une | house) = 0.0024
  P(casa | house) = 0.4244
  P(verde | house) = 0.0001

Translations for 'book':
  P(le | book) = 0.1910
  P(livre | book) = 0.7701
  P(un | book) = 0.0389

Translations for 'a':
  P(une | a) = 0.4906
  P(maison | a) = 0.0576
  P(un | a) = 0.4451
  P(livre | a) = 0.0067

Translations for 'green':
  P(casa | green) = 0.2954
  P(verde | green) = 0.7046
Iteration 1 complete.
Iteration 2 complete.
Iteration 3 complete.
Iteration 4 complete.
Iteration 5 complete.
Iteration 6 complete.
Iteration 7 complete.
Iteration 8 comp