<a href="https://colab.research.google.com/github/MulukenW/MulukenW/blob/main/AE_CLIR_LSI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine

# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Function to get embeddings
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Sample data
amharic_docs = [
    "እንኳን ደህና መጣህ",
    "እንዴት ነህ",
    "ሰላም እንዴት ነህ",
    "እንኳን ደህና መጣህ",
    "መልካም ጊዜ ይሁንልኝ"
]
english_docs = [
    "Welcome",
    "How are you",
    "Hello how are you",
    "Welcome back",
    "Have a good time"
]

# Get embeddings for documents
amharic_embeddings = get_embeddings(amharic_docs)
english_embeddings = get_embeddings(english_docs)

# Example query
query = "good time"
query_embedding = get_embeddings([query])[0]  # Ensure query embedding is 1-D

# Cross-lingual retrieval function
def find_similar_documents(query_embedding, embeddings, docs):
    similarities = [1 - cosine(query_embedding, doc_embedding) for doc_embedding in embeddings]
    sorted_docs = sorted(zip(docs, similarities), key=lambda x: x[1], reverse=True)
    return sorted_docs

# Find similar Amharic documents for the English query
results = find_similar_documents(query_embedding, amharic_embeddings, amharic_docs)

# Print results
for doc, similarity in results:
    print(f"Document: {doc}, Similarity: {similarity}")


Document: እንዴት ነህ, Similarity: 0.3853154182434082
Document: መልካም ጊዜ ይሁንልኝ, Similarity: 0.3717384934425354
Document: እንኳን ደህና መጣህ, Similarity: 0.37173840403556824
Document: ሰላም እንዴት ነህ, Similarity: 0.37173840403556824
Document: እንኳን ደህና መጣህ, Similarity: 0.37173840403556824


In [8]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Function to get embeddings
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Preprocessing function
def preprocess_text(text):
    # Add language-specific preprocessing here
    return text

# Sample data
amharic_docs = [
    "እንኳን ደህና መጣህ",
    "እንዴት ነህ",
    "ሰላም እንዴት ነህ",
    "እንኳን ደህና መጣህ",
    "መልካም ጊዜ ይሁንልኝ"
]
english_docs = [
    "Welcome",
    "How are you",
    "Hello how are you",
    "Welcome back",
    "Have a good time"
]

# Preprocess documents
amharic_docs = [preprocess_text(doc) for doc in amharic_docs]
english_docs = [preprocess_text(doc) for doc in english_docs]

# Get embeddings for documents
amharic_embeddings = get_embeddings(amharic_docs)
english_embeddings = get_embeddings(english_docs)

# Example query
query = "Hello"
query_embedding = get_embeddings([query])[0]  # Ensure query embedding is 1-D

# Cross-lingual retrieval function
def find_similar_documents(query_embedding, embeddings, docs):
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    sorted_docs = sorted(zip(docs, similarities), key=lambda x: x[1], reverse=True)
    return sorted_docs

# Find similar Amharic documents for the English query
results = find_similar_documents(query_embedding, amharic_embeddings, amharic_docs)

# Print results
for doc, similarity in results:
    print(f"Document: {doc}, Similarity: {similarity}")


Document: እንዴት ነህ, Similarity: 0.41250312328338623
Document: መልካም ጊዜ ይሁንልኝ, Similarity: 0.37651732563972473
Document: እንኳን ደህና መጣህ, Similarity: 0.3765171766281128
Document: ሰላም እንዴት ነህ, Similarity: 0.3765171766281128
Document: እንኳን ደህና መጣህ, Similarity: 0.3765171766281128


In [9]:
# Evaluation function
def evaluate_retrieval(queries, relevant_docs, document_embeddings, documents):
    all_results = []
    for query, relevant in zip(queries, relevant_docs):
        query_embedding = get_embeddings([query])[0]
        results = find_similar_documents(query_embedding, document_embeddings, documents)
        retrieved_docs = [doc for doc, _ in results]
        # Compute precision, recall, etc.
        precision = sum([1 for doc in retrieved_docs if doc in relevant]) / len(retrieved_docs)
        recall = sum([1 for doc in retrieved_docs if doc in relevant]) / len(relevant)
        all_results.append((precision, recall))
    avg_precision = np.mean([res[0] for res in all_results])
    avg_recall = np.mean([res[1] for res in all_results])
    return avg_precision, avg_recall

# Sample queries and relevant documents for evaluation
queries = ["Hello", "Good time"]
relevant_docs = [["ሰላም እንዴት ነህ", "እንዴት ነህ"], ["መልካም ጊዜ ይሁንልኝ"]]

# Evaluate the retrieval system
avg_precision, avg_recall = evaluate_retrieval(queries, relevant_docs, amharic_embeddings, amharic_docs)
print(f"Average Precision: {avg_precision}, Average Recall: {avg_recall}")


Average Precision: 0.30000000000000004, Average Recall: 1.0
