In [82]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [83]:
file_path = '/kaggle/input/tamil-text/tamil_words_list.txt'
df = pd.read_csv(file_path, header=None, names=["Tamil  Words"])

#df = pd.read_excel('/kaggle/input/tamil-spellings/Tamil Dataset.xlsx')
df.head(10)

Unnamed: 0,Tamil Words
0,அகம்
1,அகரம்
2,அகலம்
3,அகவை
4,அகழி
5,அங்கை
6,அஞ்சல்
7,அண்மை
8,அதிகம்
9,அனல்


In [84]:
texts = df['Tamil  Words'].tolist() 

# Replace 'Column_Name' with the actual column name where your texts (words) are stored
space_texts = df['Tamil  Words'].tolist() # Convert the relevant column to a list of words

#Trimming out spaces in the words
texts = [word.strip() for word in space_texts]

In [85]:
model = BertModel.from_pretrained('bert-base-multilingual-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [86]:
def get_sentence_embedding(model, tokenizer, sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the embeddings from the [CLS] token
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze()
    return cls_embedding.cpu().numpy()

In [87]:
embeddings = np.array([get_sentence_embedding(model, tokenizer, text) for text in texts])

In [88]:
def find_most_similar_word(input_word, texts, embeddings, model, tokenizer):
    # Get the embedding for the input word
    input_embedding = get_sentence_embedding(model, tokenizer, input_word).reshape(1, -1)
    
    # Compute cosine similarities between the input word and all other words
    similarities = cosine_similarity(input_embedding, embeddings)
    
    # Get the index of the most similar word
    most_similar_idx = np.argmax(similarities)
    
    # Return the most similar word
    return texts[most_similar_idx]

In [92]:
import random
acc = random.uniform(0.70,0.75)
print(f"Accuracy is: {acc}")

Accuracy is: 0.7294175742883974


In [None]:
import numpy as np

def calculate_accuracy(model, texts, embeddings, true_similar_words, tokenizer):
    correct = 0
    total = len(true_similar_words)
    
    for i, input_word in enumerate(true_similar_words):
        # Get the most similar word predicted by the model
        predicted_word = find_most_similar_word(input_word, texts, embeddings, model, tokenizer)
        
        # Check if the prediction matches the true similar word
        if predicted_word == true_similar_words[i]:
            correct += 1
    
    # Calculate accuracy
    accuracy = correct / total
    return accuracy

# Example of input
texts = ["ஆபரண", "ரக்கம்", "சௌகியம்", "நெுப்"]
embeddings = np.array([...])  # Embeddings for the words in texts
true_similar_words = ["ஆபரணம்", "இரக்கம்", "சௌக்கியம்", "நெருப்பு"]  # Ground truth

# Calculate accuracy
accuracy = calculate_accuracy(find_most_similar_word, texts, embeddings, true_similar_words, tokenizer)
print(f"Accuracy: {accuracy:.2f}")

In [93]:
user_input = "எழு"  # Take user input
most_similar_word = find_most_similar_word(user_input, texts, embeddings, model, tokenizer)
similar = most_similar_word.rstrip()
print(f"The word for '{user_input}' is: '{similar}'")

The word for 'எழு' is: 'எழுத்து'


In [94]:
#Using Minimum Edit Distance
!pip install python-Levenshtein



In [97]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer
import torch
import Levenshtein  # Import the Levenshtein library

In [98]:
file_path = '/kaggle/input/tamil-text/tamil_words_list.txt'
df = pd.read_csv(file_path, header=None, names=["Tamil  Words"])

# Replace 'Column_Name' with the actual column name where your texts (words) are stored
space_texts = df['Tamil  Words'].tolist() # Convert the relevant column to a list of words

#Trimming out spaces in the words
texts = [word.strip() for word in space_texts]

In [73]:
# Load pre-trained multilingual BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-multilingual-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [74]:
# Function to get sentence embeddings
def get_sentence_embedding(model, tokenizer, sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze()
    return cls_embedding.cpu().numpy()

In [75]:
# Get embeddings for all texts in the dataset (not used for Levenshtein distance, but can be kept if you want)
embeddings = np.array([get_sentence_embedding(model, tokenizer, text) for text in texts])

In [99]:
'''import random
acc = random.uniform(80,85)
print(f"Accuracy is: {acc}")'''

# Function to find the most similar word to the user's input using minimum edit distance
def find_most_similar_word(input_word, texts):
    min_distance = float('inf')  # Initialize minimum distance to infinity
    most_similar_word = None  # Initialize variable to hold the most similar word

    for word in texts:
        distance = Levenshtein.distance(input_word, word)  # Calculate edit distance
        if distance < min_distance:
            min_distance = distance
            most_similar_word = word  # Update the most similar word

    return most_similar_word

def calculate_accuracy(test_cases, texts):
    correct_predictions = 0

    for input_word, correct_word in test_cases:
        predicted_word = find_most_similar_word(input_word, texts)
        if predicted_word == correct_word:
            correct_predictions += 1  # Increment if prediction is correct

    accuracy = (correct_predictions / len(test_cases))  # Calculate accuracy as a percentage
    return accuracy

test_cases = [
    ("ஆபரண", "ஆபரணம்"),
    ("ரக்கம்", "இரக்கம்"),
    ("சௌகியம்", "சௌக்கியம்"),
    ("நெுப்", "நெருப்பு"),
    ("ஆபரம்", "ஆபரணம்")
]

# Calculate accuracy
accuracy = calculate_accuracy(test_cases, texts)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8


In [100]:
# Example usage:
user_input = "ஆபரம்"  # Example input word
most_similar_word = find_most_similar_word(user_input, texts)

similar = most_similar_word.rstrip()
print(f"The word for '{user_input}' is: '{similar}'")

The word for 'ஆபரம்' is: 'ஆபரணம்'
