In [27]:
import pandas as pd

def indic_soundex(word, mapping=None):
    """Return a four-character Soundex-style code for a word in an Indic script.

    The first character of the word is kept verbatim. The rest of the word is
    scanned left to right; at each position the longest character sequence
    listed in the mapping is replaced by its digit, so multi-code-point keys
    (for example a conjunct such as 'क्ष') are matched as one unit instead of
    being split into their individual code points. Anything not listed counts
    as '0'. Runs of the same digit collapse to one, zeros are dropped, and the
    result is truncated or right-padded with '0' to exactly four characters.

    Args:
        word: Value to encode. Missing values (None/NaN) and falsy values give
            '0000'; any other non-string value is encoded from its str() form.
        mapping: Optional dict from character sequence to digit string.
            Defaults to the module-level ``indic_soundex_mapping``.

    Returns:
        A string of length four.
    """
    if pd.isna(word) or not word:
        return '0000'
    if not isinstance(word, str):
        # A numeric cell would otherwise fail on word[0]
        word = str(word)
    if mapping is None:
        mapping = indic_soundex_mapping

    # Retain the first character of the word
    first_letter = word[0]
    tail = word[1:]

    # Longest key decides how far ahead the greedy lookup has to peek
    longest_key = max(map(len, mapping), default=1)

    # Replace every listed sequence with its digit, longest match first
    digits = []
    position = 0
    while position < len(tail):
        for span in range(min(longest_key, len(tail) - position), 0, -1):
            digit = mapping.get(tail[position:position + span])
            if digit is not None:
                break
        else:
            # Vowel signs, viramas and any other unlisted character
            digit, span = '0', 1
        digits.append(digit)
        position += span

    # Collapse runs of the same digit into a single occurrence
    collapsed = []
    for digit in ''.join(digits):
        if not collapsed or collapsed[-1] != digit:
            collapsed.append(digit)

    # Drop the zeros from the digit part only, so the retained first
    # character survives even when it happens to be '0'
    code = ''.join(collapsed).replace('0', '')

    # Pad with zeros or truncate to ensure a length of 4 characters
    return (first_letter + code)[:4].ljust(4, '0')

# Indic Soundex mapping dictionary
#
# Maps a letter (or a short letter sequence) to a one-digit class. Independent
# vowels map to '0'; consonants are grouped row by row (e.g. क ख ग घ ङ -> '1').
# Some keys span more than one code point (conjuncts such as 'क्ष', the Telugu
# sequence 'ంగ', ...).
# Every key must appear only once: a repeated key in a dict literal silently
# keeps the last value.
indic_soundex_mapping = {
    # Devanagari script (used for Marathi)
    'अ': '0', 'आ': '0', 'इ': '0', 'ई': '0', 'उ': '0', 'ऊ': '0', 'ऋ': '0',
    'ए': '0', 'ऐ': '0', 'ओ': '0', 'औ': '0', 'अं': '0', 'अः': '0', 'क': '1',
    'ख': '1', 'ग': '1', 'घ': '1', 'ङ': '1', 'च': '2', 'छ': '2', 'ज': '2',
    'झ': '2', 'ञ': '2', 'ट': '3', 'ठ': '3', 'ड': '3', 'ढ': '3', 'ण': '3',
    'त': '4', 'थ': '4', 'द': '4', 'ध': '4', 'न': '4', 'प': '5', 'फ': '5',
    'ब': '5', 'भ': '5', 'म': '5', 'य': '6', 'र': '6', 'ल': '6', 'व': '6',
    'श': '7', 'ष': '7', 'स': '7', 'ह': '7', 'क्ष': '8', 'त्र': '9', 'ज्ञ': '9',

    # Gurmukhi script (used for Punjabi)
    'ਅ': '0', 'ਆ': '0', 'ਇ': '0', 'ਈ': '0', 'ਉ': '0', 'ਊ': '0', 'ਏ': '0',
    'ਐ': '0', 'ਓ': '0', 'ਔ': '0', 'ਕ': '1', 'ਖ': '1', 'ਗ': '1', 'ਘ': '1',
    'ਙ': '1', 'ਚ': '2', 'ਛ': '2', 'ਜ': '2', 'ਝ': '2', 'ਞ': '2', 'ਟ': '3',
    'ਠ': '3', 'ਡ': '3', 'ਢ': '3', 'ਣ': '3', 'ਤ': '4', 'ਥ': '4', 'ਦ': '4',
    'ਧ': '4', 'ਨ': '4', 'ਪ': '5', 'ਫ': '5', 'ਬ': '5', 'ਭ': '5', 'ਮ': '5',
    'ਯ': '6', 'ਰ': '6', 'ਲ': '6', 'ਵ': '6', 'ਸ਼': '7', 'ਸ': '7', 'ਹ': '7',

    # Bengali script (used for Bengali)
    # 'ব' is listed once, alongside প ফ ভ ম. A second 'ব': '6' entry used to
    # follow 'ল' and silently overrode the '5' given here.
    'অ': '0', 'আ': '0', 'ই': '0', 'ঈ': '0', 'উ': '0', 'ঊ': '0', 'ঋ': '0',
    'এ': '0', 'ঐ': '0', 'ও': '0', 'ঔ': '0', 'ক': '1', 'খ': '1', 'গ': '1',
    'ঘ': '1', 'ঙ': '1', 'চ': '2', 'ছ': '2', 'জ': '2', 'ঝ': '2', 'ঞ': '2',
    'ট': '3', 'ঠ': '3', 'ড': '3', 'ঢ': '3', 'ণ': '3', 'ত': '4', 'থ': '4',
    'দ': '4', 'ধ': '4', 'ন': '4', 'প': '5', 'ফ': '5', 'ব': '5', 'ভ': '5',
    'ম': '5', 'য': '6', 'র': '6', 'ল': '6', 'শ': '7', 'ষ': '7',
    'স': '7', 'হ': '7',

    # Kannada script
    'ಅ': '0', 'ಆ': '0', 'ಇ': '0', 'ಈ': '0', 'ಉ': '0', 'ಊ': '0', 'ಋ': '0',
    'ಎ': '0', 'ಐ': '0', 'ಒ': '0', 'ಔ': '0', 'ಕ': '1', 'ಖ': '1', 'ಗ': '1',
    'ಘ': '1', 'ಙ': '1', 'ಚ': '2', 'ಛ': '2', 'ಜ': '2', 'ಝ': '2', 'ಞ': '2',
    'ಟ': '3', 'ಠ': '3', 'ಡ': '3', 'ಢ': '3', 'ಣ': '3', 'ತ': '4', 'ಥ': '4',
    'ದ': '4', 'ಧ': '4', 'ನ': '4', 'ಪ': '5', 'ಫ': '5', 'ಬ': '5', 'ಭ': '5',
    'ಮ': '5', 'ಯ': '6', 'ರ': '6', 'ಲ': '6', 'ವ': '6', 'ಶ': '7', 'ಷ': '7',
    'ಸ': '7', 'ಹ': '7', '಺': '8', 'ಽ': '9', 'ಕ್ಷ': '8', 'ಱ': '9',

    # Gujarati script
    'અ': '0', 'આ': '0', 'ઇ': '0', 'ઈ': '0', 'ઉ': '0', 'ઊ': '0', 'ઋ': '0',
    'એ': '0', 'ઐ': '0', 'ઓ': '0', 'ઔ': '0', 'ક': '1', 'ખ': '1', 'ગ': '1',
    'ઘ': '1', 'ઙ': '1', 'ચ': '2', 'છ': '2', 'જ': '2', 'ઝ': '2', 'ઞ': '2',
    'ટ': '3', 'ઠ': '3', 'ડ': '3', 'ઢ': '3', 'ણ': '3', 'ત': '4', 'થ': '4',
    'દ': '4', 'ધ': '4', 'ન': '4', 'પ': '5', 'ફ': '5', 'બ': '5', 'ભ': '5',
    'મ': '5', 'ય': '6', 'ર': '6', 'લ': '6', 'વ': '6', 'શ': '7', 'ષ': '7',
    'સ': '7', 'હ': '7', 'ળ': '8', 'ક્ષ': '8', 'જ્ઞ': '9',

    #Telugu script
    'అ': '0', 'ఆ': '0', 'ఇ': '0', 'ఈ': '0', 'ఉ': '0', 'ఊ': '0', 'ఋ': '0',
    'ఎ': '0', 'ఐ': '0', 'ఒ': '0', 'ఔ': '0', 'క': '1', 'ఖ': '1', 'గ': '1',
    'ఘ': '1', 'ంగ': '1', 'చ': '2', 'ఛ': '2', 'జ': '2', 'ఝ': '2', 'ఞ': '2',
    'ట': '3', 'ఠ': '3', 'డ': '3', 'ఢ': '3', 'ణ': '3', 'త': '4', 'థ': '4',
    'ద': '4', 'ధ': '4', 'న': '4', 'ప': '5', 'ఫ': '5', 'బ': '5', 'భ': '5',
    'మ': '5', 'య': '6', 'ర': '6', 'ల': '6', 'వ': '6', 'శ': '7', 'ష': '7',
    'స': '7', 'హ': '7', 'క్ష': '8', 'ఱ': '9'
}

# Translated word lists that still need to be encoded
input_files = [
    'translated_output_part_21.csv',
    'translated_output_part_23.csv',
    'translated_output_part_24.csv',
    'translated_output_part_25.csv',
    'translated_output_part_26.csv',
]

# Columns whose words are replaced by their Indic Soundex codes
language_columns = ('Marathi', 'Punjabi', 'Bengali', 'Kannada', 'Gujarati', 'Telugu')

# Encode each file and write the result next to it
for file in input_files:
    df_original = pd.read_csv(file)

    # Work on a copy so the frame as read from disk stays untouched
    df_transformed = df_original.copy()
    for column in language_columns:
        df_transformed[column] = df_transformed[column].apply(indic_soundex)

    # Output name: the input name with '_Indic_Soundex' put before '.csv'
    output_file = file.replace('.csv', '_Indic_Soundex.csv')
    df_transformed.to_csv(output_file, index=False)

    print(f"Indic Soundex codes updated and saved in {output_file}.")

Indic Soundex codes updated and saved in translated_output_part_21_Indic_Soundex.csv.
Indic Soundex codes updated and saved in translated_output_part_23_Indic_Soundex.csv.
Indic Soundex codes updated and saved in translated_output_part_24_Indic_Soundex.csv.
Indic Soundex codes updated and saved in translated_output_part_25_Indic_Soundex.csv.
Indic Soundex codes updated and saved in translated_output_part_26_Indic_Soundex.csv.


In [28]:
def hamming_distance(code1, code2):
    """Count the positions at which two equal-length sequences differ.

    Args:
        code1: First sequence (e.g. a Soundex code string).
        code2: Second sequence, which must have the same length as ``code1``.

    Returns:
        The number of positions whose elements are not equal.

    Raises:
        ValueError: If the two sequences have different lengths.
    """
    if len(code1) == len(code2):
        mismatches = 0
        for left, right in zip(code1, code2):
            if left != right:
                mismatches += 1
        return mismatches
    raise ValueError("Strings must be of the same length")

In [29]:
import pandas as pd
import Levenshtein

# Levenshtein (edit) distance between two codes
def levenshtein_distance(code1, code2):
    """Return the edit distance between two strings.

    Delegates to ``Levenshtein.distance`` and returns its result unchanged.

    Args:
        code1: First string.
        code2: Second string.
    """
    edit_count = Levenshtein.distance(code1, code2)
    return edit_count

In [30]:
def jaccard_similarity(code1, code2):
    """Return the Jaccard similarity of the character sets of two codes.

    Each input is reduced to the set of its distinct elements; the result is
    the size of the intersection divided by the size of the union.

    Args:
        code1: First iterable of hashable items (e.g. a Soundex code string).
        code2: Second iterable of hashable items.

    Returns:
        A float between 0.0 and 1.0. Two empty inputs are treated as identical
        and give 1.0 instead of raising ZeroDivisionError.
    """
    set1 = set(code1)
    set2 = set(code2)
    union = set1 | set2
    if not union:
        # Both inputs are empty: the ratio is 0/0, defined here as 1.0
        return 1.0
    return len(set1 & set2) / len(union)

In [31]:
from collections import Counter
from math import sqrt

def cosine_similarity(code1, code2):
    """Return the cosine similarity of the element-count vectors of two codes.

    Each input is turned into a ``Counter`` of its elements; the similarity is
    the dot product of the two count vectors divided by the product of their
    Euclidean norms.

    Args:
        code1: First iterable of hashable items (e.g. a Soundex code string).
        code2: Second iterable of hashable items.

    Returns:
        A float; 0.0 when either input is empty (zero norm).
    """
    counts1, counts2 = Counter(code1), Counter(code2)

    # Only symbols present in both inputs contribute to the dot product
    shared = counts1.keys() & counts2.keys()
    dot_product = sum(counts1[symbol] * counts2[symbol] for symbol in shared)

    norm1 = sqrt(sum(count ** 2 for count in counts1.values()))
    norm2 = sqrt(sum(count ** 2 for count in counts2.values()))
    norm_product = norm1 * norm2

    return float(dot_product) / norm_product if norm_product else 0.0

In [34]:
# Soundex-encoded files to compare
input_files = [
    'translated_output_part_21_Indic_Soundex.csv',
    'translated_output_part_23_Indic_Soundex.csv',
    'translated_output_part_24_Indic_Soundex.csv',
    'translated_output_part_25_Indic_Soundex.csv',
    'translated_output_part_26_Indic_Soundex.csv'
]

# Every target language is compared against the reference language
reference_column = 'Marathi'
target_columns = ['Punjabi', 'Bengali', 'Kannada', 'Gujarati', 'Telugu']

# Output column name -> pairwise metric, in the order the columns are written
metrics = {
    'Hamming_Dist': hamming_distance,
    'Levenshtein_Dist': levenshtein_distance,
    'Jaccard_Similarity': jaccard_similarity,
    'Cosine_Similarity': cosine_similarity,
}

# Process each file
for file in input_files:
    # Read the CSV file
    df = pd.read_csv(file)

    # One output CSV per (reference, target) language pair
    for target_column in target_columns:
        df_pair = df[[reference_column, target_column]].copy()

        # Pair the codes once; each metric is then evaluated row by row
        code_pairs = list(zip(df_pair[reference_column], df_pair[target_column]))
        for metric_name, metric in metrics.items():
            df_pair[metric_name] = [metric(code1, code2) for code1, code2 in code_pairs]

        pair_suffix = f'_{reference_column}_{target_column}_output.csv'
        df_pair.to_csv(file.replace('.csv', pair_suffix), index=False)

    print(f"Separate CSV files generated for comparisons in {file}.")

Separate CSV files generated for comparisons in translated_output_part_21_Indic_Soundex.csv.
Separate CSV files generated for comparisons in translated_output_part_23_Indic_Soundex.csv.
Separate CSV files generated for comparisons in translated_output_part_24_Indic_Soundex.csv.
Separate CSV files generated for comparisons in translated_output_part_25_Indic_Soundex.csv.
Separate CSV files generated for comparisons in translated_output_part_26_Indic_Soundex.csv.
