In [127]:
# Import library for Levenshtein calculations
from rapidfuzz.distance import Levenshtein
# Import library for reading CSV files
import csv
import pandas as pd
import random
from pyphonetics import Soundex # Depreceated because of false positives and false negatives
from metaphone import doublemetaphone # https://pypi.org/project/Metaphone/
import os

In [129]:
# Shared Soundex encoder; calculate_distances() calls soundex.phonetics() on each word.
soundex = Soundex()

In [4]:
# Folder that get_csv_files() lists for .csv files.
# NOTE(review): this looks like a placeholder value — point it at the real dictionary folder.
path = '/path/to/sorted/csv/dictionaries'

In [164]:
# Column labels that json_to_csv() passes to DataFrame.to_csv as the header row.
csv_header = ['Word 1', 'Word 2', 'Levenshtein Distance', 'Soundex Match']

In [6]:
# Get all CSV files in the folder

def get_csv_files(directory=None):
    """Return the names of the ``.csv`` files found in a folder.

    Parameters
    ----------
    directory : str or path-like, optional
        Folder to scan. When omitted, the notebook-level ``path`` setting is
        used; it is looked up at call time, so later changes to ``path`` are
        honoured.

    Returns
    -------
    list of str
        Bare file names (not full paths) ending in ``.csv``, in the order
        ``os.listdir`` reports them. That order is arbitrary, so look a file
        up by name rather than relying on its position in the list.
    """
    if directory is None:
        directory = path
    return [name for name in os.listdir(directory) if name.endswith(".csv")]

In [7]:
# Get all words in the CSV file

def get_data(file):
    """Read a text file and return its lines.

    Parameters
    ----------
    file : str or path-like
        Location handed straight to ``open`` (text mode, read-only).

    Returns
    -------
    list of str
        One entry per line, with the line endings left in place.
    """
    with open(file, 'r') as handle:
        lines = list(handle)
    return lines


In [8]:
# Choose random words from the data: one draw (with replacement) per input line; consecutive draws are paired up later in calculate_distances

def get_random_word_pairs(data):
    """Draw as many random entries from ``data`` as it contains.

    Each draw is an independent ``random.choice``, so the same entry can be
    picked more than once. The result is a flat list; this function does not
    group the entries into pairs itself.

    Parameters
    ----------
    data : list
        Entries to sample from.

    Returns
    -------
    list
        ``len(data)`` randomly chosen entries (empty when ``data`` is empty).
    """
    return [random.choice(data) for _ in range(len(data))]


In [163]:
# Calculate the Levenshtein distance and Soundex match for each word pair

def calculate_distances(random_word_pairs):
    """Compare consecutive words pairwise by edit distance and Soundex code.

    Items 0 and 1 form the first pair, items 2 and 3 the second, and so on.
    Surrounding whitespace (including the trailing newline) is stripped from
    each word before it is compared. A trailing item that has no partner is
    ignored rather than raising ``IndexError``.

    Parameters
    ----------
    random_word_pairs : list of str
        Flat sequence of words to be paired up in order.

    Returns
    -------
    dict
        Maps each column name ('Word 1', 'Word 2', 'Levenshtein Distance',
        'Soundex Match') to a list with one entry per pair.
    """
    results = {'Word 1': [], 'Word 2': [], 'Levenshtein Distance': [], 'Soundex Match': []}
    # Stop one short of the end so that index i + 1 always exists.
    for i in range(0, len(random_word_pairs) - 1, 2):
        first = random_word_pairs[i].strip()
        second = random_word_pairs[i + 1].strip()
        results['Word 1'].append(first)
        results['Word 2'].append(second)
        results['Levenshtein Distance'].append(Levenshtein.distance(first, second))
        # Double Metaphone matching (calculate_doublemetaphone) is currently
        # disabled; only the Soundex comparison is recorded.
        results['Soundex Match'].append(soundex.phonetics(first) == soundex.phonetics(second))
    return results

In [122]:
def calculate_doublemetaphone(word1, word2):
    """Report whether two words share a Double Metaphone code.

    ``doublemetaphone`` is called once per word and the first two entries of
    each result (indexed here as the primary and secondary code) are compared.
    The words match when both results are identical, or when any non-empty
    code of ``word1`` equals a code of ``word2`` (primary/primary,
    secondary/secondary, or either cross combination). Empty codes never
    count as a match on their own, so two words that merely both lack a
    secondary code are not treated as matching.

    Parameters
    ----------
    word1, word2 : str
        Words to compare.

    Returns
    -------
    bool
        True when the words are considered phonetically equivalent.
    """
    # Encode each word once instead of re-running the algorithm per comparison.
    codes1 = doublemetaphone(word1)
    codes2 = doublemetaphone(word2)

    if codes1 == codes2:
        return True

    # A shared code only counts when it is non-empty (truthy); this also
    # covers a None placeholder should the encoder ever return one.
    candidates = codes2[:2]
    return any(code and code in candidates for code in codes1[:2])

In [34]:
# Convert the JSON to CSV and save it to the folder

def json_to_csv(json, filename, header=None):
    """Save a column dictionary as ``levenshtein_<letter>.csv`` and return it as a DataFrame.

    ``<letter>`` is the single character just before the first ``.`` in
    ``filename`` (e.g. ``'words_wb1913_b.csv'`` -> ``'levenshtein_b.csv'``).
    When ``filename`` contains no ``.`` its last character is used; previously
    that case printed the ``ValueError`` and then failed with ``NameError``.
    The file is written to the current working directory.

    Parameters
    ----------
    json : dict
        Maps column names to equally long lists of values.
    filename : str
        Name of the source file the output name is derived from.
    header : list of str, optional
        Column labels for the CSV header row. Defaults to the notebook-level
        ``csv_header``.

    Returns
    -------
    pandas.DataFrame
        The frame built from ``json``.

    Raises
    ------
    ValueError
        If ``filename`` is empty, so no letter can be derived from it.
    """
    if not filename:
        raise ValueError("filename must not be empty")
    if header is None:
        header = csv_header

    df = pd.DataFrame(json)

    # Position of the first '.'; treat a name without one as "all stem".
    dot_index = filename.find('.')
    if dot_index == -1:
        dot_index = len(filename)
    letter = filename[dot_index - 1]

    df.to_csv('levenshtein_' + letter + '.csv', index=False, header=header)
    return df

In [154]:
# Test code for one specific CSV file

csv_file = get_csv_files()
print(csv_file)

# Select the sample file by name. os.listdir order is arbitrary, so a fixed
# position (previously csv_file[21], which was 'words_wb1913_c.csv' in the
# listing printed by this cell) can silently pick a different file.
data = get_data(csv_file[csv_file.index('words_wb1913_c.csv')])

# Drop a leading '0' line if present (presumably a leftover header row —
# confirm); the emptiness check keeps an empty file from raising IndexError.
if data and data[0].strip() == '0':
    data.pop(0)

# calculate_distances consumes the list two items at a time, so keep its length even.
if len(data) %2 != 0:
    data.pop()

random_word_pairs = get_random_word_pairs(data)
json = calculate_distances(random_word_pairs)
# json_to_csv names the output after the character before the first '.',
# so this is written as 'levenshtein_u.csv'.
df = json_to_csv(json, 'test1_filu.csv')
df.head()

['words_wb1913_b.csv', 'words_wb1913_n.csv', 'words_wb1913_v.csv', 'words_wb1913_w.csv', 'levenshtein_u.csv', 'words_wb1913_j.csv', 'words_wb1913_p.csv', 'words_wb1913_h.csv', 'words_wb1913_g.csv', 'words_wb1913_l.csv', 'words_wb1913_d.csv', 'words_wb1913_f.csv', 'words_wb1913_e.csv', 'words_wb1913_a.csv', 'words_wb1913_s.csv', 'homophones0.csv', 'words_wb1913_r.csv', 'words_wb1913_m.csv', 'words_wb1913_k.csv', 'words_wb1913_new.csv', 'words_wb1913_u.csv', 'words_wb1913_c.csv', 'words_wb1913_t.csv', 'words_wb1913_q.csv', 'words_wb1913_i.csv', 'words_wb1913_z.csv', 'words_wb1913_y.csv', 'words_wb1913_o.csv', 'words_wb1913_x.csv']


Unnamed: 0,Word 1,Word 2,Levenshtein Distance,Doublemetaphone Match
0,Cortices,Cotangent,6,False
1,Copper-fastened,Corrugating,9,False
2,Cappeak,Clysmic,6,False
3,Carbonade,Conventionalism,11,False
4,Chicken-breasted,Cowl,15,False


In [17]:
# # Open the 'homophones_fixed.csv' file
# csv_file = get_csv_files()

# data = get_data(csv_file[19])

# result = []
# reader = csv.reader(data)
# for row in reader:
#     result.extend(row)

# # Save the list as a file with each word separated by a newline
# with open('homophones_fixed_fixed.csv', 'w') as f:
#     for item in result:
#         f.write("%s\n" % item)

In [165]:
# Test code for homophones.csv file without randomization

csv_file = get_csv_files()

# Find the position of 'homophones0.csv' in the list of files
index = csv_file.index('homophones0.csv')
data = get_data(csv_file[index])

# Discard a leading '0' line when present
if data[0].strip() == '0':
    del data[0]

# Words are compared two at a time, so trim a dangling last line
if len(data) % 2:
    del data[-1]

# Keep the file's own line order: each consecutive pair of lines is compared
word_pairs = data
json = calculate_distances(word_pairs)
df = json_to_csv(json, 'levenshtein_homophones.csv')
df.head()

Unnamed: 0,Word 1,Word 2,Levenshtein Distance,Soundex Match
0,Abel,able,3,True
1,Adam,atom,3,True
2,Cain,cane,3,True
3,Chile,chilly,3,True
4,Czech,check,3,False


In [162]:
csv_files = get_csv_files()

for file in csv_files:
    # Files named 'levenshtein_*.csv' are results written by json_to_csv.
    # Skip them so earlier output is not fed back in as input and overwritten.
    if file.startswith('levenshtein_'):
        continue

    data = get_data(file)

    # Drop a leading '0' line if present; the emptiness check keeps an
    # empty file from raising IndexError.
    if data and data[0].strip() == '0':
        data.pop(0)

    # calculate_distances consumes the list two items at a time, so keep its length even.
    if len(data) %2 != 0:
        data.pop()

    random_word_pairs = get_random_word_pairs(data)
    json = calculate_distances(random_word_pairs)
    # NOTE(review): json_to_csv names the output after the single character
    # before the first '.', so inputs ending in the same character (e.g.
    # 'words_wb1913_w.csv' and 'words_wb1913_new.csv') write to the same file.
    df = json_to_csv(json, file)