# Name Screening

## Data Analysis

## Table of Contents <a class="anchor" id="toc"></a>

1. [Function Definitions](#func-defs)
    1. [Name Screening Solutions](#first-func-def)
    2. [Plot - Execution Speed](#second-func-def)
    3. [Plot - Number of matches](#third-func-def)
    4. [Plot - Merged Plot](#fourth-func-def)
    5. [Threshold Comparison for distance metric](#fifth-func-def)
    6. [Threshold Analysis for finalizing thresholds](#sixth-func-def)
2. [Load data](#load-data)
3. [Data Analysis](#data-analysis)
    1. [Generate test data](#gen-test)
    2. [Find optimal thresholds](#find-thresholds)
        1. [Finding metrics](#find-thresholds-part1)
        2. [Finding Lower Limit of thresholds](#find-thresholds-part2)
        3. [Finding Upper Limit of thresholds](#find-thresholds-part3)
    3. [Finding Optimal Solution](#find-solution)
4. [Generate performance plots](#gen-plots)

## Libraries

In [None]:
from platform import python_version
print("Python Version:", python_version())

import matplotlib
print("Matplotlib Version:", matplotlib. __version__) 

import warnings
#warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

# pip install abydos
# pip install python-Levenshtein


import re
import os
import time

import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from abydos import phonetic, distance
from Levenshtein import ratio as lev_ratio
from Levenshtein import seqratio as lev_seqratio
from Levenshtein import setratio as lev_setratio

# 1. Function Definitions <a class="anchor" id="func-defs"></a>

Go to [Table of Contents](#toc)

## 1.1. Function Definition - All name screening solutions <a class="anchor" id="first-func-def"></a>

Go to [Table of Contents](#toc)

The function provides **FIVE** options. They are as follows:
1. Applying Levenshtein set ratio on the names directly ***(Traditional approach)***
2. Applying Levenshtein set ratio on the phonemes of the names
3. Using custom Levenshtein ratio to measure similarity between the phonemes of the names
4. Using BOTH Levenshtein set ratio on names AND custom Levenshtein ratio on phonemes of the names for comparison ***(Proposed approach 1)***
5. Using EITHER Levenshtein set ratio on names OR custom Levenshtein ratio on phonemes of the names for comparison ***(Proposed approach 2)***

In [None]:
def solutions(sol_type, db, func, thres, name, actual_name, enable_prints=True):
    """Screen ``name`` against every row of ``db`` using one of five strategies.

    Parameters
    ----------
    sol_type : int
        1 - ``func`` on lower-cased names, 2 - ``func`` on space-joined
        phonemes, 3 - custom ratio on phonemes (mean over the searched phonemes
        of the best ``func`` score against the row's phonemes), 4 - name score
        AND custom phoneme score, 5 - name score OR custom phoneme score.
    db : pandas.DataFrame
        Must have a ``'Name'`` column and, for ``sol_type != 1``, a
        ``'Phonemes'`` column holding an iterable of phoneme strings per row.
    func : callable or sequence of two callables
        A single scorer for types 1-3; ``[name_scorer, phoneme_scorer]`` for
        types 4-5.
    thres : float or sequence of two floats
        Matching threshold(s), in the same layout as ``func``.
    name : str
        The searched name.
    actual_name : str
        The name expected among the matches (matched as a literal substring).
    enable_prints : bool
        Print/display the intermediate diagnostics.

    Returns
    -------
    tuple or None
        ``(sol_name, results, actual_present, fin_time)`` or ``None`` when
        ``sol_type`` is not in 1..5. ``fin_time`` is the scan time in seconds
        rounded to two decimals.
    """
    print()
    descriptions = {
        1: ('Solution 1 - Levenshtein set ratio on name matching',
            "Baseline Solution"),
        2: ('Solution 2 - Levenshtein set ratio on Phoneme matching',
            "Intermediate Solution 1"),
        3: ('Solution 3 - Custom Levenshtein Ratio on Phonemes',
            "Intermediate Solution 2"),
        4: ('Solution 4 - Levenshtein set ratio AND Custom Levenshtein Ratio on Phonemes',
            "Proposed Solution 1"),
        5: ('Solution 5 - Levenshtein set ratio OR Custom Levenshtein Ratio on Phonemes',
            "Proposed Solution 2"),
    }
    if sol_type not in descriptions:
        print('Invalid Option! Choose from 1 to 5')
        return None
    banner, sol_name = descriptions[sol_type]
    print(banner)

    print('Searched Name:', name, '\nActual Name:', actual_name)

    start_time = time.time()

    query = name.lower()
    pn = []
    if sol_type != 1:
        # Build the encoder once instead of once per token.
        encoder = phonetic.DoubleMetaphone()
        pn = [encoder.encode(token)[0] for token in name.split()]
        if enable_prints:
            print(f"Phoneme for {name} is : {pn}")
    query_phonemes = ' '.join(pn).lower()

    # Matches are collected in a list and turned into a DataFrame once:
    # DataFrame.append copied the frame on every call and no longer exists
    # in pandas >= 2.0.
    rows = []
    for _, row in db.iterrows():
        if sol_type == 1:
            candidate = row['Name'].lower()
            metric = func(candidate, query)
            if metric >= thres:
                rows.append({'Name': row['Name'],
                             'LevSetRatio': metric,
                             'Distance': np.round(lev_ratio(candidate, query), 2)})
        elif sol_type == 2:
            candidate = ' '.join(row['Phonemes']).lower()
            metric = func(candidate, query_phonemes)
            if metric >= thres:
                rows.append({'Name': row['Name'],
                             'LevSetRatio': metric,
                             'Distance': np.round(lev_ratio(candidate, query_phonemes), 2)})
        elif sol_type == 3:
            metric = _mean_best_score(func, pn, row['Phonemes'])
            if metric >= thres:
                rows.append({'Name': row['Name'],
                             'Phoneme': row['Phonemes'],
                             'LevRatio': metric})
        else:
            metric1 = func[0](row['Name'].lower(), query)
            metric2 = _mean_best_score(func[1], pn, row['Phonemes'])
            if sol_type == 4:
                matched = metric1 >= thres[0] and metric2 >= thres[1]
            else:
                matched = metric1 >= thres[0] or metric2 >= thres[1]
            if matched:
                rows.append({'Name': row['Name'],
                             'Phoneme': row['Phonemes'],
                             'LevSetRatio': metric1,
                             'LevRatio': metric2})

    results = pd.DataFrame(rows)

    fin_time = np.round((time.time() - start_time), 2)
    print(f"--- Execution Time: {fin_time:,} seconds ---")

    sort_keys = {1: 'LevSetRatio',
                 2: 'LevSetRatio',
                 3: 'LevRatio',
                 4: ['LevRatio', 'LevSetRatio'],
                 5: ['LevRatio', 'LevSetRatio']}
    if not results.empty:
        results = results.sort_values(sort_keys[sol_type], ascending=False)
    results = results.reset_index(drop=True)

    if enable_prints:
        print("Number of matched names:", results.shape[0])

    actual_present = False
    if not results.empty:
        # regex=False: the actual name is a literal, not a pattern;
        # na=False: a missing name never counts as a hit.
        hits = results[results['Name'].str.contains(actual_name, regex=False, na=False)]
        if hits.shape[0]:
            actual_present = True
            if enable_prints:
                print('Results contain the actual name!')
                display(hits)
        elif enable_prints:
            print('Results do not contain the actual name...')

    return sol_name, results, actual_present, fin_time


def _mean_best_score(score, query_phonemes, row_phonemes):
    """Mean, over the searched phonemes, of the best score against the row's phonemes."""
    return np.mean([max([score(i, j) for j in row_phonemes]) for i in query_phonemes])



## 1.2. Function Definition - Plot horizontal bar for comparing execution speed <a class="anchor" id="second-func-def"></a>

Go to [Table of Contents](#toc)

In [None]:
def show_exec_speed(results, searched_name, actual_name):
    """Draw a horizontal bar chart of the execution time of each solution.

    Parameters
    ----------
    results : sequence
        One entry per solution, laid out like the return value of
        ``solutions``: index 0 is the bar label, index 2 the "name matched"
        flag (green/red bar) and index 3 the execution time in seconds.
        The first entry is the baseline for the percentage change.
    searched_name, actual_name : str
        Only used in the plot title.

    Returns
    -------
    None
    """
    if not results:
        # Nothing to draw, and times[0] below would raise IndexError.
        print('No results to plot.')
        return None

    fig, ax = plt.subplots(figsize=(10, 3))

    sols = [val[0] for val in results]
    times = [val[3] for val in results]
    bar_colors = ['green' if val[2] else 'red' for val in results]
    baseline = times[0]

    ax.barh(sols, times, color=bar_colors)

    for index, val in enumerate(times):
        ax.text(val, index, val,
                color='black', horizontalalignment='left', verticalalignment='center')
        # The baseline has no change to show; a zero baseline (times are
        # rounded to 2 decimals upstream) would give inf/nan percentages.
        if index == 0 or not baseline:
            continue
        perc_change = " " + str(np.round((val - baseline) * 100 / baseline, 2)) + " %"
        ax.text(val, index, perc_change,
                color='white', horizontalalignment='right', verticalalignment='center')

    ax.set_yticklabels(sols)

    # Flip the axis so the first solution is drawn at the top.
    ax.invert_yaxis()

    ax.set_xlabel('Fetch time (seconds)')
    ax.set_title(f'Execution speed while searching for "{actual_name}" using "{searched_name}" ')
    ax.grid(True, axis='x')

    green_patch = mpatches.Patch(color='green', label='Name matched')
    red_patch = mpatches.Patch(color='red', label='Name not matched')
    ax.legend(handles=[green_patch, red_patch])

    plt.show()

    return None


## 1.3. Function Definition - Plot merged bar and line plot to compare number of false positive matches <a class="anchor" id="third-func-def"></a>

Go to [Table of Contents](#toc)

In [None]:
def show_performance_plots(df, focal_entity , limits, xlabels, figsize):
    """Overlay execution time (line) and number of matches (bars) per entity.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``'Time'``, ``'Num_of_matches'`` and ``focal_entity``.
    focal_entity : str
        Column whose values label the x axis.
    limits : sequence
        ``limits[0]`` is the y-range of the time axis, ``limits[1]`` the
        y-range of the match-count axis.
    xlabels : str
        Label of the x axis.
    figsize : tuple
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    None
    """
    sns.set_style(style=None, rc=None)

    fig, ax1 = plt.subplots(figsize=figsize)

    # Work on a positionally-indexed copy: the bars are drawn at 0..n-1, so
    # the line and the text labels must use positions, not index labels
    # (which differ as soon as the caller passes a filtered frame).
    plot_df = df.reset_index(drop=True)

    g = sns.lineplot(data=plot_df['Time'], marker='o', sort=False, ax=ax1, label="Execution Time")
    g.set(ylim=limits[0])
    ax2 = ax1.twinx()
    for pos, row in plot_df.iterrows():
        g.text(pos,
               row['Time'],
               np.round(row['Time'], 2),
               color='black',
               ha="center",
               size='large',
               verticalalignment='bottom')

    bplot = sns.barplot(data=plot_df, x=focal_entity, y='Num_of_matches', alpha=0.5, ax=ax2, label="Number of matches")
    bplot.set(ylim=limits[1])
    for pos, row in plot_df.iterrows():
        bplot.text(pos,
                   row['Num_of_matches'],
                   np.round(row['Num_of_matches'], 2),
                   color='black',
                   ha="center",
                   size='large',
                   verticalalignment='baseline')

    ax1.set_xticklabels(plot_df[focal_entity], rotation=90)
    ax1.set_xlabel(xlabels)
    ax1.set_ylabel('Execution Time (seconds)')
    ax2.set_ylabel('Number of matches')
    ax1.legend(loc='upper left')
    ax2.legend(loc='upper right')

    plt.show()

    return None

## 1.4. Function Definition - Plot bar plot of accuracy of each model <a class="anchor" id="fourth-func-def"></a>

Go to [Table of Contents](#toc)

In [None]:
def show_TP_perc_plot(df, focal_entity, ylims, figsize):
    """Bar plot of the true-positive percentage (``'Name_found'``) per entity.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``'Name_found'`` and ``focal_entity``.
    focal_entity : str
        Column whose values label the x axis.
    ylims : tuple
        y-range of the plot.
    figsize : tuple
        Figure size passed to ``plt.figure``.

    Returns
    -------
    None
    """
    plt.figure(figsize=figsize)

    # Bars sit at positions 0..n-1; label them by position so a frame with a
    # non-default index (e.g. after filtering) is still annotated correctly.
    plot_df = df.reset_index(drop=True)

    g = sns.barplot(data=plot_df, x=focal_entity, y='Name_found', alpha=0.5)
    for pos, row in plot_df.iterrows():
        g.text(pos,
               row['Name_found'],
               np.round(row['Name_found'], 2),
               color='black',
               ha="center",
               size='large',
               verticalalignment='baseline')
    g.set_ylabel('Percentage of TRUE POSITIVE (%)')
    g.set(ylim=ylims)
    plt.xticks(rotation=45)
    plt.grid(True, axis='y')
    plt.title('Percentage of True Positive')
    plt.show()

    return None

## 1.5. Function Definition - Threshold analysis for phonemes and names <a class="anchor" id="fifth-func-def"></a>

Go to [Table of Contents](#toc)

In [None]:
def thres_analysis1(maindb, tests):
    """Compare four phoneme-similarity metrics on the known true matches.

    For every ``[actual_name, [searched_name, ...]]`` entry in ``tests``, the
    rows of ``maindb`` whose ``'Name'`` contains ``actual_name`` are scored
    against the Double Metaphone phonemes of each searched name.

    Parameters
    ----------
    maindb : pandas.DataFrame
        Needs a ``'Name'`` column and a ``'Phonemes'`` column (an iterable of
        phoneme strings per row).
    tests : sequence
        Entries where index 0 is the actual name and index 1 the searched
        variants.

    Returns
    -------
    pandas.DataFrame
        One row per (matching db row, searched name) with the four metrics;
        empty when nothing was scored.
    """
    rows = []

    for test in tests:
        actual_name = test[0]

        # Loop-invariant for all variants of this name. regex=False treats the
        # name as a literal; na=False ignores missing names.
        db = maindb[maindb['Name'].str.contains(actual_name, regex=False, na=False)]

        for name in test[1]:
            encoder = phonetic.DoubleMetaphone()
            pn = [encoder.encode(token)[0] for token in name.split()]
            joined_pn = ' '.join(pn).lower()

            for _, row in db.iterrows():
                row_pn = row['Phonemes']
                rows.append({
                    'Name': row['Name'],
                    'Searched': name,
                    'Phoneme': row_pn,
                    'Searched Phoneme': pn,
                    'Levenshtein Ratio': lev_ratio(' '.join(row_pn).lower(), joined_pn),
                    # Mean over searched phonemes of the best score in the row.
                    'Custom Levenshtein Ratio': np.mean(
                        [max([lev_ratio(i, j) for j in row_pn]) for i in pn]),
                    'Levenshtein Seq Ratio': lev_seqratio(row_pn, pn),
                    'Levenshtein Set Ratio': lev_setratio(row_pn, pn),
                })

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows).reset_index(drop=True)



In [None]:
def thres_analysis2(maindb, func, tests):
    """Compare four name-similarity metrics on the known true matches.

    For every ``[actual_name, [searched_name, ...]]`` entry in ``tests``, the
    rows of ``maindb`` whose ``'Name'`` contains ``actual_name`` are scored
    against each searched name (both lower-cased).

    Parameters
    ----------
    maindb : pandas.DataFrame
        Needs a ``'Name'`` and a ``'Phonemes'`` column.
    func : sequence of callables
        Only ``func[1]`` is used: the token-level scorer of the custom ratio.
    tests : sequence
        Entries where index 0 is the actual name and index 1 the searched
        variants.

    Returns
    -------
    pandas.DataFrame
        One row per (matching db row, searched name); empty when nothing was
        scored.
    """
    rows = []

    for test in tests:
        actual_name = test[0]

        # Loop-invariant for all variants of this name. regex=False treats the
        # name as a literal; na=False ignores missing names.
        db = maindb[maindb['Name'].str.contains(actual_name, regex=False, na=False)]

        for name in test[1]:
            encoder = phonetic.DoubleMetaphone()
            pn = [encoder.encode(token)[0] for token in name.split()]
            query = name.lower()
            query_tokens = query.split()

            for _, row in db.iterrows():
                candidate = row['Name'].lower()
                rows.append({
                    'Name': row['Name'],
                    'Searched': name,
                    'Phoneme': row['Phonemes'],
                    'Searched Phoneme': pn,
                    'Lev Ratio': lev_ratio(candidate, query),
                    # Mean over the db name's tokens of the best score against
                    # any token of the searched name.
                    'Custom Lev Ratio': np.mean(
                        [max([func[1](i, j) for j in query_tokens])
                         for i in candidate.split()]),
                    'Lev Seq Ratio': lev_seqratio(candidate, query),
                    'Lev Set Ratio': lev_setratio(candidate, query),
                })

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows).reset_index(drop=True)



## 1.6. Function Definition - Threshold finalization for phonemes and names metrics <a class="anchor" id="sixth-func-def"></a>

Go to [Table of Contents](#toc)

In [None]:
def thres_analysis(maindb, tests):
    """Score the known true matches with the two finalized metrics.

    For every ``[actual_name, [searched_name, ...]]`` entry in ``tests``, the
    rows of ``maindb`` whose ``'Name'`` contains ``actual_name`` are scored
    against each searched name with the Levenshtein set ratio and the custom
    (token-wise best match, averaged) Levenshtein ratio, both on lower-cased
    names.

    Parameters
    ----------
    maindb : pandas.DataFrame
        Needs a ``'Name'`` and a ``'Phonemes'`` column.
    tests : sequence
        Entries where index 0 is the actual name and index 1 the searched
        variants.

    Returns
    -------
    pandas.DataFrame
        One row per (matching db row, searched name); empty when nothing was
        scored.
    """
    rows = []

    for test in tests:
        actual_name = test[0]

        # Loop-invariant for all variants of this name. regex=False treats the
        # name as a literal; na=False ignores missing names.
        db = maindb[maindb['Name'].str.contains(actual_name, regex=False, na=False)]

        for name in test[1]:
            encoder = phonetic.DoubleMetaphone()
            pn = [encoder.encode(token)[0] for token in name.split()]
            query = name.lower()
            query_tokens = query.split()

            for _, row in db.iterrows():
                candidate = row['Name'].lower()
                rows.append({
                    'Name': row['Name'],
                    'Searched': name,
                    'Phoneme': row['Phonemes'],
                    'Searched Phoneme': pn,
                    'Lev Set Ratio': lev_setratio(candidate, query),
                    'Custom Lev Ratio': np.mean(
                        [max([lev_ratio(i, j) for j in query_tokens])
                         for i in candidate.split()]),
                })

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows).reset_index(drop=True)



In [None]:
def thres_analysis_unknown(flag, maindb, func, tests, thresholds=(0.461, 0.379)):
    """Collect the matches produced by names that should not be in the database.

    The searched names are taken from the LAST entry of ``tests``
    (``tests[-1][1]``) and compared against every row of ``maindb``; rows
    whose score reaches the threshold are returned, i.e. the false positives.

    Parameters
    ----------
    flag : int
        1 - score lower-cased names with ``func[0]`` against ``thresholds[0]``;
        2 - score phonemes with the custom ratio built on ``func[1]`` against
        ``thresholds[1]``.
    maindb : pandas.DataFrame
        Needs a ``'Name'`` and a ``'Phonemes'`` column.
    func : sequence of two callables
        ``[name_scorer, phoneme_scorer]``.
    tests : sequence
        Test definitions; only the last entry is used.
    thresholds : sequence of two floats
        Lower limits for the name and the phoneme metric. The defaults are the
        values that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per match; the metric that was not computed is ``None``.

    Raises
    ------
    ValueError
        If ``flag`` is neither 1 nor 2 (previously this left ``condition``
        undefined or stale).
    """
    if flag not in (1, 2):
        raise ValueError(f"flag must be 1 or 2, got {flag!r}")

    start_time = time.time()
    rows = []

    for name in tests[-1][1]:
        encoder = phonetic.DoubleMetaphone()
        pn = [encoder.encode(token)[0] for token in name.split()]
        query = name.lower()

        print(name)
        counter = 0
        for _, row in maindb.iterrows():
            metric1 = None
            metric2 = None
            if flag == 1:
                metric1 = func[0](row['Name'].lower(), query)
                condition = metric1 >= thresholds[0]
            else:
                metric2 = np.mean(
                    [max([func[1](i, j) for j in row['Phonemes']]) for i in pn])
                condition = metric2 >= thresholds[1]

            if condition:
                counter += 1
                # Progress indicator for long scans.
                if counter % 2500 == 0:
                    print(counter)
                rows.append({'Name': row['Name'],
                             'Searched': name,
                             'Phoneme': row['Phonemes'],
                             'Searched Phoneme': pn,
                             'Lev Set Ratio': metric1,
                             'Custom Lev Ratio': metric2})

    print(f'Execution Time: {time.time()-start_time:,} seconds')

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows).reset_index(drop=True)


# 2. Load Data <a class="anchor" id="load-data"></a>

Go to [Table of Contents](#toc)

In [None]:
# Load the reference name database. The functions above read its 'Name' and
# 'Phonemes' columns.
# NOTE(review): unpickling can execute arbitrary code - only load a
# 'Final_Names.pkl' that comes from a trusted source.
names = pd.read_pickle('Final_Names.pkl')
names

# 3. Analysis <a class="anchor" id="data-analysis"></a>

Go to [Table of Contents](#toc)

## 3.1 Generate test data <a class="anchor" id="gen-test"></a>

Go to [Table of Contents](#toc)

In [None]:
# Show one randomly chosen row of the name database.
names.iloc[np.random.randint(names.shape[0])]

In [None]:
# Show the database rows containing the actual name of test entry 5.
# NOTE(review): `tests` is only defined in the next cell, so that cell has to
# be run before this one.
names[names.Name.str.contains(tests[5][0])]

In [None]:
# Test definitions: each entry is [actual_name, [searched variants, ...]].
# The LAST entry ("DUMMY DATA") is special: thres_analysis_unknown reads its
# names via tests[-1][1] and the solution comparison skips it via tests[:-1].
# Keep it last.
tests = [["Vladimir Nikolaevich Terentev", ["Vadimir Nikolevi Terente", 
                                            "Valdimi Terente", 
                                            "Baldimi Nikola Terete", 
                                            "Wadimi Terente", 
                                            "Baldimir"]], 
         ["Andrey Alshevskih", ["andre alshveck", 
                                "andrew alshvek", 
                                "andy alshevsik", 
                                'andy alsevsikh']],
         ["Andrei Skoch", ['andrew skok', 
                           'andre skokh', 
                           'andrai skosh']], 
         ["Sheikh Aboud Rogo", ["Shake Abud rogo", 
                                "Sheik abod roguo", 
                                'about roguo', 
                                'shaikh rogo']], 
         ["Mohammad Fayez Al-Barsha", ["Mohamad Fayis Barsa", 
                                       "Fayes Barhsa", 
                                       "Mohamed Al barsha", 
                                       'Moamad faes albasha', 
                                       'mohamad fayiz']], 
         ["Wael Mohamad ABED AL RAHMAAN", ["Wel Mohammad ABID RAHMAN", 
                                           "Wael AL-HUSSEINI", 
                                           "Wel Huseni", 
                                           "Mohammad Rehman"]], 
         ["Khaled Suleiman Fayez ABOU HASSAN", ["Khalid Sulayman Fayiz ABU HASAN", 
                                                "Khaled ABU HASSAN", 
                                                "Kahleed Asan", 
                                                "Kohlid Faye Hasin"]], 
         ["David Amos Mazengo", ["Dave Masengo", 
                                 "Daveid Mos Masingo", 
                                 "Dave Masego", 
                                 "Daive Masego"]], 
         ["Shahidwror", ["Shahidwar", 
                         "Shahid war", 
                         "shahedvar", 
                         "sahaedvor"]], 
         ["Fares Chihabi", ["Faresh Chiabi", 
                            "Farez Chivi", 
                            "Faresh Jiavi", 
                            "Farsh Jiabi"]], 
#         ["Cathy Hale", ["Katie Hall", 
#                         "Kathy Hail", 
#                         "Kathy Hall", 
#                         "Cathie Hayl", 
#                         "Catie Hail"]], 
#         ["Brent Miller", ["Brend Miler", 
#                           "Brant Miller", 
#                           "Brend Milla", 
#                           "Brent Milder", 
#                           "Braynt Milla"]], 
#         ["Joseph Bolton", ["Josef Baultan", 
#                            "Josev Boltan", 
#                            "Joseph Baton", 
#                            "Josh Bolton", 
#                            "josaif boton"]], 
#         ["Mariah Cortez", ["Maria Cortes", 
#                            "Mary Gorres", 
#                            "Maraya Caurtes", 
#                            "Mariya Coddes"]], 
         ["DUMMY DATA", ["Gaurav Roy", 
                         "Aleh Vladimir Aziz", 
                         "Dipyaman Choudhury", 
                         "Suner Lahiri", 
                         "Rajendra Kumar", 
                         "Nishikrin Kyoji", 
                         "Rodrigue Patterson", 
                         "Anya D'Souza", 
                         "Joshua Matthers", 
                         "Michael Myers"]]]

## 3.2. Find the optimal thresholds <a class="anchor" id="find-thresholds"></a>

Go to [Table of Contents](#toc)

### 3.2.1. Find the metrics <a class="anchor" id="find-thresholds-part1"></a>

Go to [Table of Contents](#toc)

### Compare Abydos and python-Levenshtein

In [None]:
# Abydos - Distance
# Time one pass of abydos' Levenshtein distance over all (actual, variant)
# pairs. perf_counter is used because time.time() can be too coarse for a
# run that only takes milliseconds.
stat_time = time.perf_counter()
for test in tests:
    for i in test[1]:
        distance.dist_levenshtein(test[0], i)
print(f'Execution Speed: {np.round((time.perf_counter()-stat_time)*1000, 4)} ms')

In [None]:
# Levenshtein
# Time one pass of python-Levenshtein's ratio over all (actual, variant)
# pairs. perf_counter is used because time.time() can be too coarse for a
# run that only takes milliseconds.
stat_time = time.perf_counter()
for test in tests:
    for val in test[1]:
        lev_ratio(test[0], val)
print(f'Execution Speed: {np.round((time.perf_counter()-stat_time)*1000, 4)} ms')

In [None]:
# Summarise the phoneme metrics of the true matches at every 10th percentile.
perc = np.arange(0,1, 0.1)
res = thres_analysis1(names, tests)
display(res.describe(percentiles=perc).T)

In [None]:
# Candidate [name_scorer, token_scorer] pairs.
funcs = [[lev_ratio, lev_ratio], 
         [lev_ratio, lev_seqratio], 
         [lev_ratio, lev_setratio]]
perc = np.arange(0,1, 0.1)
# NOTE(review): func_names is not used anywhere in this cell.
func_names = ["Ratio", "Token Sort Ratio", 
              "Token Set Ratio", "Partial Ratio",
              "Partial Token Sort Ratio", "Partial Token Set Ratio"]
#sub_name = names[names.Name.str.contains(actual_name)]
#display(sub_name)

# The call used to pass `func`, which this cell never defines (NameError on a
# fresh kernel). thres_analysis2 only reads element [1] of the pair, so the
# first pair gives the plain Levenshtein ratio as token scorer.
res = thres_analysis2(names, funcs[0], tests)
display(res.describe(percentiles=perc).T)

From the results above, it is visible that the best metrics are as follows:
- Best name comparison metric : **Levenshtein Set Ratio**
- Best phoneme comparison metric : **Custom Levenshtein ratio**

Therefore, using these to finalize the thresholds for the metric...

### 3.2.2. Find the Lower Limit using True Positive data <a class="anchor" id="find-thresholds-part2"></a>

Go to [Table of Contents](#toc)

In [None]:
# Score the true matches with the two finalized metrics and look at the low
# end of the distribution (0th and 5th percentile, then every 10th).
res = thres_analysis(names, tests)
#res.sort_values(['Lev Set Ratio'])
#res[res['Lev Set Ratio']<0.75]
#res.sort_values(['Custom Lev Ratio'])
#res[res['Custom Lev Ratio']<60]
perc1 = np.arange(0,0.1, 0.05)
perc2 = np.arange(0.1, 1, 0.1)
perc = np.concatenate([perc1, perc2])
display(res.describe(percentiles=perc).T)

In [None]:
# Same scores as the previous cell (recomputed), summarised for the upper
# half of the distribution in 5% steps.
res = thres_analysis(names, tests)
#res.sort_values(['Lev Set Ratio'])
#res[res['Lev Set Ratio']<0.75]
#res.sort_values(['Custom Lev Ratio'])
#res[res['Custom Lev Ratio']<60]
perc = np.arange(0.5,1, 0.05)
display(res.describe(percentiles=perc).T)

From the observations, to catch the variations, it is best to set the thresholds as follows:
- Levenshtein Set Ratio = **0.461**
- Custom Levenshtein ratio = **0.379**

The next stage looks at increasing the limits, for reducing the false positives as much as possible...

### 3.2.3. Find the Upper Limit using False Positive data <a class="anchor" id="find-thresholds-part3"></a>

Go to [Table of Contents](#toc)

In [None]:

# [name_scorer, phoneme_scorer] pair(s) to evaluate.
funcs = [[lev_setratio, lev_ratio]]

perc = np.arange(0,1, 0.1)

#sub_name = names[names.Name.str.contains(actual_name)]
#display(sub_name)

# flag=1: collect the false positives of the NAME metric for the dummy names.
for func in funcs:
    print(func)
    res1 = thres_analysis_unknown(1, names, func, tests)
    display(res1.describe(percentiles=perc).T)

In [None]:
# Summarise res1 with 10% steps up to 0.8, then 0.90 and 0.95.
perc = np.arange(0,0.9, 0.1)
perc2 = np.arange(0.9, 1, 0.05)
perc = np.concatenate([perc, perc2])
display(res1.describe(percentiles=perc).T)

In [None]:
# Name-metric false positives, weakest match first.
res1.sort_values('Lev Set Ratio')

In [None]:
# [name_scorer, phoneme_scorer] pair(s) to evaluate.
funcs = [[lev_setratio, lev_ratio]]


perc = np.arange(0,1, 0.1)

#sub_name = names[names.Name.str.contains(actual_name)]
#display(sub_name)

# flag=2: collect the false positives of the PHONEME metric for the dummy
# names. The result is stored as `res2` (it used to be `res22`), which is the
# name the following cells read.
for func in funcs:
    print(func)
    res2 = thres_analysis_unknown(2, names, func, tests)
    display(res2.describe(percentiles=perc).T)

In [None]:
# Summarise res2 with 10% steps up to 0.8, then 0.90 and 0.95.
perc = np.arange(0,0.9, 0.1)
perc2 = np.arange(0.9, 1, 0.05)
perc = np.concatenate([perc, perc2])
display(res2.describe(percentiles=perc).T)

In [None]:
# Phoneme-metric false positives, weakest match first.
res2.sort_values('Custom Lev Ratio')

In [None]:
# Summarise both false-positive sets on one grid: 0 and 5%, every 10% from
# 10% to 80%, then 90% and 95%.
perc1 = np.arange(0, 0.1, 0.05)
perc2 = np.arange(0.1,0.9, 0.1)
perc3 = np.arange(0.9, 1, 0.05)
perc = np.concatenate([perc1, perc2, perc3])
display(res1.describe(percentiles=perc).T)
display(res2.describe(percentiles=perc).T)

The earlier thresholds suggestions were kept as:
- Levenshtein Set Ratio = *0.461*
- Custom Levenshtein ratio = *0.379*

However, from observing the multiple iterations, it is best to set the thresholds as follows *(keeping around the 95th percentile)*:
- Levenshtein Set Ratio = **0.667**
- Custom Levenshtein ratio = **0.619**

## 3.3. Finding best solution <a class="anchor" id="find-solution"></a>

Go to [Table of Contents](#toc)

Performance is compared on all the test sets using the finalized metrics **Levenshtein Set Ratio & Custom Levenshtein Ratio** and using the corresponding thresholds of **0.667 & 0.619**. The different solutions are listed below:
1. Using Levenshtein set ratio on the names directly ***(Traditional approach)***
2. Using Levenshtein set ratio on the phonemes of the names
3. Using custom Levenshtein ratio to measure similarity between the phonemes of the names
4. Using BOTH Levenshtein set ratio on names AND custom Levenshtein ratio on phonemes of the names for comparison ***(Proposed approach 1)***
5. Using EITHER Levenshtein set ratio on names OR custom Levenshtein ratio on phonemes of the names for comparison ***(Proposed approach 2)***

In [1]:
# Scorer(s) and threshold(s) per solution type 1..5, in that order.
funcs = [lev_setratio, 
         lev_setratio, 
         lev_ratio, 
         [lev_setratio, lev_ratio], 
         [lev_setratio, lev_ratio]]


thresholds = [0.667, 0.667, 0.619, [0.667, 0.619], [0.667, 0.619]]

# Every result tuple returned by solutions(), over all names.
final_results = []

# A list, not a set: a set has no defined column order and recent pandas
# versions refuse it as `columns`.
result_columns = ['Solution', 'Actual', 'Searched', 'Num_of_matches', 'Name_found', 'Time']
frames = []

start_time = time.time()
# The last entry of `tests` holds the dummy (not-in-database) names; skip it.
for actual_name, searched_names in tests[:-1]:
    display(names[names.Name.str.contains(actual_name)])
    for searched_name in searched_names:

        # Results of THIS searched name only. Building the frame from the
        # ever-growing final_results re-added all earlier results under the
        # current Actual/Searched labels.
        name_results = []
        for sol_type, (sol_func, sol_thres) in enumerate(zip(funcs, thresholds), start=1):
            res = solutions(sol_type, names, sol_func, sol_thres, searched_name, actual_name, False)
            if res is not None:
                name_results.append(res)
        final_results.extend(name_results)

        frames.append(pd.DataFrame({"Solution": [val[0] for val in name_results], 
                                    "Actual": actual_name, 
                                    "Searched": searched_name, 
                                    "Num_of_matches": [val[1].shape[0] for val in name_results],
                                    "Name_found": [val[2] for val in name_results], 
                                    "Time": [val[3] for val in name_results]},
                                   columns=result_columns))

# One concat at the end instead of growing `df` inside the loop.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=result_columns)

print(f'---- Total Execution Time: {time.time() - start_time} seconds ----')

NameError: name 'lev_setratio' is not defined

In [None]:
# Display the collected per-solution results.
df

In [None]:
#show_merged_plot(df)
# Persist the results so the plotting section can reload them.
df.to_pickle('Temp_save_data3.pkl')

In [None]:
# First rows of the results.
df.head()

In [None]:
# Summary statistics of the results.
df.describe()

In [None]:
# Audible "run finished" signal. winsound only exists on Windows, so fall
# back to a plain message elsewhere instead of failing on the import.
duration = 1000  # milliseconds
freq = 440  # Hz
try:
    import winsound
except ImportError:
    print('Run finished (no winsound on this platform).')
else:
    winsound.Beep(freq, duration)

# 4. Generating Performance Plots <a class="anchor" id="gen-plots"></a>

Go to [Table of Contents](#toc)

In [None]:
#df.to_pickle('Temp_save_data2.pkl')
# Reload the saved results and cast the match count and the found flag to
# explicit dtypes.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle('Temp_save_data3.pkl')
df.Num_of_matches = df.Num_of_matches.astype('int64')
df.Name_found = df.Name_found.astype('bool')
df.dtypes


In [None]:
# Display the reloaded results.
df

In [None]:
# Average the metrics per solution. Only the metric columns are aggregated:
# taking the mean over the string columns ('Actual', 'Searched') raises a
# TypeError in pandas >= 2.0. The cast keeps this working when the columns
# were stored with object dtype.
metric_cols = ['Num_of_matches', 'Name_found', 'Time']
df2 = (df[['Solution'] + metric_cols]
       .astype({col: 'float64' for col in metric_cols})
       .groupby('Solution')
       .mean()
       .reset_index())
# Share of searches that found the actual name, as a percentage.
df2['Name_found'] = df2['Name_found']*100
df2

In [None]:
# Average the metrics per actual name for the baseline (df3) and for
# proposed solution 1 (df4). Only the metric columns are aggregated: taking
# the mean over the string columns raises a TypeError in pandas >= 2.0.
metric_cols = ['Num_of_matches', 'Name_found', 'Time']

df3 = (df.loc[df['Solution'] == 'Baseline Solution', ['Actual'] + metric_cols]
       .astype({col: 'float64' for col in metric_cols})
       .groupby('Actual')
       .mean()
       .reset_index())
df3['Name_found'] = df3['Name_found']*100

# A bare `df3` in the middle of a cell is never rendered; display it explicitly.
display(df3)

df4 = (df.loc[df['Solution'] == 'Proposed Solution 1', ['Actual'] + metric_cols]
       .astype({col: 'float64' for col in metric_cols})
       .groupby('Actual')
       .mean()
       .reset_index())
df4['Name_found'] = df4['Name_found']*100

display(df4)

In [None]:
#df22 = df2[~df2.Solution.str.contains('Intermediate')]
# Per-solution averages: time/match-count overlay, then true-positive rate.
show_performance_plots(df2, 'Solution', [(0,4),(0,1000)], "List of solutions", (20,5))
show_TP_perc_plot(df2, 'Solution', (40,105), (20,5))


In [None]:
# Baseline solution, broken down by actual name.
show_performance_plots(df3, 'Actual', [(0,7),(0,1000)], "List of Actuals", (20,5))
show_TP_perc_plot(df3, 'Actual', (30,90), (20,5))


In [None]:
# Proposed solution 1, broken down by actual name.
show_performance_plots(df4, 'Actual', [(0,7),(0,300)], "List of Actuals", (20,5))
show_TP_perc_plot(df4, 'Actual', (30,90), (20,5))