# Fuzzy Module

## Initialisation

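Import the edit-distance (Levenshtein) and phonetic (Soundex) libraries used for matching, together with `unittest` and the shared `Printable` base class from the project's `common` module.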

In [1]:
import Levenshtein

from pyphonetics import Soundex

import unittest

from common import Printable

## Nicknames and Exclusions

TODO - move these lists into external files in projdir/config

In [2]:
# Step 1 of matching: each space/hyphen separated part of a name that appears
# as a key here is replaced by the corresponding formal name.
nicknames = {
    'Andy': 'Andrew',
    'Bob': 'Robert',
    'Charlie': 'Charles',
    'Dan': 'Daniel',
    'Dave': 'David',
    'Ed': 'Edward',
    'Eddie': 'Edward',
    'Jim': 'James',
    'Joe': 'Joseph',
    'Matt': 'Matthew',
    'Mike': 'Michael',
    'Pete': 'Peter',
    'Phil': 'Philip',
    'Rob': 'Robert',
    'Si': 'Simon',
    'Steve': 'Steven',
    'Stephen': 'Steven',
    'Tim': 'Timothy',
    'Will': 'William',
}

# Step 2 of matching: pairs of full names (written in their post-nickname
# form) that must never be treated as the same person, however similar.
exclusions = {
    'Michael Price': 'Michael Pearce',
    'Ruben Hofman': 'Robin Hofman',
    'Richard Powell': 'Richard Peel',
    'Simon Hemsley': 'Simon Hinkley',
}

## Fuzzy Match Class

Generic class to fuzzy match two names

In [3]:
class FuzzyMatch(Printable):
    '''Decide whether two personal names probably refer to the same person.

    Names are first normalised with the module-level ``nicknames`` table,
    then vetoed by the module-level ``exclusions`` table, and finally compared
    using Levenshtein edit distance and Soundex codes.
    '''

    def __init__(self, verbosity=1):
        '''Store the verbosity level.

        The value is not read anywhere in this class; presumably it is
        consumed by ``Printable`` - TODO confirm.
        '''

        self.verbosity = verbosity


    @staticmethod
    def _expandNicknames(fullName):
        '''Split a name on spaces and hyphens, swapping nicknames for formal names.

        Empty parts produced by repeated separators (e.g. "Dave - Vinnie") are
        deliberately kept so that the re-joined string, and therefore the edit
        distance computed from it, is not altered by the split.
        '''

        return [nicknames.get(part, part) for part in fullName.replace('-', ' ').split(' ')]


    def matchNames(self, nameOne, nameTwo):
        '''Simple method to determine if two names are a likely match.

        Args:
            nameOne: first full name, parts separated by spaces and/or hyphens.
            nameTwo: second full name, same format.

        Returns:
            True when, after nickname expansion, either
              * the edit distance is 0 or 1 and the first and last initials
                agree (this rule also applies to single-word names), or
              * both names have more than one part, the edit distance is at
                most 3 and the first and last parts have equal Soundex codes.
            False otherwise, including when the pair is listed in
            ``exclusions`` or when either name contains no usable parts.

        Raises:
            Any exception raised by the distance/phonetic libraries is
            re-raised after the offending pair has been printed.
        '''

        # Nickname processing for both names
        namesOne = self._expandNicknames(nameOne)
        namesTwo = self._expandNicknames(nameTwo)
        nameOneModified = ' '.join(namesOne)
        nameTwoModified = ' '.join(namesTwo)

        # Quick exit when the first initial does not match. This is done on the
        # expanded names: checking the raw strings rejected nickname pairs whose
        # initial changes on expansion, e.g. 'Bob Date' vs 'Robert Date'.
        if nameOneModified[:1] != nameTwoModified[:1]:
            return False

        # Test the fixed list of exclusions (in both directions)
        if exclusions.get(nameOneModified) == nameTwoModified:
            return False
        if exclusions.get(nameTwoModified) == nameOneModified:
            return False

        # Initials of every non-empty part; empty parts come from repeated separators
        initialsOne = [name[:1].upper() for name in namesOne if name]
        initialsTwo = [name[:1].upper() for name in namesTwo if name]

        # A name with no usable parts (empty or separators only) cannot be matched
        if not initialsOne or not initialsTwo:
            return False

        # Now attempt the fuzzy name matching using Levenshtein and soundex - crude but relatively effective
        try:
            distance = Levenshtein.distance(nameOneModified, nameTwoModified)

            # Levenshtein distance of 0-1 is allowed if the first and last initials are the same
            if distance < 2 and \
                    initialsOne[0] == initialsTwo[0] and \
                    initialsOne[-1] == initialsTwo[-1]:
                return True

            # Levenshtein distance of up to 3 is only allowed for multi-part names whose
            # first and last parts start with a letter and "sound the same"
            if distance < 4 and \
                    len(namesOne) > 1 and len(namesTwo) > 1 and \
                    namesOne[0][:1].isalpha() and namesTwo[0][:1].isalpha() and \
                    namesOne[-1][:1].isalpha() and namesTwo[-1][:1].isalpha():
                soundex = Soundex()
                return soundex.phonetics(namesOne[0]) == soundex.phonetics(namesTwo[0]) and \
                    soundex.phonetics(namesOne[-1]) == soundex.phonetics(namesTwo[-1])
        except Exception:
            # Report which pair failed, then let the caller see the original error
            print('Crashed comparing {} and {}'.format(nameOne, nameTwo))
            raise

        return False

## Unit Tests

A handful of very basic tests

In [4]:
class TestFuzzyMatch(unittest.TestCase):
    '''Class to test FuzzyMatch class'''
    
    def testTypo01(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Graham Holbert', 'Graham Hulbert'))


    def testTypo02(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Trevor Watford', 'Trevor Whatford'))


    def testTypo03(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Mathew Spoonert', 'Matthew Spooner'))


    def testTypo04(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Kin Newman', 'Kim Newman'))


    def testTypo05(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Piere Saville', 'Pierre Saville'))


    def testTypo06(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Jon Montgomery', 'John Montgomery'))


    def testTypo07(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Alistair Williams', 'Alastair Williams'))


    def testNickname01(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Pete Martin', 'Peter Martin'))


    def testNickname01(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Dave MacInnes', 'David MacInnes'))


    def testNickname02(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Dan Robinson', 'Daniel Robinson'))


    def testNickname03(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Jim Paine', 'James Paine'))


    def testNickname04(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Matt Spooner', 'Matthew Spooner'))


    def testNickname05(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Eddie Murrell', 'Edward Murrell'))


    def testNickname06(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Ed Murrell', 'Edward Murrell'))


    def testNickname07(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Robert Date', 'Bob Date'))


    def testNickname08(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Stephen Davison', 'Steve Davidson'))


    def testNickname09(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Michael Walklin', 'Mike Walklin'))


    def testVariation01(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Hans-Juergen', 'Hans-Jürgen'))


    def testVariation02(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(True, fuzzyMatch.matchNames('Claude van Martyn', 'Claude Van-Martyn'))


    def testVariation03(self):

        # Ideally I'd like this to match but it is at the expense of matching initials
        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Claude Van-Martyn', 'Claude Van-Man-Martyn'))


    def testVariation04(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        # Ideally I'd like this to match but it contains typos and a middle name
        self.assertEqual(False, fuzzyMatch.matchNames('Trevor Lyn Whatford', 'Trevor Whatford'))


    def testNonMatch01(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Alex Montgomery', 'John Montgomery'))


    def testNonMatch02(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Robin Ball', 'Kevin Hall'))


    def testNonMatch03(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Nick Beaney', 'Nick Povey'))


    def testNonMatch04(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Andy', 'Amy'))


    def testNonMatch05(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Richard Jenkins', 'Richard Jones'))


    def testNonMatch06(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Jeremy Waitt', 'Jeremy Walwin'))


    def testNonMatch07(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Dave Ellerman', 'Dave Ellerbeck'))


    def testNonMatch08(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Simon Moore', 'Simon Maguire'))    
        

    def testExclusion01(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Ruben Hofman', 'Robin Hofman'))


    def testExclusion02(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Mike Price', 'Mike Pearce'))


    def testExclusion03(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Mike Pacey', 'Mike Pearce'))


    def testExclusion04(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Mike Price', 'Mike Pacey'))


    def testExclusion05(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Alex Bradley', 'Alex Bailey'))


    def testExclusion06(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Robert Dunn', 'Robert Date'))


    def testExclusion07(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Roger Clark', 'Roger Crabb'))


    def testExclusion09(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Stephen Corps', 'Stephen Cole'))


    def testExclusion10(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Simon Hemsley', 'Simon Hinkley'))


    def testExclusion11(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Richard Powell', 'Richard Peel'))
        

    def testCrash01(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Dave - Vinnie - Standing', 'Dave Vinne Standing'))


    def testCrash02(self):

        fuzzyMatch = FuzzyMatch(verbosity=0)
        self.assertEqual(False, fuzzyMatch.matchNames('Richard Jones (GBR-32)', 'Richard Trubger'))

In [5]:
# Run the test suite only when this notebook/script is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    # argv is overridden so unittest does not try to parse the host process's
    # own command-line arguments; exit=False stops unittest.main() from calling
    # sys.exit() once the tests have finished.
    unittest.main(exit=False, argv=['first-arg-is-ignored'])

........................................
----------------------------------------------------------------------
Ran 40 tests in 0.111s

OK


## Run Unit Tests

Note: Only run unit tests when running this script directly, not during an import

## All Done!