In [1]:
import re
import pandas as pd

In [2]:
# Load the match table. Later cells read its 'title', 'match_title' and
# 'match_id' columns and add derived columns to this same frame.
results = pd.read_csv('matches.csv')

In [3]:
def jaccard_index(string1, string2):
    """Return the Jaccard similarity of the distinct items in two collections.

    Despite the parameter names, any iterable of hashable items works: the
    notebook passes lists of word bigrams. A plain ``str`` is compared by its
    set of characters.

    Returns ``0`` when either argument is empty (falsy); otherwise
    ``len(A & B) / len(A | B)`` where ``A`` and ``B`` are the two item sets.
    """
    # Guard first: an empty side would otherwise risk a zero-size union.
    if not (string1 and string2):
        return 0

    left, right = set(string1), set(string2)
    shared = left & right
    combined = left | right
    return len(shared) / len(combined)

In [4]:
def extract_words(string):
    """Split text into lowercase words made only of ASCII letters.

    The text is lowercased, every character that is not an ASCII letter or a
    space is replaced by a space, and the result is split on whitespace.
    Punctuation and digits therefore act as word separators, so "God's"
    becomes ``['god', 's']``.
    """
    lowered = string.lower()
    letters_and_spaces = re.sub(r'[^a-zA-Z\ ]', ' ', lowered)
    return letters_and_spaces.split()

In [5]:
def adjacent_pairs(lst):
    """Return each consecutive pair of items in ``lst`` joined by one space.

    For ``['a', 'b', 'c']`` this gives ``['a b', 'b c']``. Inputs with fewer
    than two items produce an empty list.
    """
    # Pair every item with its successor; zip stops at the shorter slice.
    return [' '.join(pair) for pair in zip(lst, lst[1:])]

# Quick sanity check: three items should yield two overlapping pairs.
lst = ['a', 'b', 'c']
print(adjacent_pairs(lst))

['a b', 'b c']


In [6]:
# Score each row by the Jaccard similarity of the word bigrams of its two
# titles. Only the two text columns need their missing values replaced with ""
# (so extract_words can call .lower() on them); the other columns are left
# alone. Walking the two columns pairwise avoids building a row object per
# record and yields a plain list, which also works when the frame is empty.
results["similarity"] = [
    jaccard_index(
        adjacent_pairs(extract_words(title)),
        adjacent_pairs(extract_words(match_title)),
    )
    for title, match_title in zip(
        results["title"].fillna(""),
        results["match_title"].fillna(""),
    )
]

In [7]:
# Distribution of similarity scores across ten equal-width buckets,
# listed in bucket order rather than by frequency.
(
    pd.cut(results["similarity"], bins=10)
    .value_counts()
    .sort_index()
)

similarity
(-0.001, 0.1]    439
(0.1, 0.2]        19
(0.2, 0.3]        16
(0.3, 0.4]        27
(0.4, 0.5]        32
(0.5, 0.6]         5
(0.6, 0.7]         7
(0.7, 0.8]         5
(0.8, 0.9]         1
(0.9, 1.0]       169
Name: count, dtype: int64

In [8]:
# Inspect the weakest matches above the 0.15 cut-off: the 30 lowest-scoring
# rows, in ascending order of similarity.
(
    results
    .query('similarity > 0.15')
    .sort_values(by='similarity')
    .head(30)
)

Unnamed: 0,id,title,match_title,match_id,similarity
701,698,O God Of Love To Thee We Bow,"O God our Father, we would come to Thee",55.0,0.153846
105,106,Hallelujah For The Cross,The cross it standeth fast,620.0,0.166667
238,239,"Spring Up, O Well","Spring up, well, with water",250.0,0.166667
273,273,"Deeper, Deeper","Deeper, deeper, in the cross of Jesus",1240.0,0.166667
283,283,Consider Him,"Consider Him, let Christ thy pattern be",656.0,0.166667
244,245,Fill Me Now,Fill me with Thy gracious Spirit,267.0,0.166667
584,582,Dear Lord! Precious Lord,"Dear Lord Jesus, precious Jesus",1158.0,0.166667
483,483,Take Thou My Hand,"Hold Thou my hand: so weak I am, and helpless",388.0,0.2
136,137,"Lord, Thou Art God's Anointed","Dear Lord, Thou art the Son of God",188.0,0.2
312,312,I Surrender All,All to Jesus I surrender,441.0,0.2


In [9]:
# Build the URL for each matched hymn. match_id is cast to the nullable
# 'Int64' dtype first and then to 'string' before being appended to the
# base URL.
results['match_url'] = (
    'https://www.hymnal.net/en/hymn/h/'
    + results['match_id'].astype('Int64').astype('string')
)

In [10]:
# Persist the scored table, ordered by ascending similarity, without the index.
# NOTE(review): this writes to the same 'matches.csv' that the notebook loads
# at the top, so each run replaces the input with a re-sorted copy that also
# carries the derived columns — confirm that overwriting is intended.
(
    results
    .sort_values(by='similarity')
    .to_csv('matches.csv', index=False)
)