In [3]:
import pickle
import numpy as np

## 讀取清心全品項的資料

In [4]:
# Load the pickled drink menu (the preview in the next cell shows a list of
# drink-name strings).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files from a trusted source.
with open('./drinks.pkl', mode='rb') as f:
    drink_list = pickle.load(f)

In [8]:
# Preview the first ten entries of the loaded list as a quick sanity check.
drink_list[:10]

['原鄉四季',
 '特級綠茶',
 '烏龍綠茶',
 '翡翠烏龍',
 '極品菁茶',
 '錫蘭紅茶',
 '特選普洱',
 '奶茶系列',
 '錫蘭奶紅',
 '特級奶綠']

## 使用 Levenshtein distance 計算相似度
可參考 https://rust-algo.club/levenshtein_distance/

注意：此處的實作將「取代」的成本設為 2（新增、刪除的成本為 1），與標準 Levenshtein distance 取代成本為 1 的定義不同。

In [14]:
def levenshtein_ratio_and_distance(s, t):
    """Return a similarity ratio in [0, 1] between the sequences ``s`` and ``t``.

    The ratio is derived from a Levenshtein-style edit distance in which an
    insertion or a deletion costs 1 and a substitution costs 2 (so replacing
    a character is never cheaper than deleting it and inserting another):

        ratio = (len(s) + len(t) - distance) / (len(s) + len(t))

    Despite the function name, only the ratio is returned.

    Args:
        s: First string to compare.
        t: Second string to compare.

    Returns:
        float: 1.0 when the inputs are identical (two empty strings
        included), 0.0 when they have no character in common (which is
        always the case when exactly one of them is empty).
    """
    total_len = len(s) + len(t)
    if total_len == 0:
        # Two empty strings are identical; this also avoids dividing by zero.
        return 1.0

    rows = len(s) + 1
    cols = len(t) + 1
    distance = np.zeros((rows, cols), dtype=int)

    # Border cells: turning a prefix into the empty string (or the reverse)
    # costs one edit per character. The two borders are filled independently
    # so that each is initialised even when the other string is empty.
    for i in range(1, rows):
        distance[i, 0] = i
    for k in range(1, cols):
        distance[0, k] = k

    for col in range(1, cols):
        for row in range(1, rows):
            # Matching characters are free; a mismatch costs 2.
            cost = 0 if s[row - 1] == t[col - 1] else 2
            distance[row, col] = min(
                distance[row - 1, col] + 1,         # delete
                distance[row, col - 1] + 1,         # insert
                distance[row - 1, col - 1] + cost,  # substitute / match
            )

    # Read the bottom-right cell explicitly instead of relying on the loop
    # variables, which are undefined when either loop body never runs.
    return float((total_len - distance[rows - 1, cols - 1]) / total_len)

## 根據相似度排序結果，回傳最相近的品項；如果沒有任何品項的相似度超過 0.5，則直接回傳使用者輸入的簡稱

In [15]:
def fuzzyfinder(user_input, collection, threshold=0.5):
    """Return the item in ``collection`` that is most similar to ``user_input``.

    Every item is scored with ``levenshtein_ratio_and_distance``; only items
    whose ratio is strictly greater than ``threshold`` are considered.

    Args:
        user_input: The query string (e.g. an abbreviated name).
        collection: Iterable of candidate strings.
        threshold: Minimum ratio (exclusive) a candidate must exceed.
            Defaults to 0.5.

    Returns:
        The candidate with the highest ratio; candidates tied on ratio are
        resolved in favour of the item that compares greatest. If no
        candidate exceeds ``threshold``, ``user_input`` is returned unchanged.
    """
    suggestions = []
    for item in collection:
        ratio = levenshtein_ratio_and_distance(user_input, item)
        if ratio > threshold:
            suggestions.append((ratio, item))

    # Explicit empty check instead of a bare ``except`` around indexing, so
    # unrelated errors are never silently swallowed.
    if not suggestions:
        return user_input

    # Tuples compare by ratio first, then by item, which selects the same
    # element as sorting the whole list and taking the last entry.
    return max(suggestions)[1]

## 測試

In [22]:
# Sample queries that are passed to fuzzyfinder in the next cell.
inputs = [
    "普洱",
    "原鄉",
    "紅茶",
    "烏奶",
    "果醋",
]

In [23]:
# Print the closest entry of drink_list for each sample query.
for item in inputs:
    print(fuzzyfinder(user_input=item, collection=drink_list))

特選普洱
原鄉四季
錫蘭紅茶
烏龍奶茶
蘋果醋
