# In [20]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections of hashable items.

    Both inputs are converted to sets first, so duplicates and ordering are
    ignored. The result is ``|A & B| / |A | B|``, a float in ``[0.0, 1.0]``.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        The similarity ratio. Two empty inputs are treated as identical and
        yield ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        # Both sets are empty: the ratio is 0/0, so fall back to the usual
        # convention that two empty sets are fully similar.
        return 1.0
    return len(s1 & s2) / len(union)

def merge_data(data1, data2, threshold=0.5):
    """Merge the "chinese_name" and "english_name" word lists of two records.

    For each of the two keys, the ``"word"`` values are pulled out of both
    records and compared with ``jaccard_similarity``. When the score is
    strictly greater than ``threshold`` the two word sequences are interleaved
    by ``merge_phrases``; otherwise they are simply concatenated.

    Args:
        data1: Mapping with "chinese_name" and "english_name" keys, each a
            list of ``{"word": ...}`` dicts.
        data2: Second mapping with the same layout.
        threshold: Similarity cutoff (exclusive) for the interleaving merge.

    Returns:
        A new dict with the same two keys, each a list of ``{"word": ...}``
        dicts.

    NOTE: on the interleaving path the words are joined with spaces and split
    again, so a word that itself contains whitespace comes back as several
    entries.
    """
    combined = {}

    for field in ("chinese_name", "english_name"):
        left = [entry["word"] for entry in data1[field]]
        right = [entry["word"] for entry in data2[field]]

        score = jaccard_similarity(left, right)
        words = (
            merge_phrases(' '.join(left), ' '.join(right)).split()
            if score > threshold
            else left + right
        )

        combined[field] = [{"word": w} for w in words]

    return combined

def merge_phrases(phrase1, phrase2):
    """Interleave two whitespace-separated phrases into one.

    Walks the tokens of ``phrase1`` in order while keeping a cursor into the
    tokens of ``phrase2``. Whenever a ``phrase1`` token also occurs somewhere
    in ``phrase2``, the not-yet-consumed ``phrase2`` tokens up to its next
    occurrence are emitted first and the match itself is consumed; if it no
    longer occurs at or after the cursor, the whole remainder of ``phrase2``
    is emitted. Every ``phrase1`` token is then emitted, and whatever is left
    of ``phrase2`` is appended at the end.

    Args:
        phrase1: First phrase; its tokens all appear in the result, in order.
        phrase2: Second phrase to weave into the first.

    Returns:
        The merged tokens joined by single spaces.
    """
    left = phrase1.split()
    right = phrase2.split()

    merged = []
    cursor = 0  # index of the first token of `right` not yet consumed

    for word in left:
        if word in right:
            try:
                hit = right.index(word, cursor)
            except ValueError:
                # Present in `right`, but only before the cursor: everything
                # that remains gets flushed ahead of this word.
                hit = len(right)
            merged.extend(right[cursor:hit])
            # Step past the matched token, never beyond the end of `right`.
            cursor = min(hit + 1, len(right))
        merged.append(word)

    merged.extend(right[cursor:])

    return ' '.join(merged)

# First sample record: one {"word": ...} entry per token, per language.
data3 = {
    "chinese_name": [{"word": w} for w in ("蓝", "炸", "鸡")],
    "english_name": [{"word": w} for w in ("Blue", "Fried", "Chicken")],
}

# Second sample record, same layout as the first.
data4 = {
    "chinese_name": [{"word": w} for w in ("蓝", "蛙", "炸")],
    "english_name": [{"word": w} for w in ("Blue", "Frog", "Fried")],
}

# Demo run: each pair of name lists shares 2 of 4 distinct words (Jaccard 0.5),
# so a 0.3 threshold sends both keys through merge_phrases. The default of 0.5
# would not, because merge_data uses a strict comparison (sim > threshold).
merged_data = merge_data(data3, data4, threshold=0.3)
print(merged_data)


# Output:
# {'chinese_name': [{'word': '蓝'}, {'word': '蛙'}, {'word': '炸'}, {'word': '鸡'}], 'english_name': [{'word': 'Blue'}, {'word': 'Frog'}, {'word': 'Fried'}, {'word': 'Chicken'}]}
