In [3]:
import pandas as pd

def get_shingles(text, k):
    """
    :param text: a list of words produced by splitting the article 
    :param k: number of words in each shingle
    
    return a set with all possible shingle in the article
    """
    shingles = set()
    for i in range(len(text) - k + 1):
        new_shingle = ""
        for word in text[i:i+k]:
            new_shingle += word
            new_shingle += " "
        shingles.add(new_shingle)
    return shingles

def sim(shingles_a, shingles_b):
    """
    :param shingles_a, shingles_b: a set with all possible shingle in the corresponding article
    
    return the similarity score of the to shingles using the Jaccard Similarity
    """
    numer = 0
    denom = 0
    for item in shingles_a:
        if item in shingles_b:
            print("'{}'".format(item.strip()), end = ', ')
            numer += 1
    denom = len(shingles_a) + len(shingles_b) - numer
    sim = numer/denom
    return sim

def fun(char):
    '''
    filter function, is used to delete all puntuation
    
    '''
    filters = [' ']
    for i in range(97, 123):
        filters.append(chr(i))
    if char in filters:
        return True
    else:
        return False

def splite_words(string):
    '''
    convert string of article to words
    
    input: string (article)
    
    return: list of words
    '''
    data = to_english(string)
    data = data.lower()
    data = ''.join(filter(fun, data))
    data = data.split()
    
    return data

def to_english(string):
    '''
    convert non english characters to english
    
    input: string that may include non english characters
    
    return: normalized english string
    
    '''
    import unicodedata
    return ''.join(c for c in unicodedata.normalize('NFD', string) if unicodedata.category(c) != 'Mn')

def frequency (article):
    '''
    find the frequency of words in an article
    
    input: a string of article
    
    return: dictionary of frequenct
    '''
    
    words = splite_words(article)
    series = pd.Series(words)
    counts = series.value_counts()
    freq = counts / len(series)
    
    return freq.to_dict()


with open('HK.txt', 'r') as file:
    HK1 = file.read()

# # print(len(HK))

# # HK = splite_words(HK)

# freq_HK = frequency(HK)

# print (freq_HK)


with open('HK2.txt', 'r') as file:
    HK2 = file.read()

# # print(len(HK))

# # HK = splite_words(HK)

# freq_HK2 = frequency(HK2)

# print (freq_HK2)


K = 2

HK1_splitted = splite_words(HK1)
HK2_splitted = splite_words(HK2)
shingles_HK1 = get_shingles(HK1_splitted, K)
shingles_HK2 = get_shingles(HK2_splitted, K)


with open('NBA1.txt') as file:
    NBA1 = file.read()
with open('NBA2.txt') as file:
    NBA2 = file.read()
with open('NBA3.txt') as file:
    NBA3 = file.read()
with open('NBA4.txt') as file:
    NBA4 = file.read()
with open('NBA5.txt') as file:
    NBA5 = file.read()
    
NBA1_splitted = splite_words(NBA1)
shingles_NBA1 = get_shingles(NBA1_splitted, K)
NBA2_splitted = splite_words(NBA2)
shingles_NBA2 = get_shingles(NBA2_splitted, K)
NBA3_splitted = splite_words(NBA3)
shingles_NBA3 = get_shingles(NBA3_splitted, K)
NBA4_splitted = splite_words(NBA4)
shingles_NBA4 = get_shingles(NBA4_splitted, K)
NBA5_splitted = splite_words(NBA5)
shingles_NBA5 = get_shingles(NBA5_splitted, K)

print('K =', K)
print('Similarity between:')
print()
print()
print('between politics:')
print('HK1 and HK2:', round(sim(shingles_HK1, shingles_HK2),6))
print()
print()
print('between sports:')
print('NBA1 and NBA2:', round(sim(shingles_NBA1, shingles_NBA2),6))
print()
print('NBA1 and NBA3:', round(sim(shingles_NBA1, shingles_NBA3),6))
print()
print('NBA1 and NBA4:', round(sim(shingles_NBA1, shingles_NBA4),6))
print()
print('NBA1 and NBA5:', round(sim(shingles_NBA1, shingles_NBA5),6))
print()
print('NBA2 and NBA5:', round(sim(shingles_NBA2, shingles_NBA5),6))
print()
print()
print('between one politics and one sports')
print('HK1 and NBA1:', round(sim(shingles_HK1, shingles_NBA1),6))
print()
print('HK2 and NBA1:', round(sim(shingles_HK2, shingles_NBA1),6))

K = 2
Similarity between:


between politics:
'the death', 'fell from', 'hong kong', 'from a', 'over the', 'chief executive', 'protesters in', 'during a', 'police and', 'executive carrie', 'that began', 'death of', 'across hong', 'have been', 'on the', 'in the', 'extradition bill', 'between police', 'who fell', 'and the', HK1 and HK2: 0.025189


between sports:
'mtassociated pressfacebooktwitterfacebook', 'first half', 'into the', 'by in', 'give it', 'a with', 'with a', 'including a', 'to give', 'the denver', 'one of', 'and a', 'with the', 'over the', 'at home', 'the lead', 'seconds left', 'rebounds and', 'have to', 'to help', 'it to', 'loss in', 'said i', 'pair of', 'they were', 'games of', 'the game', 's in', 'pm mtassociated', 'the third', 'after a', 'on the', 'in the', 'in a', 'i thought', 'had points', 'the stretch', 'win over', 'a lot', 'of s', 'a pair', 'the second', 'left to', 'to the', 'hit a', 'from the', 'of the', 'down the', 'the first', 'just seconds', 'we didnt', 'points 