In [2]:
import nltk
from nltk.book import * #text1, text2, text3, text4, text5, text6, text7, text8, text9
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import itertools
import matplotlib.pyplot as plt
import hashlib
import numpy as np

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
def remove(text):

    """
    Strip punctuation and English stopwords from a text and reduce the remaining
    words to their Porter stems.

    Parameters:
            text(String | Iterable[String]): either a raw string or an iterable of
                                             tokens (e.g. one of the nltk.book texts)

    Return:
            cleared(List[String]): the stems of every alphanumeric, non-stopword token,
                                   in their original order

    """

    # A raw string must be tokenized as-is: iterating over it yields single
    # characters, and joining those with spaces would turn every character
    # into its own token. Token iterables are joined into one string first.
    raw = text if isinstance(text, str) else ' '.join(text)

    # Runs of word characters become one token, runs of punctuation another.
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
    tokens = tokenizer.tokenize(raw)

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # isalnum() rejects the punctuation tokens (and any token containing '_');
    # the stopword comparison is case-insensitive.
    return [
        stemmer.stem(token)
        for token in tokens
        if token.isalnum() and token.lower() not in stop_words
    ]

# Cleaned, stemmed token list for each of the nine nltk.book texts, in order.
texts = list(map(remove, (text1, text2, text3, text4, text5, text6, text7, text8, text9)))

In [4]:
# Draw n random bit strings of length l and find the ordered pair (s1, s2)
# whose concatenation s1 + s2 has the numerically smallest SHA-1 digest.
l = 100   # length of every bit string
n = 1000  # number of bit strings

# Fixed seed so that Restart & Run All reproduces the same strings and result.
rng = np.random.default_rng(42)
S = ["".join(rng.choice(['0', '1'], l).tolist()) for _ in range(n)]

# Named min_hash / digest (not min / hash) so the builtins min() and hash()
# stay usable in every later cell of the notebook.
min_hash = float('inf')
pair = None  # stays None when fewer than two strings are available

# permutations rather than combinations: s1 + s2 and s2 + s1 are different inputs.
for s1, s2 in itertools.permutations(S, 2):
    digest = int(hashlib.sha1((s1 + s2).encode()).hexdigest(), 16)

    if digest < min_hash:
        min_hash = digest
        pair = [s1, s2]

print(min_hash, pair)

# Result recorded from an earlier, unseeded run (it will not match the seeded output):
# 34200239543057730708897409415012917584018 
# ['1100011101100001001010110010000001110100000010001111011100111101010100010000011001100111001111101111', 
# '1000001100100000111011111000011011100000100000110000011010111111110101110001011001000101100110001111']

1640844019907389903271413642195655937062321 ['0101100000010001100101110011011101111011001100010001011001001000000111001111011111000010111010010111', '1010000010100000001110010101010001110011011010010111110111111001011011101100011101100111110011101001']


In [14]:
# MinHash comparison of the first three cleaned texts.
NUM_TEXTS = 3      # only texts[0], texts[1] and texts[2] are compared
NUM_HASHES = 100   # number of salted SHA-1 hash functions per signature
MAX_WORD_LEN = 8   # keep only stems strictly shorter than this


def minhash_signature(words, num_hashes):
    """
    Build the MinHash signature of a collection of words.

    Hash function number i is SHA-1 applied to str(i) followed by the word;
    entry i of the signature is the smallest such hash over all words.

    Parameters:
            words(Iterable[String]): the words to summarise
            num_hashes(int): number of salted hash functions to use

    Return:
            signature(List[int]): num_hashes minimum hash values; an entry is
                                  float('inf') when words is empty
    """
    # Encode each word once instead of once per hash function.
    encoded_words = [word.encode() for word in words]

    signature = []
    for salt in range(num_hashes):
        prefix = str(salt).encode()
        # Explicit comparison loop: does not rely on the builtin min(), which an
        # earlier cell of this notebook may have rebound to a number.
        smallest = float('inf')
        for encoded in encoded_words:
            value = int(hashlib.sha1(prefix + encoded).hexdigest(), 16)
            if value < smallest:
                smallest = value
        signature.append(smallest)
    return signature


S3 = [{word for word in texts[i] if len(word) < MAX_WORD_LEN} for i in range(NUM_TEXTS)]

row_indices = range(len(S3))
combinations = list(itertools.combinations(row_indices, 2))
# Pairs of the word sets themselves (S3, not the unrelated random strings in S).
row_pairs = [(S3[i], S3[j]) for i, j in combinations]

# Exactly one signature per text.
hashes = [minhash_signature(words, NUM_HASHES) for words in S3]

# Number of signature positions on which two texts agree, out of NUM_HASHES;
# dividing by NUM_HASHES gives the estimated Jaccard similarity.
jaccard = {}
for first, second in combinations:
    jaccard[f"texts {first} and {second}"] = sum(
        int(a == b) for a, b in zip(hashes[first], hashes[second])
    )

print(jaccard)



{'texts 0 and 1': 29, 'texts 0 and 2': 11, 'texts 1 and 2': 16}
