# Similarity of descriptions (reviews)
## Following W4
* Shingles - k-shingle is any substring of length k found within the document.
* Minhashing - Hashing each shingle to a number and keeping the minimum of the hash values.
* Signatures - Minhashing for multiple hash functions and keeping the minhash values for each hash function in a list.
* Jaccard similarity - The fraction of the number of elements in the intersection of two sets and the number of elements in the union of the two sets. Approximated by counting number of minhash values in the signature lists that match and dividing by the number of hash functions.
* Locality-Sensitive Hashing (LSH) - Dividing the signature matrix into bands and hashing the bands. If two signatures are similar, they will hash to the same bucket with high probability.



In [237]:
import csv
import mmh3
import pandas as pd
from tqdm import tqdm
import random

In [238]:
# Load descriptions from data/movies_metrics.csv
Movies = pd.read_csv('data/movie_titles_and_ids.csv')
Movies.head(2)

Unnamed: 0,Name,ID,Year,Length,Age,Rating,Description
0,The Shawshank Redemption,tt0111161,1994,2h 22m,15,9.3 (3M),A banker convicted of uxoricide forms a friend...
1,The Godfather,tt0068646,1972,2h 55m,15,9.2 (2.1M),The aging patriarch of an organized crime dyna...


In [239]:
descriptions = Movies['Description']

### Preprocessing (Nojan)

In [240]:
from bs4 import BeautifulSoup
import nltk
import re 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [241]:

def simple_preprocess_text(corpus):
    # Remove HTML tags
    corpus = [BeautifulSoup(text, "html.parser").get_text() for text in corpus]

    # Remove urls
    corpus = [re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE) for text in corpus]
    
    # Remove non-alphabetic characters
    corpus = [re.sub("[^a-zA-Z]", " ", text) for text in corpus]

    # Convert to lowercase
    corpus = [text.lower() for text in corpus]

    return corpus

def preprocessing_lemmatization(corpus):
    # Tokenize the text (split it into words)
    corpus = [word_tokenize(text) for text in corpus]

    # Remove stop words (The, a, on, etc)
    stop_words = set(stopwords.words("english"))
    corpus = [[word for word in text if word not in stop_words] for text in corpus]

    # Lemmatization,
    # AKA remove word endings to get the base form
    lemmatizer = nltk.stem.WordNetLemmatizer()
    corpus = [[lemmatizer.lemmatize(word) for word in text] for text in corpus]

    # Join the words back into one string
    corpus = [" ".join(text) for text in corpus]

    return corpus

descriptions = simple_preprocess_text(descriptions)
descriptions = preprocessing_lemmatization(descriptions)

### Shingle

In [242]:
def shingle(s, q):
    return {s[i:i+q] for i in range(len(s) - q + 1)}

### Minhashing and Signature

In [243]:
import hashlib
def listhash(shingle, seed):
	return int(hashlib.sha256((shingle + str(seed)).encode('utf-8')).hexdigest(), 16) % 2**32-1

def minhash(shingles, seed):
    return min(listhash(s, seed) for s in shingles)

def signature(shingles, k):
	return [minhash(shingles, seed) for seed in range(k)]

### Signature for entire dataset

In [244]:
q = 5 # length of shingle  (5 ok for emails)
k = 100 # number of minhashes
keys = [f'{name} ({year})' for name, year in zip(Movies['Name'], Movies['Year'])]
# docs = Movies.set_index('Name')['Description'].to_dict() # dictionary mapping document id to document contents
docs = {key : desc for key, desc in zip(keys, descriptions)}

Observing that there will be an intersection of shingles in documents, this can be used to define the signatures for the entire dataset. Instead of iterating over documents and defining signatures individually, iterate over shingles

In [245]:
def MakeSignatures(docs, shingles, U):
    SIG = {doc : [float('inf') for _ in range(k)] for doc in docs}

    for s in tqdm(U):
        # hashes = signature(s, k)
        hashes = [listhash(s, seed) for seed in range(k)]
        for doc in docs:
            if s in shingles[doc]:
                for i in range(k):
                    SIG[doc][i] = min(hashes[i], SIG[doc][i])
    return SIG

shingles = {doc: shingle(docs[doc], q) for doc in docs}
U = set([s for doc in shingles for s in shingles[doc]])  # All shingles

signatures = MakeSignatures(docs, shingles, U)

  0%|          | 0/9105 [00:00<?, ?it/s]

100%|██████████| 9105/9105 [00:05<00:00, 1544.59it/s]


### Jaccard similarity

In [246]:
def jaccard(S, T):
    intersection = len(S & T)
    union = len(S | T)
    return intersection / union

# Approximate Jaccard similarity using minhash signatures
def approximate_jaccard(A, B, signatures):
    return sum(i == j for i, j in zip(signatures[A], signatures[B])) / k

In [None]:
# Test
A, B = random.sample(keys, 2)
print(A, '-', B)

doc1 = docs[A]
doc2 = docs[B]
shingle1 = shingle(doc1, q)
shingle2 = shingle(doc2, q)
signature1 = signature(shingle1, k)
signature2 = signature(shingle2, k)

# Exact Jaccard similarity
print(jaccard(shingle1, shingle2))

# Approximate Jaccard similarity
print(approximate_jaccard(A, B, signatures))

Shrek Forever After (2010) - Prince of Persia: The Sands of Time (2010)
0.012145748987854251
0.01


In [248]:
# Find most similar by description
mx = 0.0
most_similar = None
for i, A in enumerate(keys):
    for j in range(i+1, len(keys)):
        B = keys[j]
        # shingle1 = shingle(docs[A], q)
        # shingle2 = shingle(docs[B], q)
        # sim = jaccard(shingle1, shingle2)
        sim = approximate_jaccard(A, B, signatures)
        if sim > mx:
            mx = sim
            most_similar = (A, B)
print(mx, most_similar)
# Jaccard 

0.15 ('Alice in Wonderland (2010)', 'Mystic River (2003)')


In [249]:
A, B = random.sample(keys, 2)
print(A, '-', B)
# Exact Jaccard similarity
print(jaccard(shingle1, shingle2))

# Approximate Jaccard similarity
matches = sum(1 for i, j in zip(signature1, signature2) if i == j)
print(matches)
print(matches / k)
# jaccard(docs[A], docs[B]), approximate_jaccard(A, B, signatures)

Harry Potter and the Half-Blood Prince (2009) - Prince of Persia: The Sands of Time (2010)
1.0
100
1.0


### Locality-Sensitive Hashing
Pick $b$ and $r$ such that $k = br$ and $(1/b)^{1/r} \approx s$, where $s$ is the similarity threshold.

In [250]:
(1/b)**(1/r)

0.1414213562373095

In [254]:
def fill_buckets(signatures, b, r):
    buckets = {i: {} for i in range(b)}
    for doc in signatures:
        for i in range(b):
            h = hash(tuple(signatures[doc][i*r:(i+1)*r]))
            if h not in buckets[i]:
                buckets[i][h] = []
            buckets[i][h].append(doc)
    return buckets

def lsh(signatures, threshold):
    b = 50
    r = k // b
    buckets = fill_buckets(signatures, b, r)

    candidates = set()
    for bucket in buckets:
        for h in buckets[bucket]:
            if len(buckets[bucket][h]) > 1:
                for doc in buckets[bucket][h]:
                    candidates.add(doc)
    candidates = list(candidates)
    
    # Now compare all pairs of candidates
    similar = []
    for i, A in enumerate(candidates):
        for j in range(i+1, len(candidates)):
            B = candidates[j]
            sim = approximate_jaccard(A, B, signatures)
            if sim > threshold:
                similar.append((A, B, sim))
    return similar

lsh(signatures, 0.1)

[('Diary of the Dead (2007)', 'Dead Snow (2009)', 0.11),
 ('The Sixth Sense (1999)', 'Good Will Hunting (1997)', 0.12),
 ('Resident Evil: Apocalypse (2004)',
  'Resident Evil: Extinction (2007)',
  0.14)]