In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import pickle
import nltk
import time
import warnings
import re
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('twitter_samples')
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /Users/vuhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/vuhan/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


# Data Preparation

In [2]:
# Load the raw tweet texts from NLTK's twitter_samples corpus, one list per sentiment file.
all_pos_tweets = twitter_samples.strings('positive_tweets.json')
all_neg_tweets = twitter_samples.strings('negative_tweets.json')
# Positive tweets come first, then the negative ones; a tweet's position in this list is its document id.
all_tweets = all_pos_tweets + all_neg_tweets

In [3]:
# Load the pre-computed word -> embedding dictionaries.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files from a trusted source.
# The `with` blocks make sure each file handle is closed as soon as its dictionary is loaded.
with open("./data/en_embeddings.p", "rb") as en_embeddings_file:
    en_embeddings_subset = pickle.load(en_embeddings_file)
with open("./data/fr_embeddings.p", "rb") as fr_embeddings_file:
    fr_embeddings_subset = pickle.load(fr_embeddings_file)

In [24]:
# Report how many words each embedding dictionary holds.
print(f"Number of English words in dictionary: {len(en_embeddings_subset)}")
print(f"Number of French words in dictionary: {len(fr_embeddings_subset)}")

# The length of the first stored English vector is taken as the embedding dimensionality.
print(f"Dimensions of each words in dictionary: {len(list(en_embeddings_subset.values())[0])}")

Number of English words in dictionary: 6370
Number of French words in dictionary: 5766
Dimensions of each words in dictionary: 300


In [5]:
def get_dict(file_name):
    """
    Build an English-to-French dictionary from a space-delimited word-pair file.

    Input:
        - file_name: path of a text file whose first column holds the English word
          and whose second column holds its French translation.
    Output:
        - etof: dictionary mapping each English word to its French translation.
          When an English word occurs on several rows, the last row wins.

    Note: the file is parsed with pandas' default header handling, so its first
    line is consumed as column names and is not part of the returned dictionary.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')

    # Select both columns explicitly by position. The previous `my_file.loc[i][0]`
    # relied on the deprecated integer fallback of label-based Series indexing and
    # built a new row Series for every entry.
    english_words = my_file.iloc[:, 0]
    french_words = my_file.iloc[:, 1]

    # dict() keeps the last value for repeated keys, like repeated assignment did.
    etof = dict(zip(english_words, french_words))
    return etof

In [6]:
# loading the english to french dictionaries
# Build the English -> French lookup dictionaries for the training and test splits
# and report how many word pairs each one holds.
en_fr_train = get_dict('./data/en-fr.train.txt')
print('The length of the English to French training dictionary is', len(en_fr_train))
en_fr_test = get_dict('./data/en-fr.test.txt')
print('The length of the English to French test dictionary is', len(en_fr_test))

The length of the English to French training dictionary is 5000
The length of the English to French test dictionary is 1500


In [7]:
def process_tweets(tweet, stem=True):
    """
    Clean and tokenize a tweet.

    Input:
        - tweet: string with the raw tweet text.
        - stem: when true (default) the cleaned tokens are Porter-stemmed,
          otherwise they are returned unstemmed.
    Output:
        - list of tokens produced by
          TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
          after removing a leading "RT", hyperlinks and '#' characters, with
          English stopwords and punctuation tokens filtered out.
    """
    tweet = re.sub(r'^RT[\s]+', '', tweet)  # leading old-style retweet marker
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)  # hyperlinks
    tweet = re.sub(r'#', '', tweet)  # keep the hashtag word, drop only the '#'

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # A set gives constant-time membership tests instead of scanning the whole
    # stopword list once per token.
    stopwords_eng = set(stopwords.words('english'))

    # `word not in string.punctuation` is a substring test against the punctuation
    # string, so it also drops multi-character tokens that are a contiguous run
    # of that string (e.g. "=>"). This is kept as in the original filter.
    tweet_clean = [
        word for word in tweet_tokens
        if word not in stopwords_eng and word not in string.punctuation
    ]

    if not stem:
        return tweet_clean

    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tweet_clean]

# Searching with LSH

In [32]:
# Length of every document embedding; get_doc_embedding reads this global to size its accumulator.
vec_dims = 300

In [34]:
def get_doc_embedding(tweet, en_embeddings, process_tweet=process_tweets):
    '''
    Embed a tweet as the sum of the embeddings of its processed tokens.

    Input:
        - tweet: string
        - en_embeddings: dictionary mapping words to their embedding vectors
        - process_tweet: callable that turns the tweet string into a list of tokens
    Output:
        - doc_embedding: numpy array of length vec_dims holding the sum of the
          embeddings of all tokens; tokens missing from en_embeddings add 0
    '''
    # Start from the zero vector and accumulate in place.
    embedding_sum = np.zeros(vec_dims)
    tokens = process_tweet(tweet)
    for token in tokens:
        # Unknown tokens fall back to 0, which leaves the running sum unchanged.
        token_embedding = en_embeddings.get(token, 0)
        embedding_sum += token_embedding
    return embedding_sum

In [37]:
# Sanity check: embed one hand-written sentence and look at its first ten components.
example_tweet = 'Document embedding is created by summing up the embeddings of all words in the document.'
example_embedding = get_doc_embedding(example_tweet, en_embeddings_subset)

example_embedding[:10]

array([ 0.83251953,  0.13134766,  0.50927734,  0.10461426,  0.10864258,
        0.70581055,  0.13476562,  0.08300781,  0.77539062, -0.16485596])

In [39]:
# Last five components of the same example embedding.
example_embedding[-5:]

array([-0.05480957, -0.0168457 , -0.62866211, -0.34863281, -0.14282227])

In [42]:
def get_doc_vecs(all_docs, en_embeddings, get_doc_embedding=get_doc_embedding):
    '''
    Embed every document and return the embeddings both as a matrix and as a lookup table.

    Input:
        - all_docs: list of strings - all tweets in dataset.
        - en_embeddings: dictionary with words as the keys and their embeddings as the values.
        - get_doc_embedding: function that embeds a single document.
    Output:
        - matrix_doc_vec: matrix with one row per document, in the order of all_docs.
        - idxdoc_dict: dictionary mapping the position of each document in all_docs
          to that document's embedding.
    '''
    # Embed the documents one by one, in input order.
    embeddings = [
        get_doc_embedding(document, en_embeddings, process_tweet=process_tweets)
        for document in all_docs
    ]

    # Position in all_docs -> embedding of that document.
    idxdoc_dict = dict(enumerate(embeddings))

    # Stack the individual embeddings as the rows of one matrix.
    matrix_doc_vec = np.vstack(embeddings)

    return matrix_doc_vec, idxdoc_dict

In [43]:
# Embed every tweet: doc_vecs has one row per tweet, idx_tweet_dict maps tweet index -> embedding.
doc_vecs, idx_tweet_dict = get_doc_vecs(all_tweets, en_embeddings_subset)

In [45]:
# Both structures should account for every tweet in all_tweets.
print(f'Length of dictionary: {len(idx_tweet_dict)}')
print(f'Shape of vector documents: {doc_vecs.shape}')

Length of dictionary: 10000
Shape of vector documents: (10000, 300)


#### Choosing the number of planes

* Each plane divides the space to $2$ parts.
* So $n$ planes divide the space into $2^{n}$ hash buckets.
* We want to organize 10,000 document vectors into buckets so that every bucket holds about $16$ vectors.
* For that we need $\frac{10000}{16}=625$ buckets.
* We're interested in $n$, number of planes, so that $2^{n}= 625$. Now, we can calculate $n=\log_{2}625 = 9.29 \approx 10$.

<a name="3-4"></a>
### Getting the Hash Number for a Vector

For each vector, we need to get a unique number associated to that vector in order to assign it to a "hash bucket".

#### Hyperplanes in Vector Spaces
* In $3$-dimensional vector space, the hyperplane is a regular plane. In $2$ dimensional vector space, the hyperplane is a line.
* In general, a hyperplane is a subspace whose dimension is $1$ lower than that of the original vector space.
* A hyperplane is uniquely defined by its normal vector.
* Normal vector $n$ of the plane $\pi$ is the vector to which all vectors in the plane $\pi$ are orthogonal (perpendicular in $3$ dimensional case).

#### Using Hyperplanes to Split the Vector Space
We can use a hyperplane to split the vector space into $2$ parts.
* All vectors whose dot product with a plane's normal vector is positive are on one side of the plane.
* All vectors whose dot product with the plane's normal vector is negative are on the other side of the plane.

#### Encoding Hash Buckets
* For a vector, we can take its dot product with all the planes, then encode this information to assign the vector to a single hash bucket.
* When the vector points to the opposite side of the hyperplane from the normal vector (negative dot product), encode it by 0.
* Otherwise, if the vector is on the same side as the normal vector, encode it by 1.
* If you calculate the dot product with each plane in the same order for every vector, you've encoded each vector's unique hash ID as a binary number, like [0, 1, 1, ... 0].

<a name="ex-9"></a>
### Function `hash_value_vector`

`list_planes` is a list of `N_UNIVERSES` matrices, each of which describes its own hash table. Each matrix has `N_DIMS` rows and `N_PLANES` columns. Every column of a matrix is the `N_DIMS`-dimensional normal vector of one of the `N_PLANES` hyperplanes used to create the buckets of that particular hash table.


* First multiply your vector `v`, with a corresponding plane. This will give you a vector of dimension $(1,\text{N_planes})$.
* You will then convert every element in that vector to 0 or 1.
* You create a hash vector by doing the following: if the element is negative, it becomes a 0, otherwise you change it to a 1.
* You then compute the unique number for the vector by iterating over `N_PLANES`
* Then you multiply $2^i$ times the corresponding bit (0 or 1).
* You will then store that sum in the variable `hash_value`.


$$ hash = \sum_{i=0}^{N-1} \left( 2^{i} \times h_{i} \right) $$

In [50]:
# LSH configuration.
n_planes = 10     # hyperplanes per universe, i.e. 2**10 buckets per hash table
n_dims = vec_dims # 300
n_universes = 25  # Number of times to repeat the hashing to improve the search.

In [51]:
# Fix the seed so the same random hyperplanes are drawn on every run.
np.random.seed(42)

# One (n_dims, n_planes) matrix of normally distributed values per universe.
list_planes = [np.random.normal(size=(n_dims, n_planes)) for uni in range(n_universes)]

In [59]:
# Number of universes that were generated ...
print(len(list_planes))

# ... and the shape of the plane matrix of the first universe.
list_planes[0].shape

25


(300, 10)

In [56]:
def hash_value_vector(v, planes):
    """Compute the hash bucket number of a vector for one set of hyperplanes.

    Input:
        - v: vector of a tweet, of shape (N_DIMS,) or (1, N_DIMS)
        - planes: matrix of dimension (N_DIMS, N_PLANES) - the set of planes that
                  divide up the region; each column is the normal vector of one plane
    Output:
        - res: a Python int in [0, 2**N_PLANES) used as the hash of the vector.
               Bit i is 1 when the dot product of v with plane i is >= 0, else 0.
    Raises:
        - ValueError: if v does not describe exactly one vector.
    """
    n_planes = planes.shape[1]

    # One signed projection per plane. Flatten with reshape(-1) rather than
    # np.squeeze: squeeze turns the single-plane result into a 0-d array, which
    # cannot be iterated or indexed.
    projections = np.asarray(np.dot(v, planes)).reshape(-1)
    if projections.size != n_planes:
        raise ValueError(
            "expected a single vector yielding {} projections, got {}".format(
                n_planes, projections.size
            )
        )

    # Comparing the projection itself with 0 is equivalent to comparing its sign.
    # Summing Python ints keeps the result exact for any number of planes.
    hash_value = sum(1 << i for i, side in enumerate(projections) if side >= 0)

    return hash_value

In [58]:
# Re-seed so the random example vector below is the same on every run.
np.random.seed(42)

# Hash a random (1, 300) vector with the planes of the first universe.
exp_vec = np.random.rand(1, 300)
exp_plane = list_planes[0]

exp_hash_val = hash_value_vector(exp_vec, exp_plane)

print(f'The hash value of this vector and the set of planes ast index 0 is {exp_hash_val}')

The hash value of this vector and the set of planes ast index 0 is 951


In [64]:

# Hash the example sentence embedding with the planes of the second universe.
# Note that this rebinds exp_vec / exp_plane, which later cells read.
exp_vec = example_embedding
exp_plane = list_planes[1]

exp_hash_val = hash_value_vector(exp_vec, exp_plane)

# NOTE(review): the message below still says "index 0" although list_planes[1] is used above.
print(f'The hash value of this vector and the set of planes ast index 0 is {exp_hash_val}')

The hash value of this vector and the set of planes ast index 0 is 691


<a name="3-5"></a>
### Creating a Hash Table

<a name="ex-10"></a>
### Function `make_hash_table`

Given that you have a unique number for each vector (or tweet), you now want to create a hash table. You need a hash table so that, given a hash_id, you can quickly look up the corresponding vectors. This allows you to reduce your search time significantly.


The `make_hash_table` function maps the tweet vectors to buckets and stores each vector in its bucket. It returns the `hash_table` and the `id_table`. The `id_table` lets you know which vector in a certain bucket corresponds to which tweet.

In [65]:
def make_hash_table(vecs, planes, hash_value_vector=hash_value_vector):
    """
    Distribute vectors over the hash buckets defined by one set of planes.

    Input:
        - vecs: list of vectors to be hashed.
        - planes: the matrix of planes in a single "universe", with shape (embedding dimensions, number of planes).
        - hash_value_vector: function returning the bucket number of a vector for a set of planes.
    Output:
        - hash_table: dictionary - keys are hashes, values are lists of vectors (hash buckets)
        - id_table: dictionary - keys are hashes, values are lists of the positions in vecs
                    of the vectors stored under the same key of hash_table
    """
    bucket_count = 2 ** planes.shape[1]

    # Every possible hash gets its own, initially empty, bucket in both tables.
    hash_table = {}
    id_table = {}
    for bucket in range(bucket_count):
        hash_table[bucket] = []
        id_table[bucket] = []

    for vec_id, vec in enumerate(vecs):
        bucket = hash_value_vector(vec, planes)
        # Both tables are filled in lockstep, so the n-th id of a bucket
        # belongs to the n-th vector of the same bucket.
        hash_table[bucket].append(vec)
        id_table[bucket].append(vec_id)

    return hash_table, id_table

In [67]:
# Build the tables for a single universe, using whatever plane matrix exp_plane currently refers to.
exp_hash_table, exp_id_table = make_hash_table(doc_vecs, exp_plane, hash_value_vector=hash_value_vector)

# Bucket 0 of both tables must contain the same number of entries.
print('The hash table at key 0 has {} document vectors'.format(len(exp_hash_table[0])))
print('The id table at key 0 has {} document indices'.format(len(exp_id_table[0])))
print('The first 5 document indices stored at key 0 of id table: {}'.format(exp_id_table[0][:5]))

The hash table at key 0 has 6 document vectors
The id table at key 0 has 6 document indices
The first 5 document indices stored at key 0 of id table: [945, 1848, 3421, 6462, 9513]


### Creating all Hash Tables

In [81]:
def create_hash_id_tables(universes, planes_per_universe=None, vecs=None):
    """
    Build the hash table and id table of every universe.

    Input:
        - universes: number of universes to build tables for.
        - planes_per_universe: list of plane matrices, one per universe.
          Defaults to the notebook-level `list_planes`.
        - vecs: document vectors to hash. Defaults to the notebook-level `doc_vecs`.
    Output:
        - hash_tables: list with the hash table of each universe, in universe order.
        - id_tables: list with the id table of each universe, in universe order.
    """
    # Fall back to the notebook globals so existing calls keep working unchanged,
    # while allowing the inputs to be passed explicitly.
    if planes_per_universe is None:
        planes_per_universe = list_planes
    if vecs is None:
        vecs = doc_vecs

    hash_tables = []
    id_tables = []
    for id_universe in range(universes):
        print('Working on universe: {}'.format(id_universe))
        planes = planes_per_universe[id_universe]
        hash_table, id_table = make_hash_table(vecs, planes)

        hash_tables.append(hash_table)
        id_tables.append(id_table)
    return hash_tables, id_tables

In [82]:
# Build the hash and id tables for all n_universes sets of planes.
hash_tables, id_tables = create_hash_id_tables(n_universes)

Working on universe: 0
Working on universe: 1
Working on universe: 2
Working on universe: 3
Working on universe: 4
Working on universe: 5
Working on universe: 6
Working on universe: 7
Working on universe: 8
Working on universe: 9
Working on universe: 10
Working on universe: 11
Working on universe: 12
Working on universe: 13
Working on universe: 14
Working on universe: 15
Working on universe: 16
Working on universe: 17
Working on universe: 18
Working on universe: 19
Working on universe: 20
Working on universe: 21
Working on universe: 22
Working on universe: 23
Working on universe: 24


In [83]:
def cosine_similarity(A, B):
    '''
    Cosine similarity between B and a vector A, or between B and every row of a matrix A.

    Input:
        A: a numpy array which corresponds to a word vector, or a matrix whose rows are word vectors
        B: a numpy array which corresponds to a word vector
    Output:
        cos: the cosine similarity between A and B when A is a vector (0.0 if
             either vector has zero norm); when A is a matrix, an array with the
             cosine similarity between each row of A and B.
    '''
    dot = np.dot(A, B)
    normb = np.linalg.norm(B)

    if len(A.shape) == 1:  # A is a single vector
        denominator = np.linalg.norm(A) * normb
        if denominator == 0:
            # A zero vector has no direction. Return 0 instead of dividing by zero:
            # the NaN that division produces sorts as the largest value in
            # np.argsort and would be ranked as the best match by a descending sort.
            # 0 is also what the matrix branch below yields for zero rows.
            return 0.0
        return dot / denominator

    # A is a matrix: use the norm of each row
    norma = np.linalg.norm(A, axis=1)
    epsilon = 1.0e-9  # to avoid division by 0
    return dot / (norma * normb + epsilon)

In [84]:
def nearest_neighbor(v, candidates, k=1, cosine_similarity=cosine_similarity):
    """
    Rank candidate vectors by their cosine similarity to a query vector.

    Input:
      - v: the vector to find the nearest neighbors of
      - candidates: a set of vectors among which the neighbors are searched
      - k: top k nearest neighbors to find
      - cosine_similarity: function scoring the similarity of two vectors
    Output:
      - k_idx: positions (within candidates) of the k most similar vectors,
               ordered from most to least similar
    """
    # Score every candidate against the query vector.
    scores = [cosine_similarity(v, candidate) for candidate in candidates]

    # argsort orders ascending; flip it so the most similar candidates come first.
    ranked_positions = np.argsort(scores)[::-1]

    return ranked_positions[:k]

## Approximate K-NN

<a name="ex-11"></a>
## Function `approximate_knn`

Implement approximate K nearest neighbors using locality sensitive hashing,
to search for documents that are similar to a given document at the
index `doc_id`.

##### Inputs
* `doc_id` is the index into the document list `all_tweets`.
* `v` is the document vector for the tweet in `all_tweets` at index `doc_id`.
* `list_planes` is the list of planes (the global variable created earlier).
* `k` is the number of nearest neighbors to search for.
* `num_universes_to_use`: to save time, we can use fewer than the total
number of available universes.  By default, it's set to `N_UNIVERSES`,
which is $25$ for this notebook.
* `hash_tables`: list with hash tables for each universe.
* `id_tables`: list with id tables for each universe.

The `approximate_knn` function finds a subset of candidate vectors that
are in the same "hash bucket" as the input vector 'v'.  Then it performs
the usual k-nearest neighbors search on this subset (instead of searching
through all 10,000 tweets).

In [91]:
def approximate_knn(doc_id, v, list_planes, hash_tables, id_tables, k=1, num_universes_to_use=25, hash_value_vector=hash_value_vector):
    """
    Find the k nearest neighbors of `v` among the documents that share a hash bucket with it.

    Input:
        - doc_id: id of the query document; it is never returned as its own neighbor
        - v: document vector of the query
        - list_planes: list of plane matrices, one per universe
        - hash_tables: list with the hash table of each universe
        - id_tables: list with the id table of each universe
        - k: number of nearest neighbors to search for
        - num_universes_to_use: number of universes (taken from the start of the lists) to look in
        - hash_value_vector: function returning the hash of a vector for a set of planes
    Output:
        - nearest_neighbor_ids: document ids of the nearest candidates, as ranked by nearest_neighbor
    """
    candidate_vecs = []
    candidate_ids = []
    seen_ids = set()  # fast duplicate check across universes

    for universe in range(num_universes_to_use):
        # Bucket the query falls into in this universe.
        bucket = hash_value_vector(v, list_planes[universe])
        bucket_vecs = hash_tables[universe][bucket]
        bucket_ids = id_tables[universe][bucket]

        for position, candidate_id in enumerate(bucket_ids):
            # Skip the query document itself and anything already collected.
            if candidate_id == doc_id or candidate_id in seen_ids:
                continue
            candidate_vecs.append(bucket_vecs[position])
            candidate_ids.append(candidate_id)
            seen_ids.add(candidate_id)

    print('Fast considering {} vectors'.format(len(candidate_vecs)))

    # Exact search restricted to the collected candidates.
    top_positions = nearest_neighbor(v, np.array(candidate_vecs), k=k)

    # Translate positions within the candidate list back to document ids.
    return [candidate_ids[position] for position in top_positions]


In [103]:
# Pick the query document: its id, its raw text and its embedding row.
doc_id = 1
doc_to_search = all_tweets[doc_id]
vec_to_search = doc_vecs[doc_id]

In [104]:
# Search for the 3 nearest neighbors, looking only in the first 5 of the available universes.
nearest_neighbor_ids = approximate_knn(doc_id, vec_to_search, list_planes, 
                                       hash_tables, id_tables, k=3,
                                       num_universes_to_use=5)

Fast considering 266 vectors


In [105]:
# Show the query tweet followed by the text of each neighbor that was found.
print('** Nearest neighbors for document: {}'.format(doc_id))
print('\n** Document contents: \n{}'.format(doc_to_search))

for neighbor_id in nearest_neighbor_ids:
    # neighbor_id is an index into all_tweets
    print('\n** Nearest neighbors at document id: {}'.format(neighbor_id))
    print('** Document content: {}'.format(all_tweets[neighbor_id]))

** Nearest neighbors for document: 1

** Document contents: 
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!

** Nearest neighbors at document id: 1087
** Document content: @bwoyblunder @rajudasonline Sorted :). Thanks. Daaru party in my chaddi, bros.

** Nearest neighbors at document id: 1857
** Document content: @jamestheeight Hey James, thanks for the tweet. Not currently, no :). Let us know if we can help with anything else. -AL

** Nearest neighbors at document id: 4366
** Document content: @Sukihaikal hahaha okay thank you :)
