## Data analysis
### For each city, find the most likely city to be also searched for within the same session

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from gensim.similarities.index import AnnoyIndexer
from gensim.models import Word2Vec

### Load dataframe

In [2]:
url = 'city_search.json'
df = pd.read_json(url, orient='columns')

In [3]:
df['cities'].tolist()

[['New York NY, Newark NJ'],
 ['New York NY, Jersey City NJ, Philadelphia PA'],
 ['San Antonio TX'],
 ['Edmonton AB'],
 ['Phoenix AZ, Houston TX'],
 ['San Diego CA'],
 ['Montreal QC, Chicago IL'],
 ['Calgary AB, New York NY'],
 ['Chicago IL, New York NY'],
 ['New York NY'],
 ['Los Angeles CA'],
 ['Phoenix AZ'],
 ['New York NY'],
 ['Toronto ON, Kitchener ON'],
 ['Chicago IL, Vancouver BC'],
 ['Indianapolis IN'],
 ['Toronto ON, Oshawa ON'],
 ['Montreal QC'],
 ['Vancouver BC, Victoria BC, Seattle WA'],
 ['Toronto ON, New York NY'],
 ['Phoenix AZ, Chandler AZ, Mesa AZ'],
 ['New York NY'],
 ['Montreal QC'],
 ['San Jose CA'],
 ['Los Angeles CA'],
 ['San Antonio TX, Corpus Christi TX, Arlington TX, Fort Worth TX'],
 ['San Antonio TX, Montreal QC'],
 ['New York NY'],
 ['Calgary AB, Seattle WA, Victoria BC, Portland OR'],
 ['Dallas TX, Arlington TX'],
 ['New York NY'],
 ['Phoenix AZ'],
 ['San Diego CA, Santa Ana CA'],
 ['Phoenix AZ, Montreal QC'],
 ['New York NY'],
 ['Vancouver BC'],
 ['San Jos


[['New York NY, Newark NJ'],
 ['New York NY, Jersey City NJ, Philadelphia PA'],
 ['San Antonio TX'],
 ['Edmonton AB'],
 ['Phoenix AZ, Houston TX'],
 ['San Diego CA'],
 ['Montreal QC, Chicago IL'],
 ['Calgary AB, New York NY'],
 ['Chicago IL, New York NY'],
 ['New York NY'],
 ['Los Angeles CA'],
 ['Phoenix AZ']]



# Create a city recommender

In [4]:
# we'll use only cities column

cities = df['cities'].tolist()

# Create index using gensim.similarities.index 
# Fast Approximate Nearest Neighbor Similarity with Annoy package
# gensim.similarities.index.AnnoyIndexer(model=None, num_trees=None)
# Allows to use Annoy as indexer for most_similar method from Word2Vec, Doc2Vec, FastText and Word2VecKeyedVectors classes.

# Parameters:	
# model (BaseWordEmbeddingsModel, optional) – Model, that will be used as source for index.
# num_trees (int, optional) – Number of trees for Annoy indexer.


model = Word2Vec(cities, min_count=1, seed=1)
indexer = AnnoyIndexer(model, 2)

## Find  similar cities

In [5]:
 # from model.wv.most_similar docstring:
'''
    Find the top-N most similar words.
    Positive words contribute positively towards the similarity, negative words negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given words and the vectors for each word in the model.
    The method corresponds to the `word-analogy` and `distance` scripts in the original
    word2vec implementation.

    Parameters
    ----------
    positive : list of str, optional
        List of words that contribute positively.
    negative : list of str, optional
        List of words that contribute negatively.
    topn : int, optional
        Number of top-N similar words to return.
    restrict_vocab : int, optional
        Optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

    Returns
    -------
    list of (str, float)
    Sequence of (word, similarity).
'''

print("Simiar to: 'Montreal QC':\n", model.wv.most_similar('Montreal QC', topn=4, indexer=indexer))

Simiar to: 'Montreal QC':
 [('Montreal QC', 1.0), ('Phoenix AZ, Tucson AZ, Mesa AZ, Scottsdale AZ', 0.4031992554664612), ('San Antonio TX, Corpus Christi TX, Austin TX, Arlington TX, Fort Worth TX', 0.38571810722351074), ('New York NY, Jersey City NJ, Baltimore MD, Newark NJ', 0.3831257224082947)]


In [6]:
print("Simiar to: 'New York NY':\n", model.wv.most_similar('New York NY', topn=4, indexer=indexer))


Simiar to: 'New York NY':
 [('New York NY', 1.0), ('Houston TX, Toronto ON, New York NY', 0.41360849142074585), ('Toronto ON, Oshawa ON, Buffalo NY, Kitchener ON, Hamilton ON', 0.37869954109191895), ('Edmonton AB, Victoria BC, Calgary AB', 0.3742130994796753)]


In [7]:
print("Simiar to: ['Chicago IL', 'New York NY']:\n", model.wv.most_similar(['Chicago IL', 'New York NY'], topn=4, indexer=indexer))


Simiar to: ['Chicago IL', 'New York NY']:
 [('New York NY, Baltimore MD, Philadelphia PA, Jersey City NJ, Newark NJ', 0.38334012031555176), ('OTTAWA ON, Toronto ON, New York NY', 0.3801229000091553), ('Chicago IL, Madison WI, Saint Paul MN, Minneapolis MN', 0.37562012672424316), ('New York NY, Chicago IL, Houston TX', 0.37263667583465576)]


In [8]:
print("Simiar to: 'Edmonton AB':\n", model.wv.most_similar('Edmonton AB', topn=4, indexer=indexer))

Simiar to: 'Edmonton AB':
 [('Edmonton AB', 1.0), ('New York NY, Baltimore MD, Jersey City NJ', 0.3887163996696472), ('New York NY, Baltimore MD, Jersey City NJ, Boston MA', 0.3694309592247009), ('Vancouver BC, Montreal QC', 0.3555435538291931)]


In [9]:
print("Simiar to: 'Jacksonville FL':\n", model.wv.most_similar('Jacksonville FL', topn=4, indexer=indexer))

Simiar to: 'Jacksonville FL':
 [('Jacksonville FL', 1.0), ('San Jose CA, Los Angeles CA, Montreal QC', 0.3842924237251282), ('Edmonton AB, Toronto ON', 0.3761439919471741), ('Houston TX, Austin TX, Arlington TX, Plano TX', 0.372295081615448)]


In [10]:
print("Simiar to: ['Chicago IL', 'Los Angeles CA]:\n", model.wv.most_similar(['Los Angeles CA', 'Chicago IL'], topn=4, indexer=indexer))


Simiar to: ['Chicago IL', 'Los Angeles CA]:
 [('Phoenix AZ, Toronto ON', 0.42807573080062866), ('New York NY, Toronto ON, Houston TX, Chicago IL', 0.39094287157058716), ('Toronto ON, Hamilton ON, Oshawa ON', 0.379960834980011), ('Toronto ON, Oshawa ON, Saint Catharines-Niagara ON', 0.3754507899284363)]
