## Data analysis
### For each city, find the most likely city to be also searched for within the same session

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from gensim.similarities.index import AnnoyIndexer
from gensim.models import Word2Vec

### Load dataframe

In [2]:
url = 'city_search.json'
df = pd.read_json(url, orient='columns')

In [3]:
items = df['cities'].tolist()
print(df['cities'].tolist()[:12])

[['New York NY, Newark NJ'], ['New York NY, Jersey City NJ, Philadelphia PA'], ['San Antonio TX'], ['Edmonton AB'], ['Phoenix AZ, Houston TX'], ['San Diego CA'], ['Montreal QC, Chicago IL'], ['Calgary AB, New York NY'], ['Chicago IL, New York NY'], ['New York NY'], ['Los Angeles CA'], ['Phoenix AZ']]


In [4]:
len(df['cities'].tolist())

20022


# Create a city recommender

In [5]:
# we'll use only cities column

cities = df['cities'].tolist()

# Create index using gensim.similarities.index 
# Fast Approximate Nearest Neighbor Similarity with Annoy package
# gensim.similarities.index.AnnoyIndexer(model=None, num_trees=None)
# Allows to use Annoy as indexer for most_similar method from Word2Vec, Doc2Vec, FastText and Word2VecKeyedVectors classes.

# Parameters:	
# model (BaseWordEmbeddingsModel, optional) – Model, that will be used as source for index.
# num_trees (int, optional) – Number of trees for Annoy indexer.


model = Word2Vec(cities, min_count=1, seed=1)
indexer = AnnoyIndexer(model, 2)

## Find  similar cities

In [6]:
 # from model.wv.most_similar docstring:
'''
    Find the top-N most similar words.
    Positive words contribute positively towards the similarity, negative words negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given words and the vectors for each word in the model.
    The method corresponds to the `word-analogy` and `distance` scripts in the original
    word2vec implementation.

    Parameters
    ----------
    positive : list of str, optional
        List of words that contribute positively.
    negative : list of str, optional
        List of words that contribute negatively.
    topn : int, optional
        Number of top-N similar words to return.
    restrict_vocab : int, optional
        Optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

    Returns
    -------
    list of (str, float)
    Sequence of (word, similarity).
'''

print("Simiar to: 'Montreal QC':\n", model.wv.most_similar('Montreal QC', topn=4, indexer=indexer))

Simiar to: 'Montreal QC':
 [('Montreal QC', 1.0), ('Los Angeles CA, Riverside CA, Long Beach CA, San Diego CA, Anaheim CA', 0.39490485191345215), ('Montreal QC, Toronto ON, Oshawa ON', 0.370272159576416), ('Indianapolis IN', 0.36812204122543335)]


In [7]:
print("Simiar to: 'New York NY':\n", model.wv.most_similar('New York NY', topn=4, indexer=indexer))


Simiar to: 'New York NY':
 [('New York NY', 1.0), ('Dallas TX, Plano TX, Austin TX', 0.37066179513931274), ('Los Angeles CA, Anaheim CA, Riverside CA, Santa Ana CA, Long Beach CA', 0.3703210949897766), ('San Diego CA, Anaheim CA, Santa Ana CA, Long Beach CA', 0.353828489780426)]


In [8]:
print("Simiar to: ['Chicago IL', 'New York NY']:\n", model.wv.most_similar(['Chicago IL', 'New York NY'], topn=4, indexer=indexer))


Simiar to: ['Chicago IL', 'New York NY']:
 [('Chicago IL', 0.6075745224952698), ('New York NY', 0.607574462890625), ('Houston TX, Dallas TX, Oklahoma City OK', 0.3799567222595215), ('Chicago IL, Madison WI', 0.37699031829833984)]


In [9]:
print("Simiar to: 'Edmonton AB':\n", model.wv.most_similar('Edmonton AB', topn=4, indexer=indexer))

Simiar to: 'Edmonton AB':
 [('Edmonton AB', 1.0), ('Vancouver BC, Seattle WA, Calgary AB', 0.3744924068450928), ('San Jose CA, Oakland CA', 0.36664819717407227), ('Vancouver BC, Seattle WA, Portland OR, Victoria BC', 0.3646078109741211)]


In [10]:
print("Simiar to: 'Jacksonville FL':\n", model.wv.most_similar('Jacksonville FL', topn=4, indexer=indexer))

Simiar to: 'Jacksonville FL':
 [('Jacksonville FL', 1.0), ('Philadelphia PA, Houston TX, Toronto ON', 0.4091346859931946), ('San Diego CA, Chicago IL, Montreal QC', 0.37535959482192993), ('Detroit MI, Los Angeles CA', 0.3660290241241455)]


In [11]:
print("Simiar to: ['Chicago IL', 'Los Angeles CA]:\n", model.wv.most_similar(['Los Angeles CA', 'Chicago IL'], topn=4, indexer=indexer))


Simiar to: ['Chicago IL', 'Los Angeles CA]:
 [('Chicago IL', 0.6124488711357117), ('Los Angeles CA', 0.6124488711357117), ('San Diego CA, Long Beach CA, Santa Ana CA, Anaheim CA', 0.39371854066848755), ('Houston TX, Corpus Christi TX, Arlington TX, Oklahoma City OK, Tulsa OK', 0.3811976909637451)]
