In [1]:
# Imports
# Basics
from __future__ import print_function, division
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics.pairwise as smp

from sklearn.decomposition import NMF

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [90]:
from features import wiki_search

In [15]:
df = pd.read_csv('destinations.csv')

df.head()

df = df.dropna()

In [17]:
X_train[137]

"The Fortress of Louisbourg (French: Forteresse de Louisbourg) is a National Historic Site of Canada and the location of a one-quarter partial reconstruction of an 18th-century French fortress at Louisbourg on Cape Breton Island, Nova Scotia. Its two sieges, especially that of 1758, were turning points in the Anglo-French struggle for what today is Canada. The original settlement was made in 1713, and initially called Havre  l'Anglois. Subsequently, the fishing port grew to become a major commercial port and a strongly defended fortress. The fortifications eventually surrounded the town. The walls were constructed mainly between 1720 and 1740. By the mid-1740s Louisbourg was one of the most extensive (and expensive) European fortifications constructed in North America. It was supported by two smaller garrisons on le Royale located at present-day St. Peter's and Englishtown. The Fortress of Louisbourg suffered key weaknesses, since it was erected on low-lying ground commanded by nearby 

In [16]:
X_train = df['details'].tolist()

In [27]:
X_label = df['location'].tolist()

In [29]:
len(X_label)

986

In [18]:
tfidf = TfidfVectorizer(stop_words="english", 
                        token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b", 
                        min_df=1, encoding = 'utf-8')

In [19]:
tfidf_vecs = tfidf.fit_transform(X_train)

In [20]:
tfidf_vecs

<986x17458 sparse matrix of type '<type 'numpy.float64'>'
	with 70013 stored elements in Compressed Sparse Row format>

In [21]:
len(tfidf.get_feature_names())

17458

In [22]:
tfidf_corpus = matutils.Sparse2Corpus(tfidf_vecs.transpose())

# Row indices
id2word = dict((v, k) for k, v in tfidf.vocabulary_.items())

# This is a hack for Python 3!
id2word = corpora.Dictionary.from_corpus(tfidf_corpus, 
                                         id2word=id2word)

In [50]:
# Build an LSI space from the input TFIDF matrix, mapping of row id to word, and num_topics
# num_topics is the number of dimensions to reduce to after the SVD
# Analagous to "fit" in sklearn, it primes an LSI space
lsi = models.LsiModel(tfidf_corpus, id2word=id2word, num_topics=15)

# Retrieve vectors for the original tfidf corpus in the LSI space ("transform" in sklearn)
lsi_corpus = lsi[tfidf_corpus]

# Dump the resulting document vectors into a list so we can take a look
doc_vecs = [doc for doc in lsi_corpus]


# Create an index transformer that calculates similarity based on our space
index = similarities.MatrixSimilarity(doc_vecs, 
                                      num_features=len(id2word))

# Return the sorted list of cosine similarities to the first document
sims = sorted(enumerate(index[doc_vecs[26]]), key=lambda item: -item[1])
sims_topics = [X_label[sim[0]] for sim in sims]
zip(sims, sims_topics)

[((26, 1.0), 'Monument Valley'),
 ((90, 0.96362275), 'Meuse Valley'),
 ((853, 0.95280659), 'Matterhorn'),
 ((135, 0.94393647), 'Canadian Rockies'),
 ((714, 0.92775166), 'Nazca Lines'),
 ((665, 0.91447735), 'Popa Taungkalat'),
 ((851, 0.9117772), 'Jungfrau Cog Railway'),
 ((946, 0.90999526), 'Merida to Pico Bolivar Cable Car'),
 ((769, 0.90653133), 'Hejaz Railroad Ride'),
 ((27, 0.90316486), 'Mt Rushmore'),
 ((765, 0.90304351), 'Valley of the Geysers'),
 ((124, 0.90299708), 'Rila Monastery'),
 ((715, 0.90125668), 'Banaue Rice Terraces'),
 ((801, 0.89400685), 'Kyongju Tombs & Museum'),
 ((61, 0.89206457), 'Great Ocean Road'),
 ((668, 0.89076126), 'Fish River Canyon'),
 ((312, 0.8842715), 'Dordogne Noir'),
 ((120, 0.8827827), 'Sugar Loaf Mountain'),
 ((86, 0.88173378), 'Ardennes'),
 ((913, 0.87726116), 'Mt Ararat'),
 ((497, 0.87483346), 'Masada'),
 ((984, 0.87302393), 'Rift Valley'),
 ((791, 0.87209046), 'Drakensberg Mountains'),
 ((462, 0.87021947), 'Varanasi Riverside Ghats'),
 ((176, 0

In [52]:
# Convert the gensim-style corpus vecs to a numpy array for sklearn manipulations
ng_lsi = matutils.corpus2dense(lsi_corpus, num_terms=300).transpose()
ng_lsi.shape

(986, 300)

In [55]:
# Create KMeans
kmeans = KMeans(n_clusters=15)

# Cluster
ng_lsi_clusters = kmeans.fit_predict(ng_lsi)

# Take a look
zip(ng_lsi_clusters, X_label)

[(2, 'Jam Minaret'),
 (1, 'Kabul Old City'),
 (12, 'Khyber Pass\xc2\xa0\xc2\xa0[with Pakistan]'),
 (5, "Al Qal'a of Beni Hammad"),
 (2, 'Algiers Kasbah'),
 (11, 'Djemila'),
 (2, "M'zab Valley"),
 (2, 'Alaska Cruise'),
 (9, 'Alcatraz Island'),
 (2, 'American Museum of Natural Hist'),
 (2, 'Bayous'),
 (0, 'Bryce Canyon'),
 (0, 'Carlsbad Caverns'),
 (0, 'Denali National Park'),
 (12, 'Devils Tower'),
 (12, 'Grand Canyon'),
 (12, 'Grand Tetons'),
 (2, 'Guggenheim Museum'),
 (0, 'Hawaii Volcanoes National Park'),
 (2, 'Kennedy Space Center'),
 (2, 'Las Vegas Strip at Night'),
 (0, 'Mammoth Cave'),
 (2, 'Mesa Verde'),
 (2, 'Meteor Crater'),
 (6, 'Metropolitan Museum of Art'),
 (2, 'Monterey Aquarium'),
 (12, 'Monument Valley'),
 (2, 'Mt Rushmore'),
 (6, 'Museum of Modern Art'),
 (0, 'Na Pali Coast'),
 (6, 'National Gallery of Arts'),
 (1, 'New York Skyline View'),
 (12, 'Niagara Falls\xc2\xa0\xc2\xa0[with Canada]'),
 (0, 'Redwoods National Park'),
 (2, 'San Diego Zoo'),
 (1, 'San Francisco B

In [56]:
df2 = df

In [57]:
df2.head()

Unnamed: 0.1,Unnamed: 0,details,location
0,0,The Minaret of Jam is a UNESCO World Heritage ...,Jam Minaret
1,1,"Kabul (/kbl/; Pashto: , Persian: , pronounced ...",Kabul Old City
2,2,"The Khyber Pass (Pashto: , Urdu: ) (elevati...",Khyber Pass [with Pakistan]
3,3,"Beni Hammad Fort, also called Al Qal'a of Beni...",Al Qal'a of Beni Hammad
4,4,"The Casbah (Arabic: , qaba, meaning citadel (f...",Algiers Kasbah


In [58]:
df2['verbosity'] = df2['details'].apply(lambda x:len(x))

In [59]:
df2.head()

Unnamed: 0.1,Unnamed: 0,details,location,verbosity
0,0,The Minaret of Jam is a UNESCO World Heritage ...,Jam Minaret,690
1,1,"Kabul (/kbl/; Pashto: , Persian: , pronounced ...",Kabul Old City,1405
2,2,"The Khyber Pass (Pashto: , Urdu: ) (elevati...",Khyber Pass [with Pakistan],600
3,3,"Beni Hammad Fort, also called Al Qal'a of Beni...",Al Qal'a of Beni Hammad,1255
4,4,"The Casbah (Arabic: , qaba, meaning citadel (f...",Algiers Kasbah,408


In [62]:
df3 = df2.sort_values('verbosity', ascending=False)

In [69]:
df3[df3['location'] == 'Tokyo Fish Market']

Unnamed: 0.1,Unnamed: 0,details,location,verbosity
573,573,"The Tsukiji Market (, Tsukiji shij), supervise...",Tokyo Fish Market,657


In [70]:
import gensim

In [71]:
model = gensim.models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) 

In [87]:
from scipy import spatial

In [85]:
def avg_feature_vector(words, model, num_features):
        #function to average all words vectors in a given paragraph
        featureVec = np.zeros((num_features,), dtype="float32")
        nwords = 0

        #list containing names of words in the vocabulary
        #index2word_set = set(model.index2word) this is moved as input param for performance reasons
        for word in words:
            try:
                nwords = nwords+1
                featureVec = np.add(featureVec, model[word])
            except:
                pass

        if(nwords>0):
            featureVec = np.divide(featureVec, nwords)
        return featureVec

In [124]:
vecs = np.zeros([df.shape[0], 300])
vecs_sr = df['details'].apply(lambda x: avg_feature_vector(x.split(), model=model, num_features=300))

In [184]:
vect = []
for vec in vecs_sr:
    vect.extend(vec)
vecs = np.array(vect).reshape([df.shape[0], 300])

In [189]:
# Create KMeans
kmeans = KMeans(n_clusters=15)

# Cluster
vecs_clusters = kmeans.fit_predict(vecs)

# Take a look
zip(vecs_clusters, X_label)

[(7, 'Jam Minaret'),
 (12, 'Kabul Old City'),
 (14, 'Khyber Pass\xc2\xa0\xc2\xa0[with Pakistan]'),
 (7, "Al Qal'a of Beni Hammad"),
 (7, 'Algiers Kasbah'),
 (7, 'Djemila'),
 (9, "M'zab Valley"),
 (12, 'Alaska Cruise'),
 (12, 'Alcatraz Island'),
 (5, 'American Museum of Natural Hist'),
 (10, 'Bayous'),
 (6, 'Bryce Canyon'),
 (6, 'Carlsbad Caverns'),
 (6, 'Denali National Park'),
 (14, 'Devils Tower'),
 (14, 'Grand Canyon'),
 (14, 'Grand Tetons'),
 (5, 'Guggenheim Museum'),
 (6, 'Hawaii Volcanoes National Park'),
 (12, 'Kennedy Space Center'),
 (8, 'Las Vegas Strip at Night'),
 (14, 'Mammoth Cave'),
 (14, 'Mesa Verde'),
 (14, 'Meteor Crater'),
 (5, 'Metropolitan Museum of Art'),
 (10, 'Monterey Aquarium'),
 (14, 'Monument Valley'),
 (12, 'Mt Rushmore'),
 (5, 'Museum of Modern Art'),
 (6, 'Na Pali Coast'),
 (5, 'National Gallery of Arts'),
 (12, 'New York Skyline View'),
 (12, 'Niagara Falls\xc2\xa0\xc2\xa0[with Canada]'),
 (6, 'Redwoods National Park'),
 (6, 'San Diego Zoo'),
 (8, 'San F

In [186]:
df[vecs_clusters==0]

Unnamed: 0.1,Unnamed: 0,details,location,verbosity
263,263,The great Mosque of Muhammad Ali Pasha or Alab...,Mohammed Ali Mosque,519
424,424,Chand Baori is a stepwell situated in the vill...,Chand Baori,107
432,432,Sri Harmandir Sahib (The abode of God) (Punjab...,Golden Temple,1939
436,436,Humayun's tomb (Persian: Maqbara e Humayun T...,Humayun's Tomb,3003
439,439,"Jama Masjid (also spelled Jame Mosque, Jami Ma...",Jama Masjid,391
694,694,The Sultan Qaboos Grand Mosque is the main Mos...,Sultan Qaboos Grand Mosque,112
695,695,"The Badshahi Mosque (Punjabi, Urdu: , or Impe...",Badshahi Mosque,892
696,696,"The Tomb of Jahangir (Urdu: , Western Punjabi...",Emperor Jehangir's Tomb,206
702,702,"Moti Masjid (Punjabi, Urdu: ), one of the ""Pe...",Lahore Fort,415
703,703,"Rohtas Fort (Punjabi, Urdu: Qila Rohtas) is ...",Rohtas Fort,838


In [187]:
df[vecs_clusters==1]

Unnamed: 0.1,Unnamed: 0,details,location,verbosity
1,1,"Kabul (/kbl/; Pashto: , Persian: , pronounced ...",Kabul Old City,1405
2,2,"The Khyber Pass (Pashto: , Urdu: ) (elevati...",Khyber Pass [with Pakistan],600
20,20,The Las Vegas Strip is a stretch of Las Vegas ...,Las Vegas Strip at Night,1062
57,57,"Ashtarak (Armenian: ), is a town in the Aragat...",Ashtarak Ancient Fortress,729
76,76,Salzburg (German pronunciation: [zaltsbk]; Bav...,Salzburg Old Town & Castle,735
87,87,Bruges (/bru/; Dutch: Brugge [br]; French: Bru...,Bruges,1137
88,88,Ghent (/nt/; Dutch: Gent pronounced [nt]; Fren...,Ghent,1330
110,110,Braslia (Portuguese pronunciation: [bazilj]) i...,Brasilia,2467
167,167,"Kaifeng (Chinese: ), known previously by sever...",Kaifeng Historical Sites,542
190,190,The Silk Road or Silk Route was an ancient net...,Silk Road,1643


In [188]:
#get average vector for sentence 1
sentence_1 = 'Osaka'
sentence_1_avg_vector = avg_feature_vector(sentence_1.split(), model=model, num_features=300)

#get average vector for sentence 2
sentence_2 = 'Football'
sentence_2_avg_vector = avg_feature_vector(sentence_2.split(), model=model, num_features=300)

sen1_sen2_similarity =  1 - spatial.distance.cosine(sentence_1_avg_vector,sentence_2_avg_vector)
sen1_sen2_similarity

-0.011567999464298317

In [119]:
#get average vector for sentence 1
sentence_1 = 'London'
sentence_1_avg_vector = avg_feature_vector(sentence_1.split(), model=model, num_features=300)

#get average vector for sentence 2
sentence_2 = 'Football'
sentence_2_avg_vector = avg_feature_vector(sentence_2.split(), model=model, num_features=300)

sen1_sen2_similarity =  1 - spatial.distance.cosine(sentence_1_avg_vector,sentence_2_avg_vector)
sen1_sen2_similarity

0.034144981078519265

In [118]:
#get average vector for sentence 1
sentence_1 = 'Ann Arbor'
sentence_1_avg_vector = avg_feature_vector(sentence_1.split(), model=model, num_features=300)

#get average vector for sentence 2
sentence_2 = 'Football'
sentence_2_avg_vector = avg_feature_vector(sentence_2.split(), model=model, num_features=300)

sen1_sen2_similarity =  1 - spatial.distance.cosine(sentence_1_avg_vector,sentence_2_avg_vector)
sen1_sen2_similarity

0.045038382349974149

In [121]:
#get average vector for sentence 1
sentence_1 = 'San Francisco'
sentence_1_avg_vector = avg_feature_vector(sentence_1.split(), model=model, num_features=300)

#get average vector for sentence 2
sentence_2 = 'Football'
sentence_2_avg_vector = avg_feature_vector(sentence_2.split(), model=model, num_features=300)

sen1_sen2_similarity =  1 - spatial.distance.cosine(sentence_1_avg_vector,sentence_2_avg_vector)
sen1_sen2_similarity

-0.0081271449437698706