In [2]:
import pandas as pd
import numpy as np
import sqlite3
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing as pp
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
def get_df(table_name):
    try:
        conn = sqlite3.connect('/Users/tristannisbet/Documents/travel_app/places.db')

    except Exception as e:
        print('Error durring connection: ', str(e))
    
    sql = """select * from {}""".format(table_name)
    df = pd.read_sql_query(sql, conn)

    return df

In [4]:
one = get_df('restaurants_one')
two = get_df('restaurants_two')
three = get_df('restaurants_three')
four = get_df('restaurants_four')
top_rest = get_df('restaurants')


In [5]:
all_price = pd.concat([one, two, three, four, top_rest], axis =0)
all_price.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25533 entries, 0 to 8185
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             25533 non-null  object 
 1   city                25533 non-null  object 
 2   name                25533 non-null  object 
 3   address             25533 non-null  object 
 4   price_level         22748 non-null  float64
 5   rating              25533 non-null  float64
 6   user_ratings_total  25533 non-null  int64  
 7   types               25533 non-null  object 
 8   latitude            25381 non-null  float64
 9   longitude           25381 non-null  float64
 10  place_id            25533 non-null  object 
 11  id                  25533 non-null  object 
dtypes: float64(4), int64(1), object(7)
memory usage: 2.5+ MB


In [6]:
all_price['id'] = pd.to_numeric(all_price.id)

In [7]:
all_price[all_price.city == 'Guilin']

Unnamed: 0,country,city,name,address,price_level,rating,user_ratings_total,types,latitude,longitude,place_id,id


In [8]:
all_price.price_level.isnull().any()

True

In [9]:
all_price.isnull().any()

country               False
city                  False
name                  False
address               False
price_level            True
rating                False
user_ratings_total    False
types                 False
latitude               True
longitude              True
place_id              False
id                    False
dtype: bool

In [10]:
def to_city(df):
    city_df = df.groupby(['country', 'city', 'id', 'price_level'])['name'].count().to_frame()
    price_level = city_df.pivot_table(index=['country', 'city', 'id'], columns='price_level', values='name', aggfunc='first')
    price_level['avg_price'] = df.groupby(['country', 'city', 'id'])['price_level'].mean()

    
    return price_level

In [11]:
food = to_city(all_price)
food

Unnamed: 0_level_0,Unnamed: 1_level_0,price_level,1.0,2.0,3.0,4.0,avg_price
country,city,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Argentina,Buenos Aires,85,6.0,80.0,68.0,31.0,2.670270
Australia,Melbourne,73,18.0,84.0,79.0,19.0,2.495000
Australia,Sydney,55,26.0,14.0,67.0,29.0,2.727941
Austria,Vienna,37,65.0,86.0,78.0,19.0,2.205645
Belgium,Brussels,60,10.0,89.0,44.0,18.0,2.434783
...,...,...,...,...,...,...,...
Uruguay,Montevideo,129,7.0,93.0,27.0,5.0,2.227273
Vietnam,Da Nang,90,18.0,78.0,8.0,,1.903846
Vietnam,Ha Long,48,15.0,23.0,1.0,,1.641026
Vietnam,Hanoi,52,22.0,81.0,15.0,1.0,1.957983


In [12]:
food.reset_index(inplace=True)
food.drop(columns = ['avg_price'], inplace=True)
food

price_level,country,city,id,1.0,2.0,3.0,4.0
0,Argentina,Buenos Aires,85,6.0,80.0,68.0,31.0
1,Australia,Melbourne,73,18.0,84.0,79.0,19.0
2,Australia,Sydney,55,26.0,14.0,67.0,29.0
3,Austria,Vienna,37,65.0,86.0,78.0,19.0
4,Belgium,Brussels,60,10.0,89.0,44.0,18.0
...,...,...,...,...,...,...,...
132,Uruguay,Montevideo,129,7.0,93.0,27.0,5.0
133,Vietnam,Da Nang,90,18.0,78.0,8.0,
134,Vietnam,Ha Long,48,15.0,23.0,1.0,
135,Vietnam,Hanoi,52,22.0,81.0,15.0,1.0


In [13]:
le = pp.LabelEncoder()
food['label_id'] = le.fit_transform(food.city)
food

price_level,country,city,id,1.0,2.0,3.0,4.0,label_id
0,Argentina,Buenos Aires,85,6.0,80.0,68.0,31.0,20
1,Australia,Melbourne,73,18.0,84.0,79.0,19.0,83
2,Australia,Sydney,55,26.0,14.0,67.0,29.0,121
3,Austria,Vienna,37,65.0,86.0,78.0,19.0,130
4,Belgium,Brussels,60,10.0,89.0,44.0,18.0,17
...,...,...,...,...,...,...,...,...
132,Uruguay,Montevideo,129,7.0,93.0,27.0,5.0,87
133,Vietnam,Da Nang,90,18.0,78.0,8.0,,34
134,Vietnam,Ha Long,48,15.0,23.0,1.0,,50
135,Vietnam,Hanoi,52,22.0,81.0,15.0,1.0,53


In [14]:
#Reverse encoder
#list(le.inverse_transform(food.label_id))

In [15]:
food_new = food[['label_id', 1.0, 2.0, 3.0, 4.0]].copy()
food_new

price_level,label_id,1.0,2.0,3.0,4.0
0,20,6.0,80.0,68.0,31.0
1,83,18.0,84.0,79.0,19.0
2,121,26.0,14.0,67.0,29.0
3,130,65.0,86.0,78.0,19.0
4,17,10.0,89.0,44.0,18.0
...,...,...,...,...,...
132,87,7.0,93.0,27.0,5.0
133,34,18.0,78.0,8.0,
134,50,15.0,23.0,1.0,
135,53,22.0,81.0,15.0,1.0


In [16]:
food_new.sort_values('label_id', inplace=True)
food_new.set_index('label_id', inplace=True)

In [17]:
food_new.fillna(0, inplace=True)

In [18]:
normalized = pp.normalize(food_new)
normalized_city = pd.DataFrame(normalized)
normalized_city

Unnamed: 0,0,1,2,3
0,0.578908,0.673811,0.436554,0.142354
1,0.093542,0.976996,0.187084,0.041574
2,0.133440,0.987457,0.080064,0.026688
3,0.073191,0.704468,0.704468,0.045745
4,0.475169,0.643777,0.590129,0.107296
...,...,...,...,...
132,0.473048,0.526457,0.625644,0.328082
133,0.000000,1.000000,0.000000,0.000000
134,0.000000,1.000000,0.000000,0.000000
135,0.135457,0.880471,0.406371,0.203186


### Survey / user food data

In [19]:
survey_food = pd.read_csv('/Users/tristannisbet/Documents/SM/survey_food_only.csv', index_col=0)
survey_food

Unnamed: 0,nationality,age,gender,food_one,food_two,food_three,food_four
0,United States,26-40,Female,Always,Always,Rarely,Never
1,United States,15-25,Female,Sometimes,Often,Sometimes,Sometimes
2,United States,15-25,Female,Sometimes,Often,Sometimes,Rarely
3,United States,26-40,Female,Sometimes,Often,Sometimes,Sometimes
4,United States,26-40,Female,Often,Often,Rarely,Rarely
...,...,...,...,...,...,...,...
148,Canada,15-25,Male,Sometimes,Often,Rarely,Rarely
149,United States,26-40,Male,Rarely,Often,Sometimes,Rarely
150,United States,26-40,Male,Sometimes,Often,Rarely,Rarely
151,United States,26-40,Male,Often,Always,Sometimes,Rarely


In [20]:
replace_map = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3, 'Always': 4}

df_food_replace = survey_food.replace(replace_map)

In [21]:
df_food_replace.drop(columns=['nationality', 'age', 'gender'], inplace=True)

In [22]:
normalized_user = pd.DataFrame(pp.normalize(df_food_replace))
normalized_user

Unnamed: 0,0,1,2,3
0,0.696311,0.696311,0.174078,0.000000
1,0.436436,0.654654,0.436436,0.436436
2,0.471405,0.707107,0.471405,0.235702
3,0.436436,0.654654,0.436436,0.436436
4,0.670820,0.670820,0.223607,0.223607
...,...,...,...,...
148,0.516398,0.774597,0.258199,0.258199
149,0.258199,0.774597,0.516398,0.258199
150,0.516398,0.774597,0.258199,0.258199
151,0.547723,0.730297,0.365148,0.182574


In [23]:
cosine_sim = pd.DataFrame(cosine_similarity(normalized_user, normalized_city))
cosine_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,127,128,129,130,131,132,133,134,135,136
0,0.948276,0.777994,0.794430,0.664125,0.881863,0.775324,0.901017,0.850985,0.803238,0.793378,...,0.778443,0.692994,0.960184,0.883276,0.746197,0.804877,0.696311,0.696311,0.778142,0.824331
1,0.946425,0.780214,0.751271,0.820546,0.933213,0.743685,0.921365,0.958680,0.913246,0.965157,...,0.981919,0.905368,0.921076,0.944892,0.894085,0.967342,0.654654,0.654654,0.901554,0.930533
2,0.988703,0.832928,0.805175,0.875508,0.982696,0.803272,0.950697,0.984734,0.956527,0.963111,...,0.963579,0.924461,0.964587,0.987282,0.935289,0.967519,0.707107,0.707107,0.925899,0.973623
3,0.946425,0.780214,0.751271,0.820546,0.933213,0.743685,0.921365,0.958680,0.913246,0.965157,...,0.981919,0.905368,0.921076,0.944892,0.894085,0.967342,0.654654,0.654654,0.901554,0.930533
4,0.969797,0.769268,0.775791,0.689423,0.906561,0.754167,0.924125,0.902097,0.833275,0.875199,...,0.875132,0.756829,0.968136,0.914991,0.783359,0.883748,0.670820,0.670820,0.817806,0.858688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,0.970352,0.864122,0.861352,0.777179,0.924119,0.846563,0.977309,0.921176,0.904251,0.890790,...,0.902439,0.834878,0.990292,0.931676,0.860096,0.898324,0.774597,0.774597,0.909347,0.906366
149,0.933596,0.888275,0.847571,0.940174,0.953801,0.855666,0.951658,0.958246,0.989890,0.935233,...,0.947471,0.973663,0.929037,0.956649,0.980109,0.937724,0.774597,0.774597,0.979296,0.981389
150,0.970352,0.864122,0.861352,0.777179,0.924119,0.846563,0.977309,0.921176,0.904251,0.890790,...,0.902439,0.834878,0.990292,0.931676,0.860096,0.898324,0.774597,0.774597,0.909347,0.906366
151,0.994560,0.840636,0.828333,0.820146,0.965484,0.819603,0.961319,0.958051,0.926153,0.925014,...,0.923428,0.866355,0.987197,0.969854,0.891054,0.931921,0.730297,0.730297,0.902680,0.941990


In [24]:
def find_similar_n(df,n):
    order = np.argsort(df.values, axis=1)[:, :n]
    df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False)
           .iloc[:n].index, 
          index=['top{}'.format(i) for i in range(1, n+1)]), axis=1)
    return df

In [25]:
top_10_city = find_similar_n(cosine_sim,10)
top_10_city

Unnamed: 0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
0,79,102,68,45,95,114,99,80,50,124
1,75,76,122,127,31,10,113,58,86,57
2,103,85,57,111,91,78,46,74,14,58
3,75,76,122,127,31,10,113,58,86,57
4,95,102,11,124,45,114,79,57,0,129
...,...,...,...,...,...,...,...,...,...,...
148,129,95,57,103,11,114,51,124,45,102
149,116,8,123,100,48,41,60,19,136,131
150,129,95,57,103,11,114,51,124,45,102
151,57,124,51,0,103,114,78,129,95,91


In [26]:
user_1 = top_10_city.iloc[0, 0:5].values.tolist()
user_1

[79, 102, 68, 45, 95]

In [27]:
for city in user_1:
    city2 = food[food.label_id == city]
    print(city2.city)

80    Manila
Name: city, dtype: object
84    Porto
Name: city, dtype: object
107    Krabi
Name: city, dtype: object
57    Florence
Name: city, dtype: object
127    Orlando
Name: city, dtype: object


# Attraction Similarity

In [28]:
city_attraction = pd.read_csv('/Users/tristannisbet/Documents/SM/city_attraction_only.csv', index_col=0)
city_attraction

Unnamed: 0_level_0,city,id,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Argentina,Buenos Aires,85,1,16,17,1,0,1,0,0,0,2,1
Australia,Melbourne,73,2,10,14,5,1,2,1,1,0,4,2
Australia,Sydney,55,1,8,15,2,1,2,0,0,1,2,1
Austria,Vienna,37,1,25,7,0,1,3,1,0,0,6,2
Belgium,Brussels,60,2,20,9,0,0,0,1,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Uruguay,Montevideo,129,1,18,9,0,0,0,1,0,0,2,1
Vietnam,Da Nang,90,1,7,5,0,0,0,0,0,0,14,1
Vietnam,Ha Long,48,1,2,1,0,0,0,0,0,1,0,1
Vietnam,Hanoi,52,0,11,6,0,0,1,0,0,0,9,0


In [29]:
le = pp.LabelEncoder()
city_attraction['label_id'] = le.fit_transform(city_attraction.city)
city_attraction

Unnamed: 0_level_0,city,id,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store,label_id
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Argentina,Buenos Aires,85,1,16,17,1,0,1,0,0,0,2,1,21
Australia,Melbourne,73,2,10,14,5,1,2,1,1,0,4,2,85
Australia,Sydney,55,1,8,15,2,1,2,0,0,1,2,1,123
Austria,Vienna,37,1,25,7,0,1,3,1,0,0,6,2,133
Belgium,Brussels,60,2,20,9,0,0,0,1,0,0,8,1,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Uruguay,Montevideo,129,1,18,9,0,0,0,1,0,0,2,1,89
Vietnam,Da Nang,90,1,7,5,0,0,0,0,0,0,14,1,35
Vietnam,Ha Long,48,1,2,1,0,0,0,0,0,1,0,1,52
Vietnam,Hanoi,52,0,11,6,0,0,1,0,0,0,9,0,55


In [30]:
city_attraction.sort_values('label_id', inplace=True)
city_attraction_clean = city_attraction.copy()
city_attraction_clean.set_index('label_id', inplace=True)
city_attraction_clean.drop(columns=['city', 'id'], inplace=True)
city_attraction_clean

Unnamed: 0_level_0,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
label_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,3,6,18,4,0,1,0,0,1,2,0
1,0,4,1,1,0,0,0,0,2,20,2
2,0,5,6,0,0,0,0,0,1,44,0
3,0,11,3,2,0,0,1,0,0,12,2
4,0,29,7,2,1,1,0,1,0,12,1
...,...,...,...,...,...,...,...,...,...,...,...
135,0,16,23,1,0,1,1,0,0,6,0
136,0,10,5,0,0,1,0,0,0,12,0
137,0,5,9,0,1,0,0,0,1,3,0
138,4,3,5,0,0,0,0,0,1,5,0


In [31]:
df = pd.read_csv('/Users/tristannisbet/Documents/SM/survey_responses.csv')

In [32]:
df.rename(columns = {'What country are you from? ': 'nationality', 'Age Range': 'age', 'Gender': 'gender',
                    '1. Choose your top favorite 3-5 cities you have traveled to that are on this list.  - Favorite City #1': 'favorite_city_one',
                    '2. Favorite city #2': 'favorite_city_two', '3. Favorite city #3': 'favorite_city_three',
                    '4. Favorite city #4': 'favorite_city_four', '5. Favorite city #5': 'favorite_city_five',
                    "6. If there's a city you have been and loved that is not on this list, add it below. ": 'extra_favorite',
                    "7. What cities on this list have you been to and not enjoyed?   - Least favorite city #1": 'least_favorite_one',
                    '8. Least favorite city #2': 'least_favorite_two', 
                    "9. If there's a city you have been to and haven't liked that is not on this list, add it below": 'extra_least_favorite',
                    "What price range of restaurant do you eat at when you travel? [Price level: 1 (Fast/Cheap Eats)]": 'food_one',
                    "What price range of restaurant do you eat at when you travel? [Price level: 2 (Casual Dining)]": 'food_two',
                    "What price range of restaurant do you eat at when you travel? [Price level: 3 (Upscale Dining)]": 'food_three',
                    "What price range of restaurant do you eat at when you travel? [Price level: 4 (Fine Dining/High End)]": 'food_four',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Art Gallery]": 'art_gallery',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Library]": 'library',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Museum ]": 'museum',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Aquarium]": 'aquarium',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Amusement Park ]": 'amusement_park',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Zoo]": 'zoo',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Movie Theater]": 'movie_theater',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Mall / Souvenir shop ]": 'store',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Park ]": 'park',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Natural Feature / Beach]": 'natural_feature',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Place of Worship (Church/Temple)]": 'place_of_worship'}, inplace=True )

In [33]:
user_attraction = df[['nationality', 'age', 'gender', 'amusement_park', 'museum', 'park', 'art_gallery', 'aquarium', 'zoo', 
                     'library', 'movie_theater', 'natural_feature', 'place_of_worship', 'store']].copy()
user_attraction

Unnamed: 0,nationality,age,gender,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
0,United States,26-40,Female,1 ( Would NOT go),2,4 (Definitely would go),2,1 ( Would NOT go),1 ( Would NOT go),2,2,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go)
1,United States,15-25,Female,3,2,3,1 ( Would NOT go),2,3,1 ( Would NOT go),3,3,1 ( Would NOT go),3
2,United States,15-25,Female,4 (Definitely would go),3,4 (Definitely would go),3,4 (Definitely would go),4 (Definitely would go),2,1 ( Would NOT go),4 (Definitely would go),3,4 (Definitely would go)
3,United States,26-40,Female,2,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),3,2,3,3,4 (Definitely would go),1 ( Would NOT go),3
4,United States,26-40,Female,2,3,4 (Definitely would go),2,1 ( Would NOT go),1 ( Would NOT go),3,3,4 (Definitely would go),1 ( Would NOT go),2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,Canada,15-25,Male,3,3,3,2,3,2,4 (Definitely would go),1 ( Would NOT go),3,4 (Definitely would go),2
149,United States,26-40,Male,4 (Definitely would go),4 (Definitely would go),2,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),3,4 (Definitely would go),2,1 ( Would NOT go),4 (Definitely would go)
150,United States,26-40,Male,1 ( Would NOT go),3,3,3,2,2,2,1 ( Would NOT go),4 (Definitely would go),2,1 ( Would NOT go)
151,United States,26-40,Male,2,3,3,3,2,1 ( Would NOT go),2,1 ( Would NOT go),4 (Definitely would go),3,3


In [34]:
user_attraction.drop(columns=['nationality', 'age', 'gender'], inplace=True)

In [35]:
replace_map = {'1 ( Would NOT go)': 1, '2': 2, '3': 3, '4 (Definitely would go)': 4}

user_attraction = user_attraction.replace(replace_map)


In [36]:
user_attraction

Unnamed: 0,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
0,1,2,4,2,1,1,2,2,4,4,4
1,3,2,3,1,2,3,1,3,3,1,3
2,4,3,4,3,4,4,2,1,4,3,4
3,2,4,4,4,3,2,3,3,4,1,3
4,2,3,4,2,1,1,3,3,4,1,2
...,...,...,...,...,...,...,...,...,...,...,...
148,3,3,3,2,3,2,4,1,3,4,2
149,4,4,2,4,4,4,3,4,2,1,4
150,1,3,3,3,2,2,2,1,4,2,1
151,2,3,3,3,2,1,2,1,4,3,3


In [37]:
city_attraction_clean

Unnamed: 0_level_0,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
label_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,3,6,18,4,0,1,0,0,1,2,0
1,0,4,1,1,0,0,0,0,2,20,2
2,0,5,6,0,0,0,0,0,1,44,0
3,0,11,3,2,0,0,1,0,0,12,2
4,0,29,7,2,1,1,0,1,0,12,1
...,...,...,...,...,...,...,...,...,...,...,...
135,0,16,23,1,0,1,1,0,0,6,0
136,0,10,5,0,0,1,0,0,0,12,0
137,0,5,9,0,1,0,0,0,1,3,0
138,4,3,5,0,0,0,0,0,1,5,0


In [38]:
normalized_user_a = pd.DataFrame(pp.normalize(user_attraction))
normalized_user_a

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.109764,0.219529,0.439057,0.219529,0.109764,0.109764,0.219529,0.219529,0.439057,0.439057,0.439057
1,0.372104,0.248069,0.372104,0.124035,0.248069,0.372104,0.124035,0.372104,0.372104,0.124035,0.372104
2,0.353553,0.265165,0.353553,0.265165,0.353553,0.353553,0.176777,0.088388,0.353553,0.265165,0.353553
3,0.191565,0.383131,0.383131,0.383131,0.287348,0.191565,0.287348,0.287348,0.383131,0.095783,0.287348
4,0.232495,0.348743,0.464991,0.232495,0.116248,0.116248,0.348743,0.348743,0.464991,0.116248,0.232495
...,...,...,...,...,...,...,...,...,...,...,...
148,0.316228,0.316228,0.316228,0.210819,0.316228,0.210819,0.421637,0.105409,0.316228,0.421637,0.210819
149,0.350823,0.350823,0.175412,0.350823,0.350823,0.350823,0.263117,0.350823,0.175412,0.087706,0.350823
150,0.127000,0.381000,0.381000,0.381000,0.254000,0.254000,0.254000,0.127000,0.508001,0.254000,0.127000
151,0.230940,0.346410,0.346410,0.346410,0.230940,0.115470,0.230940,0.115470,0.461880,0.346410,0.346410


In [39]:
normalized_city_a = pd.DataFrame(pp.normalize(city_attraction_clean))
normalized_city_a

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.151717,0.303433,0.910299,0.202289,0.000000,0.050572,0.000000,0.000000,0.050572,0.101144,0.000000
1,0.000000,0.193801,0.048450,0.048450,0.000000,0.000000,0.000000,0.000000,0.096900,0.969003,0.096900
2,0.000000,0.111859,0.134231,0.000000,0.000000,0.000000,0.000000,0.000000,0.022372,0.984362,0.000000
3,0.000000,0.653882,0.178331,0.118888,0.000000,0.000000,0.059444,0.000000,0.000000,0.713326,0.118888
4,0.000000,0.898388,0.216852,0.061958,0.030979,0.030979,0.000000,0.030979,0.000000,0.371747,0.030979
...,...,...,...,...,...,...,...,...,...,...,...
135,0.000000,0.557386,0.801243,0.034837,0.000000,0.034837,0.034837,0.000000,0.000000,0.209020,0.000000
136,0.000000,0.608581,0.304290,0.000000,0.000000,0.060858,0.000000,0.000000,0.000000,0.730297,0.000000
137,0.000000,0.462250,0.832050,0.000000,0.092450,0.000000,0.000000,0.000000,0.092450,0.277350,0.000000
138,0.458831,0.344124,0.573539,0.000000,0.000000,0.000000,0.000000,0.000000,0.114708,0.573539,0.000000


In [40]:
cosine_sim_a = pd.DataFrame(cosine_similarity(normalized_user_a, normalized_city_a))
cosine_sim_a

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
0,0.599510,0.584991,0.525505,0.626382,0.496455,0.741575,0.565143,0.633738,0.609336,0.613448,...,0.553342,0.555368,0.519200,0.476627,0.560658,0.585044,0.594524,0.639306,0.679905,0.674164
1,0.545725,0.264418,0.208117,0.383401,0.399616,0.631361,0.384353,0.507457,0.564305,0.327191,...,0.490606,0.490290,0.230079,0.451129,0.419190,0.483947,0.377426,0.516016,0.583338,0.527408
2,0.572159,0.406831,0.346047,0.509652,0.465490,0.691231,0.484581,0.561016,0.575524,0.468295,...,0.493568,0.507774,0.370951,0.495340,0.502391,0.514219,0.484123,0.555662,0.648886,0.584634
3,0.610336,0.269160,0.197141,0.483963,0.519267,0.629385,0.447493,0.571325,0.599683,0.327608,...,0.538572,0.560349,0.262067,0.547443,0.526027,0.570584,0.431357,0.584437,0.538364,0.592815
4,0.652557,0.281611,0.226259,0.469894,0.496969,0.667033,0.454435,0.604498,0.635623,0.350828,...,0.595041,0.606552,0.266847,0.526351,0.504484,0.615551,0.445700,0.634079,0.613388,0.620617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,0.543739,0.546459,0.499939,0.639125,0.548598,0.712153,0.608048,0.612624,0.532363,0.605603,...,0.474161,0.505556,0.535262,0.547695,0.582943,0.547144,0.609425,0.584705,0.713385,0.617535
149,0.425805,0.229465,0.153047,0.422299,0.451027,0.495142,0.384671,0.422543,0.431971,0.276455,...,0.346911,0.369800,0.237935,0.491515,0.458092,0.388033,0.352282,0.381095,0.452725,0.430946
150,0.622999,0.418416,0.355154,0.573752,0.566543,0.693470,0.557013,0.621564,0.588398,0.465612,...,0.541734,0.568947,0.403430,0.570323,0.580412,0.601699,0.548758,0.634023,0.611852,0.636020
151,0.589797,0.514698,0.436574,0.631486,0.561610,0.705318,0.583511,0.618115,0.573536,0.544664,...,0.510462,0.535551,0.479254,0.552829,0.603106,0.567185,0.576237,0.608487,0.675512,0.638287


In [41]:
top_city_a = find_similar_n(cosine_sim_a,10)
top_city_a

Unnamed: 0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
0,100,43,40,5,26,12,72,30,87,115
1,100,41,52,103,65,5,85,72,129,97
2,100,41,103,72,43,5,52,85,57,129
3,85,100,52,61,105,67,45,117,43,115
4,52,85,65,43,123,5,28,41,115,105
...,...,...,...,...,...,...,...,...,...,...
148,100,43,72,92,26,57,103,40,138,23
149,100,52,41,85,61,103,107,117,45,77
150,100,85,72,43,105,115,61,45,5,103
151,100,43,52,72,45,57,85,40,5,115


In [42]:
user_1 = top_city_a.iloc[0, 0:5].values.tolist()
user_1

[100, 43, 40, 5, 26]

In [43]:
for city in user_1:
    city2 = food[food.label_id == city]
    print(city2.city)

128    Philadelphia
Name: city, dtype: object
33    Düsseldorf
Name: city, dtype: object
114    Dubai
Name: city, dtype: object
110    Antalya
Name: city, dtype: object
79    Cebu City
Name: city, dtype: object


In [44]:
top_city_a

Unnamed: 0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
0,100,43,40,5,26,12,72,30,87,115
1,100,41,52,103,65,5,85,72,129,97
2,100,41,103,72,43,5,52,85,57,129
3,85,100,52,61,105,67,45,117,43,115
4,52,85,65,43,123,5,28,41,115,105
...,...,...,...,...,...,...,...,...,...,...
148,100,43,72,92,26,57,103,40,138,23
149,100,52,41,85,61,103,107,117,45,77
150,100,85,72,43,105,115,61,45,5,103
151,100,43,52,72,45,57,85,40,5,115


In [45]:
top_10_city = top_10_city.add_suffix('food') 

In [46]:
top_city_a = top_city_a.add_suffix('a') 

In [47]:
top_city_a

Unnamed: 0,top1a,top2a,top3a,top4a,top5a,top6a,top7a,top8a,top9a,top10a
0,100,43,40,5,26,12,72,30,87,115
1,100,41,52,103,65,5,85,72,129,97
2,100,41,103,72,43,5,52,85,57,129
3,85,100,52,61,105,67,45,117,43,115
4,52,85,65,43,123,5,28,41,115,105
...,...,...,...,...,...,...,...,...,...,...
148,100,43,72,92,26,57,103,40,138,23
149,100,52,41,85,61,103,107,117,45,77
150,100,85,72,43,105,115,61,45,5,103
151,100,43,52,72,45,57,85,40,5,115


In [48]:
food[food.label_id == 100]

price_level,country,city,id,1.0,2.0,3.0,4.0,label_id
128,United States,Philadelphia,139,26.0,89.0,78.0,22.0,100


In [49]:
food.sort_values('label_id')

price_level,country,city,id,1.0,2.0,3.0,4.0,label_id
113,United Arab Emirates,Abu Dhabi,94,61.0,71.0,46.0,15.0,0
37,Ghana,Accra,135,9.0,94.0,18.0,4.0,1
43,India,Agra,26,10.0,74.0,6.0,2.0,2
66,Jordan,Amman,130,8.0,77.0,77.0,5.0,3
76,Netherlands,Amsterdam,25,62.0,84.0,77.0,14.0,4
...,...,...,...,...,...,...,...,...
131,United States,Washington D.C.,106,62.0,69.0,82.0,43.0,132
20,China,Xi'an,119,,11.0,,,133
21,China,Xiamen,123,,1.0,,,134
22,China,Zhuhai,68,2.0,13.0,6.0,3.0,135


In [50]:
city_attraction

Unnamed: 0_level_0,city,id,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store,label_id
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
United Arab Emirates,Abu Dhabi,94,3,6,18,4,0,1,0,0,1,2,0,0
Ghana,Accra,135,0,4,1,1,0,0,0,0,2,20,2,1
India,Agra,26,0,5,6,0,0,0,0,0,1,44,0,2
Jordan,Amman,130,0,11,3,2,0,0,1,0,0,12,2,3
Netherlands,Amsterdam,25,0,29,7,2,1,1,0,1,0,12,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United States,Washington D.C.,106,0,16,23,1,0,1,1,0,0,6,0,135
China,Xi'an,119,0,10,5,0,0,1,0,0,0,12,0,136
China,Xiamen,123,0,5,9,0,1,0,0,0,1,3,0,137
China,Zhuhai,68,4,3,5,0,0,0,0,0,1,5,0,138


In [51]:
df

Unnamed: 0,Timestamp,nationality,age,gender,favorite_city_one,favorite_city_two,favorite_city_three,favorite_city_four,favorite_city_five,extra_favorite,...,aquarium,amusement_park,zoo,movie_theater,store,park,natural_feature,place_of_worship,ID,id
0,6/1/20 9:42,United States,26-40,Female,Krabi,Rome,Budapest,,,"Hakone, Japan",...,1 ( Would NOT go),1 ( Would NOT go),1 ( Would NOT go),2,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),,0.0
1,6/1/20 9:49,United States,15-25,Female,Cancun,Las Vegas,Los Angeles,New York City,London,,...,2,3,3,3,3,3,3,1 ( Would NOT go),,1.0
2,6/1/20 9:57,United States,15-25,Female,Sydney,London,Dublin,Prague,Rome,,...,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),3,,2.0
3,6/1/20 10:16,United States,26-40,Female,Rome,London,Florence,Mexico City,Munich,Astoria,...,3,2,2,3,3,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),,
4,6/1/20 10:17,United States,26-40,Female,Denpasar,Shanghai,Kyoto,New York City,Vancouver,,...,1 ( Would NOT go),2,1 ( Would NOT go),3,2,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,6/13/20 2:11,Canada,15-25,Male,Melbourne,Vancouver,Sydney,,,,...,3,3,2,1 ( Would NOT go),2,3,3,4 (Definitely would go),,
149,6/13/20 14:52,United States,26-40,Male,Atlanta,New York City,Cairo,Honolulu,San Francisco,Seattle,...,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),2,2,1 ( Would NOT go),,
150,6/14/20 1:50,United States,26-40,Male,Berlin,Los Angeles,Istanbul,,,,...,2,1 ( Would NOT go),2,1 ( Would NOT go),1 ( Would NOT go),3,4 (Definitely would go),2,,
151,6/16/20 22:28,United States,26-40,Male,Florence,Miami,Las Vegas,,,,...,2,2,1 ( Would NOT go),1 ( Would NOT go),3,3,4 (Definitely would go),3,,


In [52]:
fav_city = df.iloc[:, 4:9].copy()


In [53]:
fav_city

Unnamed: 0,favorite_city_one,favorite_city_two,favorite_city_three,favorite_city_four,favorite_city_five
0,Krabi,Rome,Budapest,,
1,Cancun,Las Vegas,Los Angeles,New York City,London
2,Sydney,London,Dublin,Prague,Rome
3,Rome,London,Florence,Mexico City,Munich
4,Denpasar,Shanghai,Kyoto,New York City,Vancouver
...,...,...,...,...,...
148,Melbourne,Vancouver,Sydney,,
149,Atlanta,New York City,Cairo,Honolulu,San Francisco
150,Berlin,Los Angeles,Istanbul,,
151,Florence,Miami,Las Vegas,,


In [54]:
fav_city.sample()

Unnamed: 0,favorite_city_one,favorite_city_two,favorite_city_three,favorite_city_four,favorite_city_five
84,Barcelona,Brussels,Boston,Edinburgh,Sydney


In [55]:
user_29 = fav_city.iloc[29]
user_29

favorite_city_one                Kyoto
favorite_city_two                Tokyo
favorite_city_three    Washington D.C.
favorite_city_four         Los Angeles
favorite_city_five       New York City
Name: 29, dtype: object

In [56]:
#Currently just using a single user. 
def food_sim(city):
    sim_list_f = []
    for c in city:
        info = food[food.city == c]
        label = info.label_id
        label = label.iloc[0]
        sim_score = cosine_sim.iloc[29, label]
        sim_list_f.append(sim_score)
    return sim_list_f

def attraction_sim(city):
    sim_list_a = []
    for c in city:
        info = city_attraction[city_attraction.city == c]
        label = info.label_id
        label = label.iloc[0]
        sim_score = cosine_sim_a.iloc[29, label]
        sim_list_a.append(sim_score)
    return sim_list_a


# Add to the dataframe for each user. 
# Maybe have all sim score, then the top 5 countries as rows after.
# Still need to multiply for higher ranked city. 
def attraction_food_sim(attraction_sim, food_sim):
    dataf = pd.DataFrame()
    sim_df = list( map(add, attraction_sim, food_sim) )
    
    

In [57]:
ok = food_sim(user_29)
ok

[0.8263980679802435,
 0.8046776948003729,
 0.9545551677853725,
 0.938246472187091,
 0.8836921919038008]

In [58]:
oka = attraction_sim(user_29)
oka

[0.5810845055677971,
 0.7688780475035983,
 0.6183245801773534,
 0.7174632137890834,
 0.6592559760604247]

In [59]:
from operator import add
bla2 = list( map(add, ok, oka) )
bla2

[1.4074825735480405,
 1.5735557423039712,
 1.572879747962726,
 1.6557096859761744,
 1.5429481679642256]

In [61]:
df = pd.DataFrame(bla2)

In [62]:
df

Unnamed: 0,0
0,1.407483
1,1.573556
2,1.57288
3,1.65571
4,1.542948


In [None]:
df

In [None]:
dff = pd.DataFrame(bla).T

In [None]:
print(bla2)

In [None]:
dff

In [None]:
#df.loc['two'] = [4, 5, 6]
dff.loc[1] = bla2

In [None]:
dff

In [None]:
dataf = pd.DataFrame()

In [None]:
bla

In [None]:
dataf.concat(bla)

In [None]:
food

In [None]:
diff = set(food.city).symmetric_difference(set(city_attraction.city))
diff = list(diff)
diff

In [None]:
food[food.city == 'Tehran']

In [None]:
type(oka)

In [None]:
oka

In [None]:
cosine_sim.iloc[29, 71]