In [1]:
import pandas as pd
import numpy as np
import sqlite3
import ast
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
def get_df(table_name):
    try:
        conn = sqlite3.connect('/Users/tristannisbet/Documents/travel_app/places.db')

    except Exception as e:
        print('Error durring connection: ', str(e))
    
    sql = """select * from {}""".format(table_name)
    df = pd.read_sql_query(sql, conn)

    return df

In [3]:
def find_similar_n(df,n):
    order = np.argsort(df.values, axis=1)[:, :n]
    df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False)
           .iloc[:n].index, 
          index=['top{}'.format(i) for i in range(1, n+1)]), axis=1)
    return df

In [4]:
user_response = pd.read_csv('/Users/tristannisbet/Documents/SM/survey_responses.csv')

In [5]:
user_response.rename(columns = {'What country are you from? ': 'nationality', 'Age Range': 'age', 'Gender': 'gender',
                    '1. Choose your top favorite 3-5 cities you have traveled to that are on this list.  - Favorite City #1': 'favorite_city_one',
                    '2. Favorite city #2': 'favorite_city_two', '3. Favorite city #3': 'favorite_city_three',
                    '4. Favorite city #4': 'favorite_city_four', '5. Favorite city #5': 'favorite_city_five',
                    "6. If there's a city you have been and loved that is not on this list, add it below. ": 'extra_favorite',
                    "7. What cities on this list have you been to and not enjoyed?   - Least favorite city #1": 'least_favorite_one',
                    '8. Least favorite city #2': 'least_favorite_two', 
                    "9. If there's a city you have been to and haven't liked that is not on this list, add it below": 'extra_least_favorite',
                    "What price range of restaurant do you eat at when you travel? [Price level: 1 (Fast/Cheap Eats)]": 'food_one',
                    "What price range of restaurant do you eat at when you travel? [Price level: 2 (Casual Dining)]": 'food_two',
                    "What price range of restaurant do you eat at when you travel? [Price level: 3 (Upscale Dining)]": 'food_three',
                    "What price range of restaurant do you eat at when you travel? [Price level: 4 (Fine Dining/High End)]": 'food_four',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Art Gallery]": 'art_gallery',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Library]": 'library',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Museum ]": 'museum',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Aquarium]": 'aquarium',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Amusement Park ]": 'amusement_park',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Zoo]": 'zoo',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Movie Theater]": 'movie_theater',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Mall / Souvenir shop ]": 'shop',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Park ]": 'park',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Natural Feature / Beach]": 'natural_feature',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Place of Worship (Church/Temple)]": 'place_of_worship'}, inplace=True )

In [6]:
user_response.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Timestamp             153 non-null    object
 1   nationality           153 non-null    object
 2   age                   153 non-null    object
 3   gender                153 non-null    object
 4   favorite_city_one     153 non-null    object
 5   favorite_city_two     151 non-null    object
 6   favorite_city_three   143 non-null    object
 7   favorite_city_four    116 non-null    object
 8   favorite_city_five    105 non-null    object
 9   extra_favorite        89 non-null     object
 10  least_favorite_one    117 non-null    object
 11  least_favorite_two    89 non-null     object
 12  extra_least_favorite  29 non-null     object
 13  food_one              153 non-null    object
 14  food_two              153 non-null    object
 15  food_three            153 non-null    ob

In [7]:
user_response.fillna('Zx', inplace=True)

In [8]:
user_response.drop(columns=['Timestamp', 'extra_favorite', 'extra_least_favorite', 'least_favorite_one', 'least_favorite_two'], inplace=True)

In [9]:
nationality_dict = {'Australia': 1, 'Canada': 2, 'China': 3, 'Finland': 4, 'Honduras': 5,
              'India': 6, 'Israel': 7, 'Japan': 8, 'Mexico': 9, 'Pakistan': 10, 'Philippines': 11, 'United States': 12}

user_response.nationality = user_response.nationality.map(nationality_dict)


In [10]:
user_response

Unnamed: 0,nationality,age,gender,favorite_city_one,favorite_city_two,favorite_city_three,favorite_city_four,favorite_city_five,food_one,food_two,...,library,museum,aquarium,amusement_park,zoo,movie_theater,shop,park,natural_feature,place_of_worship
0,12,26-40,Female,Krabi,Rome,Budapest,Zx,Zx,Always,Always,...,2,2,1 ( Would NOT go),1 ( Would NOT go),1 ( Would NOT go),2,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go)
1,12,15-25,Female,Cancun,Las Vegas,Los Angeles,New York City,London,Sometimes,Often,...,1 ( Would NOT go),2,2,3,3,3,3,3,3,1 ( Would NOT go)
2,12,15-25,Female,Sydney,London,Dublin,Prague,Rome,Sometimes,Often,...,2,3,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),3
3,12,26-40,Female,Rome,London,Florence,Mexico City,Munich,Sometimes,Often,...,3,4 (Definitely would go),3,2,2,3,3,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go)
4,12,26-40,Female,Denpasar,Shanghai,Kyoto,New York City,Vancouver,Often,Often,...,3,3,1 ( Would NOT go),2,1 ( Would NOT go),3,2,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2,15-25,Male,Melbourne,Vancouver,Sydney,Zx,Zx,Sometimes,Often,...,4 (Definitely would go),3,3,3,2,1 ( Would NOT go),2,3,3,4 (Definitely would go)
149,12,26-40,Male,Atlanta,New York City,Cairo,Honolulu,San Francisco,Rarely,Often,...,3,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),2,2,1 ( Would NOT go)
150,12,26-40,Male,Berlin,Los Angeles,Istanbul,Zx,Zx,Sometimes,Often,...,2,3,2,1 ( Would NOT go),2,1 ( Would NOT go),1 ( Would NOT go),3,4 (Definitely would go),2
151,12,26-40,Male,Florence,Miami,Las Vegas,Zx,Zx,Often,Always,...,2,3,2,2,1 ( Would NOT go),1 ( Would NOT go),3,3,4 (Definitely would go),3


In [11]:
db = get_df('cities')
db = db.drop(db[db.id == '80'].index)

In [12]:
new_row = {'id': 200, 'city': 'Zx', 'country': 'None'}

db = db.append(new_row, ignore_index=True)

In [13]:
db[db['id'] == '80']

Unnamed: 0,id,city,country


In [14]:
# Remove user with favorite city = Guilin - no restuarants were found not enough data
user_response.loc[108, 'favorite_city_four'] = 'Zx'

In [15]:
from sklearn import preprocessing as pp

le = pp.LabelEncoder()
le.fit(db.city)
#numpy.save('classes.npy', encoder.classes_)

user_response['one'] = le.transform(user_response['favorite_city_one'])
user_response

Unnamed: 0,nationality,age,gender,favorite_city_one,favorite_city_two,favorite_city_three,favorite_city_four,favorite_city_five,food_one,food_two,...,museum,aquarium,amusement_park,zoo,movie_theater,shop,park,natural_feature,place_of_worship,one
0,12,26-40,Female,Krabi,Rome,Budapest,Zx,Zx,Always,Always,...,2,1 ( Would NOT go),1 ( Would NOT go),1 ( Would NOT go),2,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),68
1,12,15-25,Female,Cancun,Las Vegas,Los Angeles,New York City,London,Sometimes,Often,...,2,2,3,3,3,3,3,3,1 ( Would NOT go),23
2,12,15-25,Female,Sydney,London,Dublin,Prague,Rome,Sometimes,Often,...,3,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),3,122
3,12,26-40,Female,Rome,London,Florence,Mexico City,Munich,Sometimes,Often,...,4 (Definitely would go),3,2,2,3,3,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),111
4,12,26-40,Female,Denpasar,Shanghai,Kyoto,New York City,Vancouver,Often,Often,...,3,1 ( Would NOT go),2,1 ( Would NOT go),3,2,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2,15-25,Male,Melbourne,Vancouver,Sydney,Zx,Zx,Sometimes,Often,...,3,3,3,2,1 ( Would NOT go),2,3,3,4 (Definitely would go),83
149,12,26-40,Male,Atlanta,New York City,Cairo,Honolulu,San Francisco,Rarely,Often,...,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),2,2,1 ( Would NOT go),7
150,12,26-40,Male,Berlin,Los Angeles,Istanbul,Zx,Zx,Sometimes,Often,...,3,2,1 ( Would NOT go),2,1 ( Would NOT go),1 ( Would NOT go),3,4 (Definitely would go),2,15
151,12,26-40,Male,Florence,Miami,Las Vegas,Zx,Zx,Often,Always,...,3,2,2,1 ( Would NOT go),1 ( Would NOT go),3,3,4 (Definitely would go),3,45


In [16]:
user_response['two'] = le.transform(user_response['favorite_city_two'])
user_response['three'] = le.transform(user_response['favorite_city_three'])
user_response['four'] = le.transform(user_response['favorite_city_four'])
user_response['five'] = le.transform(user_response['favorite_city_five'])


In [17]:
user_response.movie_theater.value_counts()

2                          63
1 ( Would NOT go)          56
3                          21
4 (Definitely would go)    13
Name: movie_theater, dtype: int64

In [18]:
slim_response = user_response.drop(columns=['favorite_city_one', 'favorite_city_two', 'favorite_city_three',
                                          'favorite_city_four', 'favorite_city_five'])

In [19]:
slim_response

Unnamed: 0,nationality,age,gender,food_one,food_two,food_three,food_four,art_gallery,library,museum,...,movie_theater,shop,park,natural_feature,place_of_worship,one,two,three,four,five
0,12,26-40,Female,Always,Always,Rarely,Never,2,2,2,...,2,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),68,111,20,138,138
1,12,15-25,Female,Sometimes,Often,Sometimes,Sometimes,1 ( Would NOT go),1 ( Would NOT go),2,...,3,3,3,3,1 ( Would NOT go),23,72,76,93,75
2,12,15-25,Female,Sometimes,Often,Sometimes,Rarely,3,2,3,...,1 ( Would NOT go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),3,122,75,41,103,111
3,12,26-40,Female,Sometimes,Often,Sometimes,Sometimes,4 (Definitely would go),3,4 (Definitely would go),...,3,3,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),111,75,45,84,92
4,12,26-40,Female,Often,Often,Rarely,Rarely,2,3,3,...,3,2,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),39,117,71,93,129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2,15-25,Male,Sometimes,Often,Rarely,Rarely,2,4 (Definitely would go),3,...,1 ( Would NOT go),2,3,3,4 (Definitely would go),83,129,122,138,138
149,12,26-40,Male,Rarely,Often,Sometimes,Rarely,4 (Definitely would go),3,4 (Definitely would go),...,4 (Definitely would go),4 (Definitely would go),2,2,1 ( Would NOT go),7,93,22,57,113
150,12,26-40,Male,Sometimes,Often,Rarely,Rarely,3,2,3,...,1 ( Would NOT go),1 ( Would NOT go),3,4 (Definitely would go),2,15,76,60,138,138
151,12,26-40,Male,Often,Always,Sometimes,Rarely,3,2,3,...,1 ( Would NOT go),3,3,4 (Definitely would go),3,45,85,72,138,138


In [20]:
replace_map = {'1 ( Would NOT go)': 1, '2': 2, '3': 3, '4 (Definitely would go)': 4}

slim_response = slim_response.replace(replace_map)

In [21]:
replace_map_food = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3, 'Always': 4}

slim_response = slim_response.replace(replace_map_food)


In [22]:
slim_response

Unnamed: 0,nationality,age,gender,food_one,food_two,food_three,food_four,art_gallery,library,museum,...,movie_theater,shop,park,natural_feature,place_of_worship,one,two,three,four,five
0,12,26-40,Female,4,4,1,0,2,2,2,...,2,4,4,4,4,68,111,20,138,138
1,12,15-25,Female,2,3,2,2,1,1,2,...,3,3,3,3,1,23,72,76,93,75
2,12,15-25,Female,2,3,2,1,3,2,3,...,1,4,4,4,3,122,75,41,103,111
3,12,26-40,Female,2,3,2,2,4,3,4,...,3,3,4,4,1,111,75,45,84,92
4,12,26-40,Female,3,3,1,1,2,3,3,...,3,2,4,4,1,39,117,71,93,129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2,15-25,Male,2,3,1,1,2,4,3,...,1,2,3,3,4,83,129,122,138,138
149,12,26-40,Male,1,3,2,1,4,3,4,...,4,4,2,2,1,7,93,22,57,113
150,12,26-40,Male,2,3,1,1,3,2,3,...,1,1,3,4,2,15,76,60,138,138
151,12,26-40,Male,3,4,2,1,3,2,3,...,1,3,3,4,3,45,85,72,138,138


In [23]:
slim_response.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   nationality       153 non-null    int64 
 1   age               153 non-null    object
 2   gender            153 non-null    object
 3   food_one          153 non-null    int64 
 4   food_two          153 non-null    int64 
 5   food_three        153 non-null    int64 
 6   food_four         153 non-null    int64 
 7   art_gallery       153 non-null    int64 
 8   library           153 non-null    int64 
 9   museum            153 non-null    int64 
 10  aquarium          153 non-null    int64 
 11  amusement_park    153 non-null    int64 
 12  zoo               153 non-null    int64 
 13  movie_theater     153 non-null    int64 
 14  shop              153 non-null    int64 
 15  park              153 non-null    int64 
 16  natural_feature   153 non-null    int64 
 17  place_of_worship

In [24]:
dummy = pd.get_dummies(slim_response)


In [25]:
dummy.columns

Index(['nationality', 'food_one', 'food_two', 'food_three', 'food_four',
       'art_gallery', 'library', 'museum', 'aquarium', 'amusement_park', 'zoo',
       'movie_theater', 'shop', 'park', 'natural_feature', 'place_of_worship',
       'one', 'two', 'three', 'four', 'five', 'age_15-25', 'age_26-40',
       'age_41-55', 'age_55+', 'gender_Female', 'gender_Male'],
      dtype='object')

In [26]:
for c in user_response.columns:
    print(c)

nationality
age
gender
favorite_city_one
favorite_city_two
favorite_city_three
favorite_city_four
favorite_city_five
food_one
food_two
food_three
food_four
art_gallery
library
museum
aquarium
amusement_park
zoo
movie_theater
shop
park
natural_feature
place_of_worship
one
two
three
four
five


In [27]:
for c in dummy.columns:
    print(c)

nationality
food_one
food_two
food_three
food_four
art_gallery
library
museum
aquarium
amusement_park
zoo
movie_theater
shop
park
natural_feature
place_of_worship
one
two
three
four
five
age_15-25
age_26-40
age_41-55
age_55+
gender_Female
gender_Male


In [28]:
dummy


Unnamed: 0,nationality,food_one,food_two,food_three,food_four,art_gallery,library,museum,aquarium,amusement_park,...,two,three,four,five,age_15-25,age_26-40,age_41-55,age_55+,gender_Female,gender_Male
0,12,4,4,1,0,2,2,2,1,1,...,111,20,138,138,0,1,0,0,1,0
1,12,2,3,2,2,1,1,2,2,3,...,72,76,93,75,1,0,0,0,1,0
2,12,2,3,2,1,3,2,3,4,4,...,75,41,103,111,1,0,0,0,1,0
3,12,2,3,2,2,4,3,4,3,2,...,75,45,84,92,0,1,0,0,1,0
4,12,3,3,1,1,2,3,3,1,2,...,117,71,93,129,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2,2,3,1,1,2,4,3,3,3,...,129,122,138,138,1,0,0,0,0,1
149,12,1,3,2,1,4,3,4,4,4,...,93,22,57,113,0,1,0,0,0,1
150,12,2,3,1,1,3,2,3,2,1,...,76,60,138,138,0,1,0,0,0,1
151,12,3,4,2,1,3,2,3,2,2,...,85,72,138,138,0,1,0,0,0,1


In [29]:
cosine_user = cosine_similarity(dummy)


In [30]:
# cosine does not have user demographics 
cosine_user = pd.DataFrame(cosine_user)
cosine_user

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,143,144,145,146,147,148,149,150,151,152
0,1.000000,0.906416,0.939127,0.925607,0.949317,0.897894,0.665336,0.853609,0.818434,0.811177,...,0.830095,0.761325,0.924579,0.908951,0.991685,0.927896,0.927302,0.948040,0.964192,0.685717
1,0.906416,1.000000,0.857903,0.863406,0.964331,0.941654,0.810189,0.859679,0.934620,0.900747,...,0.922846,0.909674,0.972183,0.990188,0.919714,0.980553,0.876008,0.957036,0.973020,0.822234
2,0.939127,0.857903,1.000000,0.996179,0.890163,0.939022,0.784604,0.905388,0.835719,0.870645,...,0.855731,0.815119,0.929571,0.839704,0.964156,0.923568,0.805973,0.852455,0.911216,0.662635
3,0.925607,0.863406,0.996179,1.000000,0.895333,0.935215,0.826248,0.884811,0.857728,0.863925,...,0.846682,0.815382,0.928083,0.843030,0.959333,0.932144,0.801392,0.835651,0.901170,0.680586
4,0.949317,0.964331,0.890163,0.895333,1.000000,0.913258,0.808978,0.823104,0.862919,0.842220,...,0.870179,0.823551,0.954723,0.979551,0.958997,0.974951,0.961201,0.952228,0.970137,0.862691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,0.927896,0.980553,0.923568,0.932144,0.974951,0.964232,0.867451,0.880600,0.933619,0.911707,...,0.921044,0.902675,0.984755,0.973899,0.954420,1.000000,0.879184,0.933629,0.970271,0.831253
149,0.927302,0.876008,0.805973,0.801392,0.961201,0.797909,0.660771,0.717243,0.711131,0.708431,...,0.756374,0.672683,0.858449,0.917025,0.912365,0.879184,1.000000,0.921011,0.913722,0.825781
150,0.948040,0.957036,0.852455,0.835651,0.952228,0.920510,0.659762,0.886734,0.816824,0.878997,...,0.914109,0.857807,0.952554,0.968015,0.928928,0.933629,0.921011,1.000000,0.989882,0.783431
151,0.964192,0.973020,0.911216,0.901170,0.970137,0.959744,0.741294,0.916357,0.866837,0.913505,...,0.936018,0.890102,0.982694,0.975200,0.960902,0.970271,0.913722,0.989882,1.000000,0.795984


In [31]:
top_10_user = find_similar_n(cosine_user,10)
top_10_user

Unnamed: 0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
0,0,79,22,147,26,57,91,14,121,16
1,1,146,140,148,134,43,53,45,82,25
2,2,3,73,85,121,93,34,77,42,55
3,3,2,73,34,121,77,93,42,85,112
4,4,134,60,146,57,148,63,47,151,106
...,...,...,...,...,...,...,...,...,...,...
148,148,43,45,140,131,82,25,106,108,134
149,149,129,4,127,22,26,14,134,91,60
150,150,151,97,12,48,58,91,146,80,101
151,151,150,71,145,134,106,146,48,1,57


## Building city matrix


In [32]:
one = get_df('restaurants_one')
two = get_df('restaurants_two')
three = get_df('restaurants_three')
four = get_df('restaurants_four')
top_rest = get_df('restaurants')
all_food = pd.concat([one, two, three, four, top_rest], axis =0)
all_food.sample(5)

Unnamed: 0,country,city,name,address,price_level,rating,user_ratings_total,types,latitude,longitude,place_id,id
2258,Portugal,Lisbon,O Talho,"R. Carlos Testa 1, 1050-046 Lisboa, Portugal",3.0,4.3,762,"['restaurant', 'food', 'point_of_interest', 'e...",38.733435,-9.15242,ChIJ0XVltgwzGQ0RsKYN2IQ8sw8,63
451,India,Mumbai,Pondichery Cafe,"C 57, G Block BKC, Bandra Kurla Complex, Bandr...",4.0,4.3,653,"['cafe', 'bar', 'restaurant', 'food', 'point_o...",19.067895,72.86878,ChIJm_2Dr-7I5zsR_e3m9YIBQxU,14
4525,Argentina,Buenos Aires,Café Crespín,"Vera 699, C1414 AOM, Buenos Aires, Argentina",2.0,4.4,2758,"['restaurant', 'bakery', 'food', 'point_of_int...",-34.597266,-58.439199,ChIJ_2CsxHTKvJURLaDW-OTUCaI,85
7746,South Africa,Durban,New Little Gujarat Restaurant,"43 Dr Goonam St, Durban Central, Durban, 4001,...",1.0,4.0,128,"['restaurant', 'food', 'point_of_interest', 'e...",-29.854608,31.018599,ChIJg11-jsOp9x4RwLpGxm5ET5c,133
3331,Saudi Arabia,Dammam,Manoosha Alreef,"3956-3976 Khaleej Rd, Al-Hamra'a, Dammam 32422...",2.0,4.3,948,"['restaurant', 'food', 'point_of_interest', 'e...",26.459444,50.093611,ChIJU5urDw_8ST4RdOaGhpyP6LQ,64


In [33]:
all_food['price_level'] = all_food['price_level'].fillna(all_food.groupby('city')['price_level'].transform('mean'))
#Beirut and Tehran are still NA. fill na with 2?
all_food.fillna(2.0, inplace=True)

In [34]:
all_food['price_level'] = all_food['price_level'].astype(int)


In [35]:
attractions = get_df('attractions')
attractions.sample(5)

Unnamed: 0,country,city,name,address,price_level,rating,user_ratings_total,types,latitude,longitude,place_id,id
6060,Switzerland,Zurich,Lake promenade,"Mythenquai 301, 8038 Zürich, Switzerland",,4.7,142.0,"['park', 'tourist_attraction', 'point_of_inter...",47.350898,8.534494,ChIJPbEkxOEJkEcRhKwJoY3d0M8,104
1527,United States,Miami,Freedom Tower at Miami Dade College,"600 Biscayne Blvd, Miami, FL 33132, United States",,4.5,366.0,"['tourist_attraction', 'museum', 'point_of_int...",25.780331,-80.189378,ChIJc657J6C22YgRquWxgZhQ7uw,27
7982,Ecuador,Quito,Yaku Water Museum,"El Placer, Quito 170130, Ecuador",,4.4,3040.0,"['museum', 'tourist_attraction', 'point_of_int...",-0.2175,-78.518889,ChIJc6n0Hiua1ZEREu1p0HjiIUw,136
7906,Ghana,Accra,International Central Gospel Church,"69 Ring Road West, Accra, Ghana",,4.4,21.0,"['church', 'place_of_worship', 'point_of_inter...",5.566604,-0.223895,ChIJ____v-Gh3w8RDzVc0Iej91M,135
4991,Japan,Chiba,Tokyo Skytree,"1 Chome-1-2 Oshiage, Sumida City, Tokyo 131-86...",,4.4,54860.0,"['tourist_attraction', 'point_of_interest', 'e...",35.710063,139.8107,ChIJ35ov0dCOGGARKvdDH7NPHX0,86


In [36]:
attractions[attractions['city'] == 'Cancún']

Unnamed: 0,country,city,name,address,price_level,rating,user_ratings_total,types,latitude,longitude,place_id,id


In [37]:
attractions.shape

(8205, 12)

In [38]:
attractions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8205 entries, 0 to 8204
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             8205 non-null   object 
 1   city                8205 non-null   object 
 2   name                8205 non-null   object 
 3   address             8202 non-null   object 
 4   price_level         65 non-null     object 
 5   rating              8189 non-null   float64
 6   user_ratings_total  8189 non-null   float64
 7   types               8205 non-null   object 
 8   latitude            8145 non-null   float64
 9   longitude           8145 non-null   float64
 10  place_id            8205 non-null   object 
 11  id                  8205 non-null   object 
dtypes: float64(4), object(8)
memory usage: 769.3+ KB


In [39]:
def to_city(df):
    city_df = df.groupby(['country', 'city', 'id', 'price_level'])['name'].count().to_frame()
    price_level = city_df.pivot_table(index=['country', 'city', 'id'], columns='price_level', values='name', aggfunc='first')
    price_level['avg_price'] = df.groupby(['country', 'city', 'id'])['price_level'].mean()

    
    return price_level

In [40]:
food = to_city(all_food)
food

Unnamed: 0_level_0,Unnamed: 1_level_0,price_level,1,2,3,4,avg_price
country,city,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Argentina,Buenos Aires,85,6.0,95.0,68.0,31.0,2.620000
Australia,Melbourne,73,18.0,91.0,79.0,19.0,2.478261
Australia,Sydney,55,26.0,17.0,67.0,29.0,2.712230
Austria,Vienna,37,65.0,91.0,78.0,19.0,2.201581
Belgium,Brussels,60,10.0,103.0,44.0,18.0,2.400000
...,...,...,...,...,...,...,...
Uruguay,Montevideo,129,7.0,109.0,27.0,5.0,2.202703
Vietnam,Da Nang,90,54.0,78.0,8.0,,1.671429
Vietnam,Ha Long,48,67.0,23.0,1.0,,1.274725
Vietnam,Hanoi,52,53.0,81.0,15.0,1.0,1.760000


In [41]:
place_of_worship = ['place_of_worship', 'hindu_temple', 'church', 'mosque', 'synagogue']
shopping = ['store', 'shopping_mall', 'clothing_store', 'electronics_store', 'grocery_or_supermarket', 'department_store']

attractions_to_keep = ['amusement_park', 'museum', 'park', 'art_gallery', 'aquarium',
                      'zoo', 'library', 'movie_theater', 'natural_feature'] + place_of_worship + shopping

In [42]:
def main_call(attraction_df):
    attractions_split = split_types(attraction_df)
    dummy = dummies(attractions_split)
    by_city, all_attractions = attraction_count(dummy, attractions_split)
    
    return by_city, all_attractions

def split_types(df):
    df['split_types'] = [ast.literal_eval(x) for x in df.types]
    df['split_types_str'] = [','.join(x) for x in df.split_types]
    
    return df

def dummies(df):
    dummies = df.split_types_str.str.get_dummies(sep=',')

    return dummies

def attraction_count(dummies_df, all_attractions_df):
    all_attractions_df = pd.concat([all_attractions_df, dummies_df], axis=1)
    type_col_names = attractions_to_keep
    type_col_names.extend(['country', 'city', 'id'])
    attraction_count = all_attractions_df[type_col_names].groupby(['country', 'city', 'id']).sum()
    
    return attraction_count, all_attractions_df

In [43]:
city_group, all_attractions = main_call(attractions)

In [44]:
city_group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,hindu_temple,church,mosque,synagogue,store,shopping_mall,clothing_store,electronics_store,grocery_or_supermarket,department_store
country,city,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Argentina,Buenos Aires,85,1,16,17,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0
Australia,Melbourne,73,2,10,14,5,1,2,1,1,0,2,0,2,0,0,1,1,0,0,0,0
Australia,Sydney,55,1,8,15,2,1,2,0,0,1,1,0,1,0,0,1,0,0,0,0,0
Austria,Vienna,37,1,25,7,0,1,3,1,0,0,3,0,3,0,0,1,0,0,0,1,0
Belgium,Brussels,60,2,20,9,0,0,0,1,0,0,4,0,4,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Uruguay,Montevideo,129,1,18,9,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0
Vietnam,Da Nang,90,1,7,5,0,0,0,0,0,0,11,0,3,0,0,1,0,0,0,0,0
Vietnam,Ha Long,48,1,2,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
Vietnam,Hanoi,52,0,11,6,0,0,1,0,0,0,7,0,2,0,0,0,0,0,0,0,0


#### Adding columns for worship and stores

In [45]:
city_group['place_of_worship2'] = city_group['place_of_worship'] + city_group['hindu_temple'] + city_group['church'] + city_group['mosque'] + city_group['synagogue']
city_group['store2'] = city_group['store'] + city_group['shopping_mall'] + city_group['clothing_store'] + city_group['electronics_store'] + city_group['grocery_or_supermarket'] + city_group['department_store']


In [46]:
city_group.drop(columns=['place_of_worship', 'hindu_temple', 'church', 'mosque', 'synagogue', 'department_store',
                        'grocery_or_supermarket', 'electronics_store', 'clothing_store', 'shopping_mall', 'store'], inplace=True)
city_group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship2,store2
country,city,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Argentina,Buenos Aires,85,1,16,17,1,0,1,0,0,0,2,1
Australia,Melbourne,73,2,10,14,5,1,2,1,1,0,4,2
Australia,Sydney,55,1,8,15,2,1,2,0,0,1,2,1
Austria,Vienna,37,1,25,7,0,1,3,1,0,0,6,2
Belgium,Brussels,60,2,20,9,0,0,0,1,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Uruguay,Montevideo,129,1,18,9,0,0,0,1,0,0,2,1
Vietnam,Da Nang,90,1,7,5,0,0,0,0,0,0,14,1
Vietnam,Ha Long,48,1,2,1,0,0,0,0,0,1,0,1
Vietnam,Hanoi,52,0,11,6,0,0,1,0,0,0,9,0


In [47]:
city_group.rename(columns={"place_of_worship2" : 'place_of_worship', 'store2': 'store'}, inplace=True)

In [48]:
city_group.reset_index(inplace=True)

In [49]:
city_group

Unnamed: 0,country,city,id,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
0,Argentina,Buenos Aires,85,1,16,17,1,0,1,0,0,0,2,1
1,Australia,Melbourne,73,2,10,14,5,1,2,1,1,0,4,2
2,Australia,Sydney,55,1,8,15,2,1,2,0,0,1,2,1
3,Austria,Vienna,37,1,25,7,0,1,3,1,0,0,6,2
4,Belgium,Brussels,60,2,20,9,0,0,0,1,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,Uruguay,Montevideo,129,1,18,9,0,0,0,1,0,0,2,1
134,Vietnam,Da Nang,90,1,7,5,0,0,0,0,0,0,14,1
135,Vietnam,Ha Long,48,1,2,1,0,0,0,0,0,1,0,1
136,Vietnam,Hanoi,52,0,11,6,0,0,1,0,0,0,9,0


In [50]:
city_group = city_group.drop(city_group[city_group.city == 'Guilin'].index)

In [51]:
city_group.set_index(['country', 'city', 'id'], inplace=True)

In [52]:
def attraction_food(attraction_df, food_df):
    city = pd.merge(attraction_df, food_df, on = ['country', 'city', 'id'], how = 'outer')
    city.reset_index(inplace=True)
    city['id'] = pd.to_numeric(city['id'])
    city.set_index(['country', 'city', 'id'], inplace=True)
    city.sort_values('id', inplace=True)
    return city

In [53]:
food.fillna(0.0, inplace=True)

In [54]:
food

Unnamed: 0_level_0,Unnamed: 1_level_0,price_level,1,2,3,4,avg_price
country,city,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Argentina,Buenos Aires,85,6.0,95.0,68.0,31.0,2.620000
Australia,Melbourne,73,18.0,91.0,79.0,19.0,2.478261
Australia,Sydney,55,26.0,17.0,67.0,29.0,2.712230
Austria,Vienna,37,65.0,91.0,78.0,19.0,2.201581
Belgium,Brussels,60,10.0,103.0,44.0,18.0,2.400000
...,...,...,...,...,...,...,...
Uruguay,Montevideo,129,7.0,109.0,27.0,5.0,2.202703
Vietnam,Da Nang,90,54.0,78.0,8.0,0.0,1.671429
Vietnam,Ha Long,48,67.0,23.0,1.0,0.0,1.274725
Vietnam,Hanoi,52,53.0,81.0,15.0,1.0,1.760000


In [55]:
city_group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
country,city,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Argentina,Buenos Aires,85,1,16,17,1,0,1,0,0,0,2,1
Australia,Melbourne,73,2,10,14,5,1,2,1,1,0,4,2
Australia,Sydney,55,1,8,15,2,1,2,0,0,1,2,1
Austria,Vienna,37,1,25,7,0,1,3,1,0,0,6,2
Belgium,Brussels,60,2,20,9,0,0,0,1,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Uruguay,Montevideo,129,1,18,9,0,0,0,1,0,0,2,1
Vietnam,Da Nang,90,1,7,5,0,0,0,0,0,0,14,1
Vietnam,Ha Long,48,1,2,1,0,0,0,0,0,1,0,1
Vietnam,Hanoi,52,0,11,6,0,0,1,0,0,0,9,0


In [56]:
city = attraction_food(food, city_group)

In [57]:
city.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 138 entries, ('Hong Kong', 'Hong Kong', 1) to ('Mexico', 'Cancun', 140)
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   1                 138 non-null    float64
 1   2                 138 non-null    float64
 2   3                 138 non-null    float64
 3   4                 138 non-null    float64
 4   avg_price         138 non-null    float64
 5   amusement_park    138 non-null    int64  
 6   museum            138 non-null    int64  
 7   park              138 non-null    int64  
 8   art_gallery       138 non-null    int64  
 9   aquarium          138 non-null    int64  
 10  zoo               138 non-null    int64  
 11  library           138 non-null    int64  
 12  movie_theater     138 non-null    int64  
 13  natural_feature   138 non-null    int64  
 14  place_of_worship  138 non-null    int64  
 15  store             138 non-null    int64  

In [58]:
city

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1,2,3,4,avg_price,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
country,city,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,2.600000,4,7,5,1,0,0,0,0,0,4,0
Thailand,Bangkok,2,62.0,100.0,70.0,39.0,2.317343,1,13,10,2,1,1,0,0,0,18,1
United Kingdom,London,3,62.0,88.0,77.0,73.0,2.536667,0,19,8,4,0,1,1,1,0,4,4
Macau,Macau,4,0.0,89.0,12.0,4.0,2.190476,2,10,6,1,0,1,0,0,0,17,0
Singapore,Singapore,5,11.0,80.0,80.0,60.0,2.818182,3,7,16,0,1,3,0,0,0,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ecuador,Quito,136,14.0,98.0,37.0,3.0,2.190789,2,13,18,1,0,0,0,0,0,15,0
China,Tianjin,137,1.0,56.0,5.0,2.0,2.125000,1,10,13,0,1,1,0,0,0,4,0
China,Qingdao,138,0.0,36.0,2.0,0.0,2.052632,0,2,6,0,2,1,0,0,0,2,1
United States,Philadelphia,139,26.0,90.0,78.0,22.0,2.444444,1,18,17,0,1,3,1,0,0,2,4


In [59]:
cosine_city = pd.DataFrame(cosine_similarity(city))

cosine_city

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,128,129,130,131,132,133,134,135,136,137
0,1.000000,0.937156,0.990289,0.619386,0.922609,0.923573,0.892307,0.993382,0.885670,0.795087,...,0.566683,0.873375,0.649685,0.922077,0.655939,0.725983,0.598632,0.564575,0.864500,0.709995
1,0.937156,1.000000,0.965247,0.780201,0.903792,0.906812,0.911072,0.917226,0.988012,0.926659,...,0.732571,0.981234,0.816743,0.979259,0.816168,0.888319,0.756808,0.728076,0.951479,0.850415
2,0.990289,0.965247,1.000000,0.664364,0.935348,0.937784,0.892121,0.983640,0.927751,0.845507,...,0.612802,0.913192,0.694090,0.952390,0.696821,0.781281,0.649877,0.609575,0.914035,0.746902
3,0.619386,0.780201,0.664364,1.000000,0.714834,0.764579,0.786530,0.555930,0.798232,0.923208,...,0.988715,0.762228,0.961216,0.693601,0.991471,0.961377,0.977588,0.978619,0.801978,0.978743
4,0.922609,0.903792,0.935348,0.714834,1.000000,0.987728,0.777578,0.928143,0.881652,0.889080,...,0.649477,0.864000,0.664652,0.906378,0.725442,0.817387,0.696157,0.663327,0.942281,0.769738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,0.725983,0.888319,0.781281,0.961377,0.817387,0.837712,0.822741,0.684009,0.915393,0.981122,...,0.938196,0.888806,0.948266,0.846047,0.963405,1.000000,0.954304,0.939022,0.921275,0.972872
134,0.598632,0.756808,0.649877,0.977588,0.696157,0.741233,0.790744,0.538674,0.782286,0.893100,...,0.986593,0.733188,0.964290,0.689065,0.956687,0.954304,1.000000,0.988544,0.795840,0.961623
135,0.564575,0.728076,0.609575,0.978619,0.663327,0.703279,0.775787,0.501178,0.759576,0.876879,...,0.991330,0.703082,0.966554,0.654243,0.968198,0.939022,0.988544,1.000000,0.763202,0.968942
136,0.864500,0.951479,0.914035,0.801978,0.942281,0.930930,0.802283,0.857842,0.968372,0.963051,...,0.755329,0.942794,0.791928,0.955666,0.818327,0.921275,0.795840,0.763202,1.000000,0.857581


In [60]:
top_10_city = find_similar_n(cosine_city, 10)
top_10_city

Unnamed: 0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
0,0,7,13,100,2,33,70,51,104,16
1,1,34,15,69,26,8,54,45,61,106
2,2,100,33,0,104,70,13,51,7,103
3,3,132,37,66,116,128,11,31,20,56
4,4,16,5,95,102,67,82,98,18,71
...,...,...,...,...,...,...,...,...,...,...
133,133,29,38,90,105,124,118,108,59,58
134,134,120,12,11,20,135,125,83,128,117
135,135,125,20,12,120,11,128,92,31,134
136,136,71,18,67,102,101,72,60,115,85


### All user and city matrix completed

In [61]:
def select_favorite():
    for user in user_response.iloc[k]:
        one =  city.iloc[city.index.get_level_values('city') == user_response.iloc[k]['favorite_city_one']]
        two =  city.iloc[city.index.get_level_values('city') == user_response.iloc[k]['favorite_city_two']]
        three =  city.iloc[city.index.get_level_values('city') == user_response.iloc[k]['favorite_city_three']]
        top = pd.concat([one, two, three])
        top = top.stack().to_frame().reset_index(drop=True).T

In [62]:
test = user_response.iloc[0:3].copy()
test

Unnamed: 0,nationality,age,gender,favorite_city_one,favorite_city_two,favorite_city_three,favorite_city_four,favorite_city_five,food_one,food_two,...,movie_theater,shop,park,natural_feature,place_of_worship,one,two,three,four,five
0,12,26-40,Female,Krabi,Rome,Budapest,Zx,Zx,Always,Always,...,2,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),68,111,20,138,138
1,12,15-25,Female,Cancun,Las Vegas,Los Angeles,New York City,London,Sometimes,Often,...,3,3,3,3,1 ( Would NOT go),23,72,76,93,75
2,12,15-25,Female,Sydney,London,Dublin,Prague,Rome,Sometimes,Often,...,1 ( Would NOT go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),3,122,75,41,103,111


In [63]:
user_response.head()

Unnamed: 0,nationality,age,gender,favorite_city_one,favorite_city_two,favorite_city_three,favorite_city_four,favorite_city_five,food_one,food_two,...,movie_theater,shop,park,natural_feature,place_of_worship,one,two,three,four,five
0,12,26-40,Female,Krabi,Rome,Budapest,Zx,Zx,Always,Always,...,2,4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),68,111,20,138,138
1,12,15-25,Female,Cancun,Las Vegas,Los Angeles,New York City,London,Sometimes,Often,...,3,3,3,3,1 ( Would NOT go),23,72,76,93,75
2,12,15-25,Female,Sydney,London,Dublin,Prague,Rome,Sometimes,Often,...,1 ( Would NOT go),4 (Definitely would go),4 (Definitely would go),4 (Definitely would go),3,122,75,41,103,111
3,12,26-40,Female,Rome,London,Florence,Mexico City,Munich,Sometimes,Often,...,3,3,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),111,75,45,84,92
4,12,26-40,Female,Denpasar,Shanghai,Kyoto,New York City,Vancouver,Often,Often,...,3,2,4 (Definitely would go),4 (Definitely would go),1 ( Would NOT go),39,117,71,93,129


In [64]:

def build_top_city(city_matrix, user_matrix):
    df = pd.DataFrame([])
    k = 0
    while k < len(user_matrix):
        one =  city_matrix.iloc[city_matrix.index.get_level_values('city') == user_matrix.iloc[k]['favorite_city_one']]
        one.reset_index(inplace=True)
        one = one.add_suffix('_1')
        two =  city_matrix.iloc[city_matrix.index.get_level_values('city') == user_matrix.iloc[k]['favorite_city_two']]
        two.reset_index(inplace=True)
        two = two.add_suffix('_2')
        three =  city_matrix.iloc[city_matrix.index.get_level_values('city') == user_matrix.iloc[k]['favorite_city_three']]
        three.reset_index(inplace=True)
        three = three.add_suffix('_3')
        top = pd.concat([one, two, three])
        top.drop(columns=['country_1', 'city_1', 'country_2', 'city_2', 'country_3', 'city_3'], inplace=True)
        top = top.stack().to_frame().T
        top.columns = top.columns.droplevel()
        df = df.append(top)
        df.reset_index(drop=True, inplace=True)

        k += 1
        
    return df
    
    
def build_design_matrix(dummy_response, top_cities):
    finished = pd.concat([dummy_response, top_cities], axis=1)
    return finished

def find_top_city(city_matrix, user_matrix):
    k = 0
    i = 3
    j = '1'
    df = pd.DataFrame([])
    for k in range(len(user_matrix)):
        while i < 6:
            one =  city_matrix.iloc[city_matrix.index.get_level_values('city') == user_matrix.iloc[k][i]]
            one.reset_index(inplace=True)  
            one = one.add_suffix('_'+j)
            i += 1
        
    return one


    
    


In [65]:
top_3 = build_top_city(city, user_response)
top_3

Unnamed: 0,id_1,1_1,2_1,3_1,4_1,avg_price_1,amusement_park_1,museum_1,park_1,art_gallery_1,...,museum_3,park_3,art_gallery_3,aquarium_3,zoo_3,library_3,movie_theater_3,natural_feature_3,place_of_worship_3,store_3
0,99.0,64.0,27.0,1.0,0.0,1.315217,1.0,3.0,8.0,0.0,...,10.0,12.0,1.0,1.0,1.0,0.0,0.0,0.0,6.0,2.0
1,140.0,14.0,108.0,22.0,8.0,2.157895,7.0,3.0,6.0,0.0,...,17.0,11.0,3.0,0.0,1.0,1.0,0.0,0.0,6.0,2.0
2,55.0,26.0,17.0,67.0,29.0,2.712230,1.0,8.0,15.0,2.0,...,19.0,5.0,3.0,0.0,1.0,1.0,0.0,0.0,8.0,2.0
3,16.0,69.0,94.0,70.0,24.0,2.190661,0.0,17.0,7.0,2.0,...,20.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,27.0,3.0
4,32.0,8.0,112.0,9.0,0.0,2.007752,3.0,5.0,8.0,0.0,...,5.0,7.0,0.0,1.0,2.0,0.0,0.0,0.0,29.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,73.0,18.0,91.0,79.0,19.0,2.478261,2.0,10.0,14.0,5.0,...,8.0,15.0,2.0,1.0,2.0,0.0,0.0,1.0,2.0,1.0
149,126.0,65.0,84.0,85.0,30.0,2.303030,1.0,16.0,18.0,0.0,...,16.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,0.0
150,41.0,67.0,96.0,71.0,18.0,2.158730,0.0,23.0,8.0,3.0,...,15.0,9.0,0.0,1.0,0.0,0.0,0.0,0.0,20.0,2.0
151,51.0,79.0,91.0,22.0,9.0,1.805970,0.0,20.0,4.0,1.0,...,7.0,15.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,7.0


In [66]:
dummy

Unnamed: 0,nationality,food_one,food_two,food_three,food_four,art_gallery,library,museum,aquarium,amusement_park,...,two,three,four,five,age_15-25,age_26-40,age_41-55,age_55+,gender_Female,gender_Male
0,12,4,4,1,0,2,2,2,1,1,...,111,20,138,138,0,1,0,0,1,0
1,12,2,3,2,2,1,1,2,2,3,...,72,76,93,75,1,0,0,0,1,0
2,12,2,3,2,1,3,2,3,4,4,...,75,41,103,111,1,0,0,0,1,0
3,12,2,3,2,2,4,3,4,3,2,...,75,45,84,92,0,1,0,0,1,0
4,12,3,3,1,1,2,3,3,1,2,...,117,71,93,129,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2,2,3,1,1,2,4,3,3,3,...,129,122,138,138,1,0,0,0,0,1
149,12,1,3,2,1,4,3,4,4,4,...,93,22,57,113,0,1,0,0,0,1
150,12,2,3,1,1,3,2,3,2,1,...,76,60,138,138,0,1,0,0,0,1
151,12,3,4,2,1,3,2,3,2,2,...,85,72,138,138,0,1,0,0,0,1


In [67]:
matrix = build_design_matrix(dummy, top_3)
matrix

Unnamed: 0,nationality,food_one,food_two,food_three,food_four,art_gallery,library,museum,aquarium,amusement_park,...,museum_3,park_3,art_gallery_3,aquarium_3,zoo_3,library_3,movie_theater_3,natural_feature_3,place_of_worship_3,store_3
0,12,4,4,1,0,2,2,2,1,1,...,10.0,12.0,1.0,1.0,1.0,0.0,0.0,0.0,6.0,2.0
1,12,2,3,2,2,1,1,2,2,3,...,17.0,11.0,3.0,0.0,1.0,1.0,0.0,0.0,6.0,2.0
2,12,2,3,2,1,3,2,3,4,4,...,19.0,5.0,3.0,0.0,1.0,1.0,0.0,0.0,8.0,2.0
3,12,2,3,2,2,4,3,4,3,2,...,20.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,27.0,3.0
4,12,3,3,1,1,2,3,3,1,2,...,5.0,7.0,0.0,1.0,2.0,0.0,0.0,0.0,29.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2,2,3,1,1,2,4,3,3,3,...,8.0,15.0,2.0,1.0,2.0,0.0,0.0,1.0,2.0,1.0
149,12,1,3,2,1,4,3,4,4,4,...,16.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,0.0
150,12,2,3,1,1,3,2,3,2,1,...,15.0,9.0,0.0,1.0,0.0,0.0,0.0,0.0,20.0,2.0
151,12,3,4,2,1,3,2,3,2,2,...,7.0,15.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,7.0


In [68]:
matrix.iloc[:, 485:]

0
1
2
3
4
...
148
149
150
151
152


In [69]:
matrix.isnull().any()

nationality           False
food_one              False
food_two              False
food_three            False
food_four             False
                      ...  
library_3              True
movie_theater_3        True
natural_feature_3      True
place_of_worship_3     True
store_3                True
Length: 78, dtype: bool

In [70]:
matrix[matrix['shopping_mall_3'].isnull()]

KeyError: 'shopping_mall_3'

In [None]:
matrix.drop([9, 17, 25, 33, 40, 43, 45, 59, 82, 140], inplace=True)

In [None]:
# Need to remove country/city in the top three matrix before appending.
cos = cosine_similarity(matrix)
cos = pd.DataFrame(cos)

In [None]:
#This will create a df of any null values
nan_rows = matrix[matrix.isnull().T.any().T]

In [None]:
nan_rows

In [None]:
cos

In [None]:
bla = find_similar_n(cos, 10)
bla

### Design matrix is complete.

Each row is a user, with the survey response and their top 3 countries info (food, price, attractions)

In [None]:
matrix.sample(3)

In [None]:
y = matrix.id_3

In [None]:
full_df = matrix.copy()

In [None]:
df = matrix.copy()

In [None]:
dropthat = df.iloc[:, 487:].columns

In [None]:
df.drop(columns=dropthat, inplace=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

In [None]:
dropthat

In [None]:
from scipy import linalg
U, s, Vh = linalg.svd(full_df)

In [None]:
full_df

### KNN is not effective really low prediction score.

Best k = 1. 
Best k = 7? 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
k = 4
#Train Model and Predict  
neigh = KNeighborsClassifier(n_neighbors = k).fit(df,y)
neigh

In [None]:
yhat = neigh.predict(X_test)
yhat[0:5]

In [None]:
from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

In [None]:
y_train

In [None]:
# write your code here
k2 = 6
new = KNeighborsClassifier(n_neighbors=k2)
new.fit(X_train, y_train)

yhat2 = neigh.predict(X_test)

from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, new.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat2))

In [None]:
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

mean_acc

In [None]:
import matplotlib.pyplot as plt
plt.plot(range(1,Ks),mean_acc,'g')
plt.fill_between(range(1,Ks),mean_acc - 1 * std_acc,mean_acc + 1 * std_acc, alpha=0.10)
plt.legend(('Accuracy ', '+/- 3xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Nabors (K)')
plt.tight_layout()
plt.show()

In [None]:
print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 

### Creating userxcity matrix with dummy values

In [71]:
user = dummy

In [72]:
user

Unnamed: 0,nationality,food_one,food_two,food_three,food_four,art_gallery,library,museum,aquarium,amusement_park,...,two,three,four,five,age_15-25,age_26-40,age_41-55,age_55+,gender_Female,gender_Male
0,12,4,4,1,0,2,2,2,1,1,...,111,20,138,138,0,1,0,0,1,0
1,12,2,3,2,2,1,1,2,2,3,...,72,76,93,75,1,0,0,0,1,0
2,12,2,3,2,1,3,2,3,4,4,...,75,41,103,111,1,0,0,0,1,0
3,12,2,3,2,2,4,3,4,3,2,...,75,45,84,92,0,1,0,0,1,0
4,12,3,3,1,1,2,3,3,1,2,...,117,71,93,129,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2,2,3,1,1,2,4,3,3,3,...,129,122,138,138,1,0,0,0,0,1
149,12,1,3,2,1,4,3,4,4,4,...,93,22,57,113,0,1,0,0,0,1
150,12,2,3,1,1,3,2,3,2,1,...,76,60,138,138,0,1,0,0,0,1
151,12,3,4,2,1,3,2,3,2,2,...,85,72,138,138,0,1,0,0,0,1


In [73]:
city

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1,2,3,4,avg_price,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
country,city,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,2.600000,4,7,5,1,0,0,0,0,0,4,0
Thailand,Bangkok,2,62.0,100.0,70.0,39.0,2.317343,1,13,10,2,1,1,0,0,0,18,1
United Kingdom,London,3,62.0,88.0,77.0,73.0,2.536667,0,19,8,4,0,1,1,1,0,4,4
Macau,Macau,4,0.0,89.0,12.0,4.0,2.190476,2,10,6,1,0,1,0,0,0,17,0
Singapore,Singapore,5,11.0,80.0,80.0,60.0,2.818182,3,7,16,0,1,3,0,0,0,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ecuador,Quito,136,14.0,98.0,37.0,3.0,2.190789,2,13,18,1,0,0,0,0,0,15,0
China,Tianjin,137,1.0,56.0,5.0,2.0,2.125000,1,10,13,0,1,1,0,0,0,4,0
China,Qingdao,138,0.0,36.0,2.0,0.0,2.052632,0,2,6,0,2,1,0,0,0,2,1
United States,Philadelphia,139,26.0,90.0,78.0,22.0,2.444444,1,18,17,0,1,3,1,0,0,2,4


In [74]:
city_ = city.drop(columns=['avg_price'])

In [75]:
user

Unnamed: 0,nationality,food_one,food_two,food_three,food_four,art_gallery,library,museum,aquarium,amusement_park,...,two,three,four,five,age_15-25,age_26-40,age_41-55,age_55+,gender_Female,gender_Male
0,12,4,4,1,0,2,2,2,1,1,...,111,20,138,138,0,1,0,0,1,0
1,12,2,3,2,2,1,1,2,2,3,...,72,76,93,75,1,0,0,0,1,0
2,12,2,3,2,1,3,2,3,4,4,...,75,41,103,111,1,0,0,0,1,0
3,12,2,3,2,2,4,3,4,3,2,...,75,45,84,92,0,1,0,0,1,0
4,12,3,3,1,1,2,3,3,1,2,...,117,71,93,129,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,2,2,3,1,1,2,4,3,3,3,...,129,122,138,138,1,0,0,0,0,1
149,12,1,3,2,1,4,3,4,4,4,...,93,22,57,113,0,1,0,0,0,1
150,12,2,3,1,1,3,2,3,2,1,...,76,60,138,138,0,1,0,0,0,1
151,12,3,4,2,1,3,2,3,2,2,...,85,72,138,138,0,1,0,0,0,1


In [76]:
city_.reset_index(inplace=True)

In [77]:
city_[city_['city'] == 'Cancun']

Unnamed: 0,country,city,id,1,2,3,4,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store
137,Mexico,Cancun,140,14.0,108.0,22.0,8.0,7,3,6,0,2,0,0,0,0,8,0


In [78]:
city_ = city_.drop(city_[city_.city == 'Cancún'].index)

In [79]:
city_['label_id'] = le.transform(city_['city'])



In [81]:
city_

Unnamed: 0,country,city,id,1,2,3,4,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store,label_id
0,Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,4,7,5,1,0,0,0,0,0,4,0,56
1,Thailand,Bangkok,2,62.0,100.0,70.0,39.0,1,13,10,2,1,1,0,0,0,18,1,10
2,United Kingdom,London,3,62.0,88.0,77.0,73.0,0,19,8,4,0,1,1,1,0,4,4,75
3,Macau,Macau,4,0.0,89.0,12.0,4.0,2,10,6,1,0,1,0,0,0,17,0,77
4,Singapore,Singapore,5,11.0,80.0,80.0,60.0,3,7,16,0,1,3,0,0,0,4,2,119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,Ecuador,Quito,136,14.0,98.0,37.0,3.0,2,13,18,1,0,0,0,0,0,15,0,107
134,China,Tianjin,137,1.0,56.0,5.0,2.0,1,10,13,0,1,1,0,0,0,4,0,126
135,China,Qingdao,138,0.0,36.0,2.0,0.0,0,2,6,0,2,1,0,0,0,2,1,106
136,United States,Philadelphia,139,26.0,90.0,78.0,22.0,1,18,17,0,1,3,1,0,0,2,4,100


In [82]:
user.reset_index(drop=False, inplace=True)


In [83]:
city_['key'] = 1
user['key'] = 1



In [84]:
user

Unnamed: 0,index,nationality,food_one,food_two,food_three,food_four,art_gallery,library,museum,aquarium,...,three,four,five,age_15-25,age_26-40,age_41-55,age_55+,gender_Female,gender_Male,key
0,0,12,4,4,1,0,2,2,2,1,...,20,138,138,0,1,0,0,1,0,1
1,1,12,2,3,2,2,1,1,2,2,...,76,93,75,1,0,0,0,1,0,1
2,2,12,2,3,2,1,3,2,3,4,...,41,103,111,1,0,0,0,1,0,1
3,3,12,2,3,2,2,4,3,4,3,...,45,84,92,0,1,0,0,1,0,1
4,4,12,3,3,1,1,2,3,3,1,...,71,93,129,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,148,2,2,3,1,1,2,4,3,3,...,122,138,138,1,0,0,0,0,1,1
149,149,12,1,3,2,1,4,3,4,4,...,22,57,113,0,1,0,0,0,1,1
150,150,12,2,3,1,1,3,2,3,2,...,60,138,138,0,1,0,0,0,1,1
151,151,12,3,4,2,1,3,2,3,2,...,72,138,138,0,1,0,0,0,1,1


In [87]:
city_

Unnamed: 0,country,city,id,1,2,3,4,amusement_park,museum,park,art_gallery,aquarium,zoo,library,movie_theater,natural_feature,place_of_worship,store,label_id,key
0,Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,4,7,5,1,0,0,0,0,0,4,0,56,1
1,Thailand,Bangkok,2,62.0,100.0,70.0,39.0,1,13,10,2,1,1,0,0,0,18,1,10,1
2,United Kingdom,London,3,62.0,88.0,77.0,73.0,0,19,8,4,0,1,1,1,0,4,4,75,1
3,Macau,Macau,4,0.0,89.0,12.0,4.0,2,10,6,1,0,1,0,0,0,17,0,77,1
4,Singapore,Singapore,5,11.0,80.0,80.0,60.0,3,7,16,0,1,3,0,0,0,4,2,119,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,Ecuador,Quito,136,14.0,98.0,37.0,3.0,2,13,18,1,0,0,0,0,0,15,0,107,1
134,China,Tianjin,137,1.0,56.0,5.0,2.0,1,10,13,0,1,1,0,0,0,4,0,126,1
135,China,Qingdao,138,0.0,36.0,2.0,0.0,0,2,6,0,2,1,0,0,0,2,1,106,1
136,United States,Philadelphia,139,26.0,90.0,78.0,22.0,1,18,17,0,1,3,1,0,0,2,4,100,1


In [85]:
df3 = pd.merge(city_,user,on='key').drop('key',axis=1)


In [92]:
df3

Unnamed: 0,country,city,id,1,2,3,4,amusement_park_x,museum_x,park_x,...,two,three,four,five,age_15-25,age_26-40,age_41-55,age_55+,gender_Female,gender_Male
0,Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,4,7,5,...,111,20,138,138,0,1,0,0,1,0
1,Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,4,7,5,...,72,76,93,75,1,0,0,0,1,0
2,Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,4,7,5,...,75,41,103,111,1,0,0,0,1,0
3,Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,4,7,5,...,75,45,84,92,0,1,0,0,1,0
4,Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,4,7,5,...,117,71,93,129,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21109,Mexico,Cancun,140,14.0,108.0,22.0,8.0,7,3,6,...,129,122,138,138,1,0,0,0,0,1
21110,Mexico,Cancun,140,14.0,108.0,22.0,8.0,7,3,6,...,93,22,57,113,0,1,0,0,0,1
21111,Mexico,Cancun,140,14.0,108.0,22.0,8.0,7,3,6,...,76,60,138,138,0,1,0,0,0,1
21112,Mexico,Cancun,140,14.0,108.0,22.0,8.0,7,3,6,...,85,72,138,138,0,1,0,0,0,1


In [91]:

df3.sort_values('index')

Unnamed: 0,country,city,id,1,2,3,4,amusement_park_x,museum_x,park_x,...,two,three,four,five,age_15-25,age_26-40,age_41-55,age_55+,gender_Female,gender_Male
0,Hong Kong,Hong Kong,1,62.0,82.0,70.0,86.0,4,7,5,...,111,20,138,138,0,1,0,0,1,0
15147,Dominican Republic,Punta Cana,102,12.0,91.0,11.0,2.0,7,2,2,...,111,20,138,138,0,1,0,0,1,0
14994,Mexico,Mexico City,101,16.0,60.0,60.0,33.0,2,28,8,...,111,20,138,138,0,1,0,0,1,0
14841,India,Bangalore,100,62.0,80.0,85.0,45.0,4,8,12,...,111,20,138,138,0,1,0,0,1,0
14688,Thailand,Krabi,99,64.0,27.0,1.0,0.0,1,3,8,...,111,20,138,138,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13769,France,Nice,92,14.0,107.0,18.0,2.0,0,11,15,...,60,97,11,113,0,1,0,0,0,1
13616,Indonesia,Batam,91,31.0,98.0,5.0,0.0,3,3,11,...,60,97,11,113,0,1,0,0,0,1
13463,Vietnam,Da Nang,90,54.0,78.0,8.0,0.0,1,7,5,...,60,97,11,113,0,1,0,0,0,1
14381,Portugal,Porto,96,85.0,86.0,25.0,9.0,0,10,6,...,60,97,11,113,0,1,0,0,0,1


In [None]:
user

In [93]:
df3.drop(columns=['country', 'id'], inplace=True)

In [94]:
df3.set_index(['city', 'label_id', 'index'], inplace=True)

In [95]:
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1,2,3,4,amusement_park_x,museum_x,park_x,art_gallery_x,aquarium_x,zoo_x,...,two,three,four,five,age_15-25,age_26-40,age_41-55,age_55+,gender_Female,gender_Male
city,label_id,index,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Hong Kong,56,0,62.0,82.0,70.0,86.0,4,7,5,1,0,0,...,111,20,138,138,0,1,0,0,1,0
Hong Kong,56,1,62.0,82.0,70.0,86.0,4,7,5,1,0,0,...,72,76,93,75,1,0,0,0,1,0
Hong Kong,56,2,62.0,82.0,70.0,86.0,4,7,5,1,0,0,...,75,41,103,111,1,0,0,0,1,0
Hong Kong,56,3,62.0,82.0,70.0,86.0,4,7,5,1,0,0,...,75,45,84,92,0,1,0,0,1,0
Hong Kong,56,4,62.0,82.0,70.0,86.0,4,7,5,1,0,0,...,117,71,93,129,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cancun,23,148,14.0,108.0,22.0,8.0,7,3,6,0,2,0,...,129,122,138,138,1,0,0,0,0,1
Cancun,23,149,14.0,108.0,22.0,8.0,7,3,6,0,2,0,...,93,22,57,113,0,1,0,0,0,1
Cancun,23,150,14.0,108.0,22.0,8.0,7,3,6,0,2,0,...,76,60,138,138,0,1,0,0,0,1
Cancun,23,151,14.0,108.0,22.0,8.0,7,3,6,0,2,0,...,85,72,138,138,0,1,0,0,0,1


In [None]:
for c in df3.columns:
    print(c)

In [None]:
df3.iloc[:, 16:25]

In [None]:
df3.reset_index(level=1, drop=False, inplace=True)

In [None]:
def add_rank(x):
    
    if x.one == x.label_id:
        return 1
    if x.two == x.label_id:
        return 1
    if x.three == x.label_id:
        return 1
    if x.four == x.label_id:
        return 1
    if x.five == x.label_id:
        return 1
    else:
        return 0
    
    
    

In [None]:
df3['rank'] = df3.apply(add_rank,axis=1)


In [None]:
df3.reset_index(drop=False, inplace=True)

df3.rename(columns={'index': 'user'}, inplace=True)

In [None]:
df3

In [None]:
df3.columns

In [None]:
df3.sort_values(['user', 'city'], inplace=True)


In [None]:
df3

In [None]:
df3.set_index(['user', 'city', 'label_id'], inplace=True)

In [None]:
df3

In [None]:
df3.to_csv(r'/Users/tristannisbet/Documents/SM/Dataframe/new/user_city_rank.csv')

In [None]:
test = df3[df3['rank'] ==1].copy()

In [None]:
test.iloc[:, 16:]

In [None]:
test