In [None]:
import pandas as pd
import numpy as np
import sqlite3
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing as pp
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def get_df(table_name, db_path='/Users/tristannisbet/Documents/travel_app/places.db'):
    """Load every row of one SQLite table into a DataFrame.

    Parameters
    ----------
    table_name : str
        Name of the table to read. It is interpolated directly into the SQL
        text (table names cannot be bound as parameters), so only pass
        trusted, hard-coded names.
    db_path : str, optional
        Path of the SQLite database file. Defaults to the path this notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        The result of ``select * from <table_name>``.

    Raises
    ------
    sqlite3.Error
        If the database cannot be opened. The error is no longer printed and
        swallowed, because continuing without a connection only produced a
        confusing unbound-variable error on the next line.
    """
    sql = """select * from {}""".format(table_name)

    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(sql, conn)
    finally:
        # Always release the database handle, even when the query fails.
        conn.close()

In [None]:
# Read the four numbered restaurant tables and the main `restaurants` table,
# one DataFrame per table, in the same order as before.
one, two, three, four, top_rest = (
    get_df(table)
    for table in (
        'restaurants_one',
        'restaurants_two',
        'restaurants_three',
        'restaurants_four',
        'restaurants',
    )
)


In [None]:
# Stack the five restaurant frames row-wise into one table. No ignore_index is
# passed, so each frame keeps its own row labels and labels can repeat.
all_price = pd.concat([one, two, three, four, top_rest], axis =0)
all_price.info()

In [None]:
# Convert the id column to a numeric dtype.
all_price['id'] = pd.to_numeric(all_price.id)

In [None]:
# Treat a missing price level as 0. The result is assigned back to the frame:
# calling fillna(inplace=True) on the attribute-accessed column works on an
# intermediate Series, which pandas 2.x warns about and which leaves
# `all_price` untouched once copy-on-write is enabled.
all_price['price_level'] = all_price['price_level'].fillna(0)

In [None]:
all_price.price_level.isnull().any()

In [None]:
all_price.isnull().any()

In [None]:
def to_city(df):
    """Summarise restaurants per city as counts by price level.

    Counts the `name` entries for every (country, city, id, price_level)
    combination, spreads the price levels out into columns, and appends an
    `avg_price` column holding the mean `price_level` of each city.

    Returns a DataFrame indexed by (country, city, id) with one column per
    price level found in `df` plus `avg_price`.
    """
    city_keys = ['country', 'city', 'id']

    # Number of named restaurants in each city at each price level.
    counts = df.groupby(city_keys + ['price_level'])['name'].count()
    counts_frame = counts.to_frame()

    # One row per city, one column per price level.
    per_city = counts_frame.pivot_table(
        index=list(city_keys),
        columns='price_level',
        values='name',
        aggfunc='first',
    )

    # Mean price level per city, aligned on the (country, city, id) index.
    mean_price = df.groupby(city_keys)['price_level'].mean()
    per_city['avg_price'] = mean_price

    return per_city

In [None]:
food = to_city(all_price)
food

In [None]:
# Turn the (country, city, id) index back into ordinary columns and discard
# avg_price, leaving only the per-price-level counts.
# NOTE: both calls mutate `food` in place; running this cell a second time
# fails because avg_price has already been dropped.
food.reset_index(inplace=True)
food.drop(columns = ['avg_price'], inplace=True)
food

In [None]:
# Encode each city name as an integer label; `le` keeps the mapping, so labels
# can be turned back into names with le.inverse_transform.
le = pp.LabelEncoder()
food['label_id'] = le.fit_transform(food.city)
food

In [None]:
#Reverse encoder
#list(le.inverse_transform(food.label_id))

In [None]:
# Keep the city label and the counts for price levels 1-4 only.
# NOTE(review): a 0.0 column (rows whose missing price level was filled with 0)
# would be left out here — presumably intentional; confirm.
food_new = food[['label_id', 1.0, 2.0, 3.0, 4.0]].copy()
food_new

In [None]:
food_new.sort_values('label_id', inplace=True)
food_new.set_index('label_id', inplace=True)

In [None]:
food_new.fillna(0, inplace=True)

In [None]:
food_new

In [None]:
# Scale every city row to unit length (pp.normalize's default norm is L2).
# Wrapping the array in a new DataFrame replaces the label_id index with
# plain 0..n-1 row positions.
# NOTE(review): later cells treat a row position as a label_id; that only holds
# if label_id values are exactly 0..n-1 without repeats (unique city names) — confirm.
normalized = pp.normalize(food_new)
normalized_city = pd.DataFrame(normalized)
normalized_city

### Survey / user food data

In [None]:
# Load the food part of the survey; the first CSV column becomes the index.
# NOTE: absolute, machine-specific path — the notebook will not run elsewhere as is.
survey_food = pd.read_csv('/Users/tristannisbet/Documents/SM/survey_food_only.csv', index_col=0)
survey_food

In [None]:
# Map the frequency words to an ordinal 0-4 scale. Cells holding any other
# value are left unchanged by DataFrame.replace.
replace_map = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3, 'Always': 4}

df_food_replace = survey_food.replace(replace_map)

In [None]:
# Remove the nationality, age and gender columns (in place) so only the food
# answers remain.
df_food_replace.drop(columns=['nationality', 'age', 'gender'], inplace=True)

In [None]:
# Scale every respondent's answer row to unit length; the new DataFrame is
# indexed by row position, not by the survey's original index.
normalized_user = pd.DataFrame(pp.normalize(df_food_replace))
normalized_user

In [None]:
normalized_city

In [None]:
# Food similarity matrix: one row per respondent, one column per city row of
# normalized_city, both identified by position.
cosine_sim = pd.DataFrame(cosine_similarity(normalized_user, normalized_city))
cosine_sim

In [None]:
def find_similar_n(df,n):
    """Return, for every row, the column labels of its ``n`` largest values.

    Parameters
    ----------
    df : pandas.DataFrame
        Score matrix; each row is ranked independently.
    n : int
        How many labels to keep per row. Must not exceed the number of
        columns, otherwise building the per-row result raises ValueError.

    Returns
    -------
    pandas.DataFrame
        Same index as `df`, with columns ``top1`` .. ``top<n>`` holding the
        column labels ordered from highest to lowest score.
    """
    labels = ['top{}'.format(i) for i in range(1, n+1)]

    def _top_labels(row):
        # Sort one row high-to-low and keep the labels of the first n entries.
        best = row.sort_values(ascending=False).iloc[:n].index
        return pd.Series(best, index=labels)

    # The previous version also computed an np.argsort of the whole matrix and
    # never used it; that dead work is removed.
    return df.apply(_top_labels, axis=1)

In [None]:
top_10_city = find_similar_n(cosine_sim,10)
top_10_city

In [None]:
user_1 = top_10_city.iloc[0, 0:5].values.tolist()
user_1

In [None]:
# Print the city name(s) behind each of the first user's top-5 entries.
# The entries are column positions of cosine_sim, matched here against
# food.label_id.
for city in user_1:
    city2 = food[food.label_id == city]
    print(city2.city)

# Attraction Similarity

In [None]:
# Load the per-city attraction table; the first CSV column becomes the index.
# NOTE: absolute, machine-specific path.
city_attraction = pd.read_csv('/Users/tristannisbet/Documents/SM/city_attraction_only.csv', index_col=0)
city_attraction

In [None]:
# Drop Guilin because there were no restaurants found.
# The row is selected by id == 80. NOTE(review): presumably 80 is Guilin's id — confirm.
city_attraction.reset_index(inplace=True)
city_attraction = city_attraction.drop(city_attraction[city_attraction.id == 80].index)
# Display only: set_index returns a new frame here, city_attraction is unchanged.
city_attraction.set_index('country')

In [None]:
# Make country the index in place; the sorted frame on the next line is only
# displayed, not stored.
city_attraction.set_index('country', inplace=True)
city_attraction.sort_values('country')

In [None]:
food

In [None]:
# Fit a fresh encoder on the attraction table's city names. This rebinds `le`,
# so the encoder fitted on `food` is no longer reachable under that name.
# NOTE(review): these labels agree with food.label_id only if both tables hold
# exactly the same set of city names — confirm.
le = pp.LabelEncoder()
city_attraction['label_id'] = le.fit_transform(city_attraction.city)
city_attraction

In [None]:
# Sort by label, then build a copy indexed by label_id. set_index replaces the
# previous country index, and the city and id columns are dropped from the copy.
city_attraction.sort_values('label_id', inplace=True)
city_attraction_clean = city_attraction.copy()
city_attraction_clean.set_index('label_id', inplace=True)
city_attraction_clean.drop(columns=['city', 'id'], inplace=True)
city_attraction_clean

In [None]:
# Load the raw survey responses with the default 0..n-1 row index.
# NOTE: absolute, machine-specific path.
df = pd.read_csv('/Users/tristannisbet/Documents/SM/survey_responses.csv')

In [None]:
# Replace the long survey question headers with short snake_case names.
# NOTE: DataFrame.rename silently skips mapping keys that match no column, so
# every key must equal the CSV header exactly, including the trailing and
# doubled spaces present in several of them.
df.rename(columns = {'What country are you from? ': 'nationality', 'Age Range': 'age', 'Gender': 'gender',
                    '1. Choose your top favorite 3-5 cities you have traveled to that are on this list.  - Favorite City #1': 'favorite_city_one',
                    '2. Favorite city #2': 'favorite_city_two', '3. Favorite city #3': 'favorite_city_three',
                    '4. Favorite city #4': 'favorite_city_four', '5. Favorite city #5': 'favorite_city_five',
                    "6. If there's a city you have been and loved that is not on this list, add it below. ": 'extra_favorite',
                    "7. What cities on this list have you been to and not enjoyed?   - Least favorite city #1": 'least_favorite_one',
                    '8. Least favorite city #2': 'least_favorite_two', 
                    "9. If there's a city you have been to and haven't liked that is not on this list, add it below": 'extra_least_favorite',
                    "What price range of restaurant do you eat at when you travel? [Price level: 1 (Fast/Cheap Eats)]": 'food_one',
                    "What price range of restaurant do you eat at when you travel? [Price level: 2 (Casual Dining)]": 'food_two',
                    "What price range of restaurant do you eat at when you travel? [Price level: 3 (Upscale Dining)]": 'food_three',
                    "What price range of restaurant do you eat at when you travel? [Price level: 4 (Fine Dining/High End)]": 'food_four',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Art Gallery]": 'art_gallery',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Library]": 'library',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Museum ]": 'museum',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Aquarium]": 'aquarium',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Amusement Park ]": 'amusement_park',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Zoo]": 'zoo',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Movie Theater]": 'movie_theater',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Mall / Souvenir shop ]": 'store',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Park ]": 'park',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Natural Feature / Beach]": 'natural_feature',
                    "From the following types of tourist attractions, which ones are you likely to go do while traveling? [Place of Worship (Church/Temple)]": 'place_of_worship'}, inplace=True )

In [None]:
# Copy out the three respondent-description columns and the eleven
# attraction-type answers into their own frame.
user_attraction = df[['nationality', 'age', 'gender', 'amusement_park', 'museum', 'park', 'art_gallery', 'aquarium', 'zoo', 
                     'library', 'movie_theater', 'natural_feature', 'place_of_worship', 'store']].copy()
user_attraction

In [None]:
user_attraction.drop(columns=['nationality', 'age', 'gender'], inplace=True)

In [None]:
# Map the attraction answer labels to the integers 1-4. This rebinds
# `replace_map`, which earlier held the food frequency mapping.
# Cells holding any value that is not a key here are left unchanged.
replace_map = {'1 ( Would NOT go)': 1, '2': 2, '3': 3, '4 (Definitely would go)': 4}

user_attraction = user_attraction.replace(replace_map)


In [None]:
user_attraction

In [None]:
city_attraction_clean

In [None]:
normalized_user_a = pd.DataFrame(pp.normalize(user_attraction))
normalized_user_a

In [None]:
normalized_city_a = pd.DataFrame(pp.normalize(city_attraction_clean))
normalized_city_a

In [None]:
# Attraction similarity matrix: one row per respondent, one column per city
# row of normalized_city_a, both identified by position.
cosine_sim_a = pd.DataFrame(cosine_similarity(normalized_user_a, normalized_city_a))
cosine_sim_a

In [None]:
top_city_a = find_similar_n(cosine_sim_a,10)
top_city_a

### New dataframe build
One row per city × user pair,
with food-similarity, attraction-similarity, and rank columns.

In [None]:
cities = get_df('city_country')

In [None]:
# Remove the Guilin row(s) from the city list, selected by city name.
cities = cities.drop(cities[cities.city == 'Guilin'].index)


In [None]:
cities

In [None]:
# Build one "<city>_<user>" string for every (user, city) combination, with
# all cities for user 0 first, then user 1, and so on.
# NOTE(review): 153 is hard-coded — presumably the number of survey
# respondents; confirm it matches the row count of the similarity matrices.

lst = []
for i in range(153):
    for c in cities.city:

        a = c + "_" + str(i)
        lst.append(a)

In [None]:
new = pd.DataFrame(lst)
new

In [None]:
# Split each "<city>_<user>" string into two columns at its first underscore.
# NOTE: because n=1 splits on the first "_", a city name that itself contains
# an underscore would be cut short.
split = new[0].str.split("_", n = 1, expand = True)
split

In [None]:
split.rename(columns={0:'city', 1:'user'}, inplace=True)
split

In [None]:
# Attach each city's label_id (renamed to city_id) through an inner join on the
# city name; (city, user) rows whose city is absent from `food` drop out.
sim_score_rank = pd.merge(left=split, right=food[['label_id', 'city']], on='city')
sim_score_rank.rename(columns={'label_id': 'city_id'}, inplace=True)

In [None]:
sim_score_rank

In [None]:
sim_score_rank['user'] = pd.to_numeric(sim_score_rank['user'])


In [None]:
sim_score_rank.sort_values(['user', 'city_id'], inplace=True)

In [None]:
# Look up the food similarity of every (user, city) row in one indexed read.
# `user` and `city_id` are used as row/column positions of cosine_sim, exactly
# as the former per-row cosine_sim.iloc[user, city_id] loop used them.
# The lst2/lst3 accumulators of that loop were never read and are gone.
lst = cosine_sim.values[
    sim_score_rank.user.values, sim_score_rank.city_id.values
].tolist()

In [None]:
sim_score_rank['food_sim'] = lst

In [None]:
sim_score_rank


In [None]:
# Look up the attraction similarity of every (user, city) row in one indexed
# read; `user` and `city_id` are row/column positions of cosine_sim_a, as in
# the former per-row cosine_sim_a.iloc[user, city_id] loop.
lst_a = cosine_sim_a.values[
    sim_score_rank.user.values, sim_score_rank.city_id.values
].tolist()


In [None]:
sim_score_rank['attraction_sim'] = lst_a

In [None]:
# Copy out the five favourite-city answers; the row index is still the
# respondent's position in the survey file.
top_city = df[['favorite_city_one', 'favorite_city_two', 'favorite_city_three', 'favorite_city_four', 'favorite_city_five']].copy()
top_city

In [None]:
top_city.reset_index(inplace=True)
top_city.rename(columns={'index': 'user'}, inplace=True)

In [None]:
top_city

In [None]:
top_city_melt = top_city.melt(id_vars=['user'])

In [None]:
def rank_from_col(x):
    """Score one melted favourite-city row by the slot it came from.

    Parameters
    ----------
    x : row-like
        Object exposing `variable` (the favourite-city column name) and
        `value` (the city the respondent entered, possibly missing).

    Returns
    -------
    int or None
        0 when no city was entered; otherwise 5 for favorite_city_one down to
        1 for favorite_city_five. None for any other `variable`.
    """
    # Test for a missing answer first. In the old if/elif order this check sat
    # after the five column tests and so could never fire for them, and
    # `is None` did not recognise NaN.
    if pd.isna(x.value):
        return 0

    slot_rank = {
        'favorite_city_one': 5,
        'favorite_city_two': 4,
        'favorite_city_three': 3,
        'favorite_city_four': 2,
        'favorite_city_five': 1,
    }
    return slot_rank.get(x.variable)
    
    
    
    
def add_5(x):
    """Return the 0.5 bonus for a row that names a favourite city.

    Parameters
    ----------
    x : row-like
        Object exposing `variable` (the favourite-city column name) and
        `value` (the city the respondent entered, possibly missing).

    Returns
    -------
    float, int or None
        0 when no city was entered; .5 when `variable` is one of the five
        favourite-city columns. None for any other `variable`.
    """
    # Test for a missing answer first. In the old if/elif order this check was
    # unreachable for the five favourite columns, and `is None` missed NaN.
    if pd.isna(x.value):
        return 0

    favorite_columns = (
        'favorite_city_one',
        'favorite_city_two',
        'favorite_city_three',
        'favorite_city_four',
        'favorite_city_five',
    )
    if x.variable in favorite_columns:
        return .5
    return None

In [None]:
# Row-wise scoring of the melted favourites: `rank` comes from rank_from_col
# and `var` from add_5, each applied to one (user, variable, value) row.
top_city_melt['rank'] = top_city_melt.apply(rank_from_col,axis=1)


top_city_melt['var'] = top_city_melt.apply(add_5,axis=1)
top_city_melt

In [None]:
# Drop every row that has a missing value in any column (dropna is called
# without a subset), e.g. favourite slots the respondent left empty.
top_city_no_na = top_city_melt.dropna().copy()

In [None]:
top_city_no_na.rename(columns={'value':'city'}, inplace=True)

In [None]:
# Index both frames by (user, city) so they can be joined on that pair.
# NOTE: in-place; re-running this cell fails because the columns are already
# in the index.
sim_score_rank.set_index(['user', 'city'], inplace=True)
top_city_no_na.set_index(['user', 'city'], inplace=True)

In [None]:
top_city_no_na[top_city_no_na['rank'] == 5]

In [None]:
top_city_no_na.loc[[4]]

In [None]:
sim_score_rank

In [None]:
sim_score_rank['attraction_sim'].max()

In [None]:
# Left-join the favourite `rank`/`var` columns onto every (user, city) row;
# pairs with no match in top_city_no_na get NaN in both columns.
sim_score_rank = pd.merge(left=sim_score_rank, right=top_city_no_na[['rank', 'var']], left_index=True, right_index=True, how='left')


In [None]:
sim_score_rank.fillna(0, inplace=True)

In [None]:
sim_score_rank

In [None]:
sim_score_rank[sim_score_rank['var'] == .5]

In [None]:
sim_score_rank['sum'] = sim_score_rank['food_sim'] + sim_score_rank['attraction_sim']

In [None]:
sim_score_rank

In [None]:
# Order rows by user, and within each user by descending `sum`.
# One multi-key sort states that ordering directly ('user' is an index level,
# which sort_values accepts by name) instead of sorting by `sum` and then
# relying on sort_index(..., sort_remaining=False) to leave that order intact.
ordered_sum = sim_score_rank.sort_values(['user', 'sum'], ascending=[True, False])
ordered_sum

In [None]:
ordered_sum['ranking_weight'] = ordered_sum['sum'] + ordered_sum['var']

In [None]:
# Display only: rows grouped by user, highest ranking_weight first within each
# user, expressed as a single multi-key sort ('user' is an index level).
ordered_sum.sort_values(['user', 'ranking_weight'], ascending=[True, False])

In [None]:
len(ordered_sum.index.unique(level='user'))


In [None]:
ordered_sum.index[0]

In [None]:
def find_top(n):
    """Report how well each user's top-`n` recommendations hit their favourites.

    Reads the module-level `ordered_sum` frame, keeps the `n` rows with the
    largest `ranking_weight` per user, then prints the number of users with at
    least one ranked (1-5) city among them, draws a histogram of `rank`, and
    prints how many of the kept rows have a non-zero rank. Returns nothing.
    """
    ranked_values = [1.0, 3.0, 2.0, 4.0, 5.0]

    def _best_n(group):
        # The n highest-weighted rows of one user's group.
        return group.nlargest(n, 'ranking_weight')

    # droplevel(0) removes the group key that apply puts in front of the index.
    top = ordered_sum.groupby('user').apply(_best_n).droplevel(level=0)

    has_rank = top['rank'].isin(ranked_values)
    top_10_rank = top.loc[has_rank]
    user_count = len(top_10_rank.index.unique(level='user'))
    print( "number of users with a top city", user_count)

    top['rank'].hist()
    ranked_count = np.count_nonzero(top['rank'])
    print("number of ranked cities", ranked_count)
    
    

In [None]:
find_top(3)

In [None]:
np.count_nonzero(ordered_sum['rank'])

In [None]:
# For every user keep the 10 rows with the largest `sum`; droplevel(0) removes
# the group key that groupby.apply puts in front of the original index.
top_10 = ordered_sum.groupby('user').apply(lambda x: x.nlargest(10, 'sum')).droplevel(level=0)

#df.groupby('Brand').apply(lambda x: x.nlargest(2, 'Rank')).reset_index(drop=True)  


In [None]:
top_10.info()

In [None]:
np.count_nonzero(top_10['rank'])

In [None]:
# NOTE(review): `top_30` is never assigned in the visible notebook, so this
# cell raises NameError on a fresh kernel. Define it (for example the same way
# top_10 is built) or remove the cell.
top_30

In [None]:
# Count the city names that `split` and `split3` have in common (despite the
# name, `diff` holds an intersection).
# NOTE(review): `split3` is never assigned in the visible notebook, so this
# cell raises NameError on a fresh kernel unless it is defined elsewhere.
diff = set(split.city).intersection(set(split3.city))
diff = list(diff)
print(len(diff))

In [None]:
ordered_sum

In [None]:
suprise_rank = ordered_sum[['ranking_weight']]

In [None]:
suprise_rank.reset_index(inplace=True)

In [None]:
# Write the per-(user, city) ranking weights to CSV. to_csv also writes the
# frame's row index as an unnamed first column.
# NOTE: absolute, machine-specific output path.
suprise_rank.to_csv(r'/Users/tristannisbet/Documents/SM/Dataframe/new/suprise_weight.csv')