In [1]:
import pandas as pd
import numpy as np
import sqlite3
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing as pp
from sklearn.metrics.pairwise import cosine_similarity
import ast
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2


In [2]:
def get_df(table_name):
    try:
        conn = sqlite3.connect('/Users/tristannisbet/Documents/travel_app/places.db')

    except Exception as e:
        print('Error durring connection: ', str(e))
    
    sql = """select * from {}""".format(table_name)
    df = pd.read_sql_query(sql, conn)

    return df

## Food City

In [3]:
#main call to create df for all food entries and city
def cityFoodMain():
    all_price = createFoodDf()
    all_price_ = cleaningNullsCity(all_price)
    food_city = addNanRowCity(all_price_)
    final_food_city = selectColumns(food_city)
    
    
    return final_food_city, food_city




In [4]:
#1 called
def createFoodDf():
    one = get_df('restaurants_one')
    two = get_df('restaurants_two')
    three = get_df('restaurants_three')
    four = get_df('restaurants_four')
    top_rest = get_df('restaurants')
    
    all_price = pd.concat([one, two, three, four, top_rest], axis =0)
    return(all_price)


#2 called
def cleaningNullsCity(restaurants_all):
    
    restaurants_all['id'] = pd.to_numeric(restaurants_all.id)
    restaurants_all['price_level'] = restaurants_all['price_level'].fillna(restaurants_all.groupby('city')['price_level'].transform('mean'))
    restaurants_all.fillna(2.0, inplace=True)
    #do I need this?
    restaurants_all['price_level'] = restaurants_all['price_level'].astype(int)
    
    city_food = toCityLevel(restaurants_all)
    city_food.drop(columns = ['avg_price'], inplace=True)
    city_food.fillna(0, inplace=True)
    
    return city_food


#3
def toCityLevel(df):
    city_df = df.groupby(['country', 'city', 'id', 'price_level'])['name'].count().to_frame()
    price_level = city_df.pivot_table(index=['country', 'city', 'id'], columns='price_level', values='name', aggfunc='first')
    price_level['avg_price'] = df.groupby(['country', 'city', 'id'])['price_level'].mean()
    


    
    return price_level


In [5]:

#4 called
def addNanRowCity(food_df):
    food_df.reset_index(inplace=True)
    nan_row = {'country' : None, 'city': 'Zx', 'id': 200, 1.0: 0, 2.0: 0, 3.0: 0, 4.0: 0}
    food_df = food_df.append(nan_row, ignore_index=True)
    
    global food_new 
    food_new = labelEncodeCity(food_df)
    food_new = food_new.drop(food_new[food_new.id == 200].index)
    
    return food_new



# You might need to return both food and food_new. Food has city/country names

#6
def buildLabelEncoder():
    
    cities = get_df('cities')
    new_row = {'id': 200, 'city': 'Zx', 'country': 'None'}
    cities = cities.append(new_row, ignore_index=True)
    
    le = pp.LabelEncoder()
    le.fit(cities.city)
    
    return le
  
 # 5    
def labelEncodeCity(food_df):
    
    le = buildLabelEncoder()
    food_df['label_id'] = le.transform(food_df.city)
    
    return food_df

# 7
def selectColumns(food_df):
    
    food_df = food_df.drop(food_df[food_df.id == 200].index)
    food_city = food_df[['label_id', 1.0, 2.0, 3.0, 4.0]].copy()
    food_city.sort_values('label_id', inplace=True)
    food_city.set_index('label_id', inplace=True)
    
    return food_city
    





### Food User

In [6]:
# This will pull survey data from database and select only food columns
def createFoodUserDf():
    survey = get_df('survey_response')
    #survey = total.copy()
    food_user = survey[['food_one', 'food_two', 'food_three', 'food_four']]
    
    return food_user

In [7]:
# Parameters: city/user data all numeric. 

# sim_city_food is similarity matrix for all cities and food data
# sim_user_food is similarity matrix for all user and food data

# cosine_sim_food is similarit matrix for all usersXcities (153x138)
def simScore(city, user):

    normalized_city = pd.DataFrame(pp.normalize(city))    
    normalized_user = pd.DataFrame(pp.normalize(user))


    sim_city = pd.DataFrame(cosine_similarity(normalized_city))
    sim_user = pd.DataFrame(cosine_similarity(normalized_user))
    
    cosine_sim_food = pd.DataFrame(cosine_similarity(normalized_user, normalized_city))

    
    return cosine_sim_food

In [8]:
food_city2 = cityFoodMain()

In [9]:
food_city2

(price_level     1      2     3     4
 label_id                            
 0            61.0  103.0  46.0  15.0
 1             9.0  109.0  18.0   4.0
 2            10.0  114.0   6.0   2.0
 3             8.0   99.0  77.0   5.0
 4            62.0   96.0  77.0  14.0
 ...           ...    ...   ...   ...
 133          62.0   70.0  82.0  43.0
 134           0.0   66.0   0.0   0.0
 135           0.0   41.0   0.0   0.0
 136           2.0   43.0   6.0   3.0
 137          42.0   96.0  79.0  17.0
 
 [138 rows x 4 columns],
 price_level    country              city   id     1      2     3     4  \
 0            Argentina      Buenos Aires   85   6.0   95.0  68.0  31.0   
 1            Australia         Melbourne   73  18.0   91.0  79.0  19.0   
 2            Australia            Sydney   55  26.0   17.0  67.0  29.0   
 3              Austria            Vienna   37  65.0   91.0  78.0  19.0   
 4              Belgium          Brussels   60  10.0  103.0  44.0  18.0   
 ..                 ...      

In [10]:
cities = get_df('cities')

In [11]:
ok = labelEncodeCity(cities)
ok

Unnamed: 0,id,city,country,continent,label_id
0,1,Hong Kong,Hong Kong,Asia,56
1,2,Bangkok,Thailand,Asia,10
2,3,London,United Kingdom,Europe,75
3,4,Macau,Macau,Asia,77
4,5,Singapore,Singapore,Asia,119
...,...,...,...,...,...
133,135,Accra,Ghana,Africa,1
134,136,Quito,Ecuador,South America,107
135,137,Tianjin,China,Asia,126
136,138,Qingdao,China,Asia,106


In [12]:
ok.sort_values('label_id')

Unnamed: 0,id,city,country,continent,label_id
92,94,Abu Dhabi,United Arab Emirates,Asia,0
133,135,Accra,Ghana,Africa,1
25,26,Agra,India,Asia,2
128,130,Amman,Jordan,Asia,3
24,25,Amsterdam,Netherlands,Europe,4
...,...,...,...,...,...
104,106,Washington D.C.,United States,North America,133
117,119,Xi'an,China,Asia,134
121,123,Xiamen,China,Asia,135
67,68,Zhuhai,China,Asia,136


In [13]:
#Fake
#food_user_f = createFoodUserDf()
#food_user_f
#Fake
#cosine_sim_foodf = simScore(food_city2, food_user_f)
#cosine_sim_foodf

In [14]:
food_user = createFoodUserDf()
food_user

Unnamed: 0,food_one,food_two,food_three,food_four
0,4,4,1,0
1,2,3,2,2
2,2,3,2,1
3,2,3,2,2
4,3,3,1,1
...,...,...,...,...
147,2,3,1,1
148,1,3,2,1
149,2,3,1,1
150,3,4,2,1


In [15]:
cosine_sim_food = simScore(food_city2, food_user)
cosine_sim_food

ValueError: setting an array element with a sequence.

In [None]:
food_new

## Attractions City

In [None]:
# Needs to go into a function


In [16]:
place_of_worship = ['place_of_worship', 'hindu_temple', 'church', 'mosque', 'synagogue']
shopping = ['store', 'shopping_mall', 'clothing_store', 'electronics_store', 'grocery_or_supermarket', 'department_store']

attractions_to_keep = ['amusement_park', 'museum', 'park', 'art_gallery', 'aquarium',
                      'zoo', 'library', 'movie_theater', 'natural_feature'] + place_of_worship + shopping

In [17]:
# Pulls all attraction data from database. Will groupby each attraction type that I want to keep and count.
# Returns: Each city with a count for the specified attractions

def cityAttractionMain():
    attraction_df = get_df('attractions')
    attractions_split = split_types(attraction_df)
    dummy = dummies(attractions_split)
    by_city, all_attractions = attraction_count(dummy, attractions_split)
    city_group = combineAttractionTypes(by_city)
    city_attraction = labelEncodeAttraction(city_group)
    clean_city_attraction, city_attraction = cleanCityAttraction(city_attraction)
    
    return clean_city_attraction, city_attraction

def split_types(df):
    df['split_types'] = [ast.literal_eval(x) for x in df.types]
    df['split_types_str'] = [','.join(x) for x in df.split_types]
    
    return df

def dummies(df):
    dummies = df.split_types_str.str.get_dummies(sep=',')

    return dummies


def attraction_count(dummies_df, all_attractions_df):

    all_attractions_df = pd.concat([all_attractions_df, dummies_df], axis=1)
    type_col_names = attractions_to_keep
    type_col_names.extend(['country', 'city', 'id'])
    attraction_count = all_attractions_df[type_col_names].groupby(['country', 'city', 'id']).sum()
    
    return attraction_count, all_attractions_df

def combineAttractionTypes(city_group):
    city_group['place_of_worship2'] = city_group['place_of_worship'] + city_group['hindu_temple'] + city_group['church'] + city_group['mosque'] + city_group['synagogue']
    city_group['store2'] = city_group['store'] + city_group['shopping_mall'] + city_group['clothing_store'] + city_group['electronics_store'] + city_group['grocery_or_supermarket'] + city_group['department_store']
    
    city_group.rename(columns={"place_of_worship2" : 'place_of_worship', 'store2': 'shop', "place_of_worship" : 'place_of_worship5',}, inplace=True)
    
    city_clean = city_group[['amusement_park', 'art_gallery', 'aquarium', 'library', 'movie_theater',
                              'museum', 'natural_feature', 'park', 'place_of_worship', 'shop', 'zoo']].copy()
    
    return city_clean


def labelEncodeAttraction(city_attraction):
    le = buildLabelEncoder()
    city_attraction.reset_index(inplace=True)
    city_attraction['label_id'] = le.transform(city_attraction.city)
    
    return city_attraction

def cleanCityAttraction(city_attraction):
    city_attraction.sort_values('label_id', inplace=True)
    city_attraction.set_index('label_id', inplace=True)
    city_attraction.drop(columns=['id'], inplace=True)
    city_attraction_clean = city_attraction.drop(columns=['city', 'country'])
    
    return city_attraction_clean, city_attraction


In [None]:
city_attraction, city_attraction_with_country = cityAttractionMain()



In [None]:
city_attraction_with_country

### Attractions User

In [18]:
# This pulls from survey table and selects only attraction colummns
def createAttractionUserDf():
    survey = get_df('survey_response')
    #survey = total.copy()
    user_attraction = survey[['amusement_park', 'art_gallery', 'aquarium', 'library', 'movie_theater',
                              'museum', 'natural_feature', 'park', 'place_of_worship', 'shop', 'zoo']]
    return user_attraction

In [None]:
# Fake
user_attractionf = createAttractionUserDf()


In [None]:
user_attraction = createAttractionUserDf()


In [None]:
user_attraction

In [None]:
city_attraction

In [None]:
city_attraction_with_country

In [None]:
# Fake
cosine_sim_attractionf = simScore(city_attraction, user_attractionf)

cosine_sim_attractionf

In [None]:
cosine_sim_attraction = simScore(city_attraction, user_attraction)

cosine_sim_attraction

In [None]:
cosine_sim_food

### UserxCity matrix sim score


In [19]:
# Survey response data

# This handles top 5 favorite cities

def createSimMatrixMain(table_name, cosine_sim_food, cosine_sim_attraction, food_df_city_name):
    
    top_city_melt = addTopCity(table_name)
    top_city_no_na = dropNullRankCity(top_city_melt)
    # need to call to create cosine_sim_food
    sim_df = createUserCitySimMatrix(cosine_sim_food, cosine_sim_attraction, food_df_city_name)
    
    sim_matrix_ready = mergeRanktoMatrix(sim_df, top_city_no_na)
    
    
    return sim_matrix_ready


def addTopCity(table_name):
    survey = get_df(table_name)

    top_city = survey[['favorite_city_one', 'favorite_city_two', 'favorite_city_three', 'favorite_city_four', 'favorite_city_five']].copy()
    
    top_city.reset_index(inplace=True)
    top_city.rename(columns={'index': 'user'}, inplace=True)
    top_city = top_city.replace({'': np.nan})
    top_city_melt = top_city.melt(id_vars=['user'])
    
    top_city_melt['rank'] = top_city_melt.apply(rank_from_col,axis=1)


    return top_city_melt


def rank_from_col(x):
    if x.variable=='favorite_city_one':
       return 1
    elif x.variable=='favorite_city_two':
       return 1
    elif x.variable=='favorite_city_three':
       return 1
    elif x.variable=='favorite_city_four':
       return 1
    elif x.variable=='favorite_city_five':
       return 1
    elif x.value == 'None':
        return 0 
    
#Drops city ranks with Null values as city
#Sets index
def dropNullRankCity(top_city_melt):
    top_city_no_na = top_city_melt.dropna().copy()
    top_city_no_na.rename(columns={'value':'city'}, inplace=True)
    top_city_no_na.set_index(['user', 'city'], inplace=True)
    
    return top_city_no_na



def mergeRanktoMatrix(sim_df, rank_df):
    
    sim_matrix = pd.merge(left=sim_df, right=rank_df[['rank']], left_index=True, right_index=True, how='left')
    sim_matrix.fillna(0, inplace=True)
    sim_matrix.sort_values('user', inplace=True)
    sim_matrix.set_index('city_id', append=True, inplace=True)

    return sim_matrix


    

In [20]:
# This melts the cosine sim matrix that is userXcity to create my dataset where every userxcity combo is there
# Start with the cosine_food
# Could maybe pull out first couple lines and create own function...

# Survey response data

def createUserCitySimMatrix(cosine_sim_food, cosine_sim_attraction, food_df_city_name):
    cosine_food = cosine_sim_food.reset_index()
    cos_melt = cosine_food.melt(id_vars=['index'], value_name="food_sim", var_name = "city_id")
    cos_melt.rename(columns={'index': 'user'}, inplace=True)
    cos_melt['city_id'] = cos_melt['city_id'].astype(int)
    
    city_dict = dict(zip(food_new['label_id'], food_new['city']))
    
    cos_melt['city'] = cos_melt['city_id'].map(city_dict)
    
    matrix_full = addAttractionSimMatrix(cos_melt, cosine_sim_attraction)

    return matrix_full


# melts attraction cosine matrix and then merge 
def addAttractionSimMatrix(user_city_matrix, cosine_sim_attraction):
    cosine_attraction = cosine_sim_attraction.reset_index()
    cos_melt = cosine_attraction.melt(id_vars=['index'], value_name="attraction_sim", var_name = "city_id")
    cos_melt['city_id'] = cos_melt['city_id'].astype(int)
    
    sim_matrix = pd.merge(right=user_city_matrix, left=cos_melt, right_on=['user', 'city_id'], left_on=['index', 'city_id'])
    
    final_matrix = addSumColumn(sim_matrix)
    
    return final_matrix

def addSumColumn(sim_score_matrix):
        
    sim_score_matrix['sum'] = sim_score_matrix['food_sim'] + sim_score_matrix['attraction_sim']
    clean = cleanMatrix(sim_score_matrix)
    
    return clean

def cleanMatrix(matrix):
    
    matrix.drop(columns=['index'], inplace=True)
    clean = matrix[['user', 'city_id', 'city', 'food_sim', 'attraction_sim', 'sum']]
    clean.set_index(['user', 'city'], inplace=True)
    
    return clean
    
    


In [None]:
full_sim_matrix = createUserCitySimMatrix(cosine_sim_food, cosine_sim_attraction)
full_sim_matrix

In [None]:
#Fake

#work_fake = createUserCitySimMatrix(cosine_sim_foodf, cosine_sim_attractionf)
#work_fake.sort_index(level=0, inplace=True)
#sim_fake = work_fake

In [None]:
# might want to add this. It will sort by each user group by the top sum score

#ordered_sum = sim_score_rank.sort_values('sum', ascending=False).sort_index(level='user', sort_remaining=False)


In [None]:
full_sim_matrix = createSimMatrixMain('survey_response', cosine_sim_food, cosine_sim_attraction)
full_sim_matrix

In [None]:
hmmfix2[hmmfix2['rank'] == 1]

## Survey Response transformation

In [21]:
def transformUserInput(table_name):
    survey = get_df(table_name)
    survey.drop(columns=[''], inplace=True)
    nationality_dict = {'Australia': 1, 'Canada': 2, 'China': 3, 'Finland': 4, 'Honduras': 5,
              'India': 6, 'Israel': 7, 'Japan': 8, 'Mexico': 9, 'Pakistan': 10, 'Philippines': 11, 'United States': 12}

    survey.nationality = survey.nationality.map(nationality_dict)
    survey = survey.replace({'': 'Zx'})
    survey = encodeTopCity(survey)
    survey = survey.apply(pd.to_numeric, errors='ignore')
    finished = userDemographicDummy(survey)
    
    return finished



def encodeTopCity(user_response):
    
    le = buildLabelEncoder()
    user_response['one'] = le.transform(user_response['favorite_city_one'])
    user_response['two'] = le.transform(user_response['favorite_city_two'])
    user_response['three'] = le.transform(user_response['favorite_city_three'])
    user_response['four'] = le.transform(user_response['favorite_city_four'])
    user_response['five'] = le.transform(user_response['favorite_city_five'])
    
    return user_response


def userDemographicDummy(user_response):
    
    ready = user_response.drop(columns=['favorite_city_one', 'favorite_city_two', 'favorite_city_three',
                                          'favorite_city_four', 'favorite_city_five'])
    dummy = pd.get_dummies(ready)
    
    return dummy
    
    

In [None]:
ok = get_df('survey_response')
ok

In [None]:
clean_survey = transformUserInput('survey_response')

In [None]:
finished.columns

In [None]:
#Fake
#fake_survey_finished = pd.get_dummies(survey_fake)
#fake_survey_finished

## Food and Attraction City

In [22]:
# This is not merging on label_id it is merging on id. WHY DO I HAVE ID
def mergeAttractionFood(attraction_df, food_df):
    
    city = pd.merge(left = attraction_df, right = food_df, left_index = True, right_index=True)
    
    city_cont = addContinent(city)
    
    return city_cont


def addContinent(city_df):
    continents = {
    'NA': 'North America',
    'SA': 'South America', 
    'AS': 'Asia',
    'OC': 'Australia',
    'AF': 'Africa',
    'EU': 'Europe'}
    
    city_df['continent'] = [continents[country_alpha2_to_continent_code(country_name_to_country_alpha2(country))] for country in city_df['country']]
    city_df.set_index(['city', 'country'], append=True, inplace=True)
    
    #city_cont_dummy = dummyContinent(city_df)
    
    return city_df


def continentToNumber(city_df):
    
    cont_dict = {''}
    
    return


def dummyContinent(city_df):
    
    city_dummy = pd.get_dummies(city_df)
    
    return city_dummy
    


In [None]:
food_city2

In [None]:
city_attraction_with_country

In [None]:
c = mergeAttractionFood(city_attraction_with_country, food_city2)

In [None]:
c

In [None]:
city_df2 = addContinent(c)

In [None]:
city_df2_dummy = dummyContinent(city_df2)

In [None]:
city_df2_dummy

## Combing all data to get userxcity raw input and sim scores  

all food and attraction data

In [23]:
def userCityCreate(survey_df_clean, city_df_all):
    
    
    reindex_survey = survey_df_clean.reset_index()
    reindex_city = city_df_all.reset_index()
    reindex_city = reindex_city.add_suffix('_city')
    reindex_survey = reindex_survey.add_suffix('_user')
    reindex_city['key'] = 1
    reindex_survey['key'] = 1
    
    full = pd.merge(reindex_city , reindex_survey, on='key').drop('key',axis=1)
    
    raw_inputs_matrix = cleanUserCityMatrix(full)

    return raw_inputs_matrix

def cleanUserCityMatrix(matrix):
    
    matrix.rename(columns={'label_id_city': 'city_id', 'city_city': 'city', 'index_user': 'user'}, inplace=True)
    matrix.drop(columns=['country_city'], inplace=True)
    matrix.set_index(['user', 'city', 'city_id'], inplace=True)
    matrix.sort_index(level=0, inplace=True)

    return matrix
    

    

In [24]:
def finalMerge(raw_df, sim_score_df):
    full_matrix = pd.merge(left= raw_df, right= sim_score_df, right_index=True, left_index=True)
    full_matrix.sort_index(level=0, inplace=True)

    return full_matrix

In [25]:
def startToEnd():
    
    food_city, food_city_name = cityFoodMain()
    food_user = createFoodUserDf()
    cosine_sim_food = simScore(food_city, food_user)
    
    city_attraction, city_attraction_with_country = cityAttractionMain()
    user_attraction = createAttractionUserDf()    
    cosine_sim_attraction = simScore(city_attraction, user_attraction)
    
    full_sim_matrix = createSimMatrixMain('survey_response', cosine_sim_food, cosine_sim_attraction, food_city_name)
    
    survey_clean = transformUserInput('survey_response')
    
    full_city_data = mergeAttractionFood(city_attraction_with_country, food_city)
    
    full_raw_matrix = userCityCreate(survey_clean, full_city_data)
    
    finished = finalMerge(full_raw_matrix, full_sim_matrix)
    
    return finished





In [26]:
testing = startToEnd()

In [27]:
testing

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,amusement_park_city,art_gallery_city,aquarium_city,library_city,movie_theater_city,museum_city,natural_feature_city,park_city,place_of_worship_city,shop_city,...,age_15-25_user,age_26-40_user,age_41-55_user,age_55+_user,gender_Female_user,gender_Male_user,food_sim,attraction_sim,sum,rank
user,city,city_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,Abu Dhabi,0,3,4,0,0,0,6,1,18,2,0,...,0,1,0,0,1,0,0.946454,0.599510,1.545963,0.0
0,Accra,1,0,1,0,0,0,4,2,1,20,2,...,0,1,0,0,1,0,0.769044,0.584991,1.354035,0.0
0,Agra,2,0,0,0,0,0,5,1,6,44,0,...,0,1,0,0,1,0,0.762457,0.525505,1.287962,0.0
0,Amman,3,0,2,0,1,0,11,0,3,12,2,...,0,1,0,0,1,0,0.698948,0.626382,1.325330,0.0
0,Amsterdam,4,0,2,1,0,1,29,0,7,12,1,...,0,1,0,0,1,0,0.891063,0.496455,1.387518,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,Washington D.C.,133,0,1,0,1,0,16,0,23,6,0,...,0,1,0,0,0,1,0.833326,0.635983,1.469309,0.0
151,Xi'an,134,0,0,0,0,0,10,0,5,12,0,...,0,1,0,0,0,1,0.688247,0.546102,1.234349,0.0
151,Xiamen,135,0,0,1,0,0,5,1,9,3,0,...,0,1,0,0,0,1,0.688247,0.657949,1.346196,0.0
151,Zhuhai,136,4,0,0,0,0,3,1,5,5,0,...,0,1,0,0,0,1,0.742496,0.591561,1.334057,0.0


In [28]:
testing.columns

Index(['amusement_park_city', 'art_gallery_city', 'aquarium_city',
       'library_city', 'movie_theater_city', 'museum_city',
       'natural_feature_city', 'park_city', 'place_of_worship_city',
       'shop_city', 'zoo_city', '1_city', '2_city', '3_city', '4_city',
       'continent_city', 'nationality_user', 'food_one_user', 'food_two_user',
       'food_three_user', 'food_four_user', 'art_gallery_user', 'library_user',
       'museum_user', 'aquarium_user', 'amusement_park_user', 'zoo_user',
       'movie_theater_user', 'shop_user', 'park_user', 'natural_feature_user',
       'place_of_worship_user', 'one_user', 'two_user', 'three_user',
       'four_user', 'five_user', 'age_15-25_user', 'age_26-40_user',
       'age_41-55_user', 'age_55+_user', 'gender_Female_user',
       'gender_Male_user', 'food_sim', 'attraction_sim', 'sum', 'rank'],
      dtype='object')

In [29]:
testing.to_csv(r"/Users/tristannisbet/Documents/SM/Dataframe/new/production_dataframe.csv")

In [None]:
done = userCityCreate(finished, c)
done

In [None]:
done.columns

In [None]:
done_dummy = userCityCreate(finished, city_df2_dummy)
done_dummy

In [None]:
done_dummy.columns

In [None]:
#Fake

#done_fake2 = userCityCreate(fake_survey_finished, city4)
#done_fake2.set_index('user', append=True, inplace=True)
#fake_survey_finished.reset_index(drop=True, inplace=True)

In [None]:
hmmfix2

In [None]:
finished = finalMerge(done, hmmfix2)

In [None]:
finished.sort_index(level=0, inplace=True)

In [None]:
finished.columns

In [None]:
# Fake
#final = finalMerge(done_fake2, sim_matrix_fake)


In [None]:
#final = final.reorder_levels(['user','city','city_id'])
final.sort_index(level=0, inplace=True)

In [None]:
finished.to_csv(r"/Users/tristannisbet/Documents/SM/Dataframe/new/users_design_matrix_cont1.csv")

## Fake Data

In [None]:

real_data = pd.read_csv('/Users/tristannisbet/Documents/SM/Dataframe/new/all_city_data.csv')
real_data

In [None]:
fake_data_n = pd.read_csv('/Users/tristannisbet/Documents/SM/Dataframe/new/500_users_noise.csv')
fake_data_n

In [None]:
total = pd.concat([fake_data_n, real_data], axis=0)
total

In [None]:
total.rename(columns={'1': 'food_one', '2': 'food_two', '3': 'food_three', '4': 'food_four'}, inplace=True)


## Fake user data Demographic questions


In [None]:
nationality_dict = {1: 'Australia', 2:'Canada', 3:'China', 4:'Finland', 5:'Honduras',
              6:'India', 7:'Israel', 8:'Japan', 9:'Mexico', 10:'Pakistan', 11:'Philippines', 12:'United States'}

gender_dict = {1: 'Male', 2: 'Female'}

age_dict = {1 : '15-25', 2: '26-40', 3: '41-55', 4: '55+'}

random_gender = ["Male", 'Female']
random_age = ['15-25', '26-40', '41-55', '55+']

In [None]:
survey_fake = total.drop(columns=['city', 'label_id', 'country'])

In [None]:
survey_fake['nationality'] = np.random.randint(0,12, len(survey_fake))

survey_fake['gender'] = np.random.choice(random_gender, size=len(survey_fake))
survey_fake['age'] = np.random.choice(random_age, size=len(survey_fake))

In [None]:
survey_fake['one'] = np.random.randint(0,137, len(survey_fake))
survey_fake['two'] = np.random.randint(0,137, len(survey_fake))
survey_fake['three'] = np.random.randint(0,137, len(survey_fake))
survey_fake['four'] = np.random.randint(0,137, len(survey_fake))
survey_fake['five'] = np.random.randint(0,137, len(survey_fake))

In [None]:
survey_fake.reset_index(inplace=True)
survey_fake.rename(columns={'index': 'user'}, inplace=True)

In [None]:
top_fake = survey_fake[['user', 'one', 'two', 'three', 'four', 'five']]

In [None]:
top_fake

In [None]:
top_fake_melt = top_fake.melt(id_vars=['user'])
    
top_fake_melt['rank'] = top_fake_melt.apply(rank_from_col2,axis=1)



def rank_from_col2(x):
    if x.variable=='one':
       return 5
    elif x.variable=='two':
       return 4
    elif x.variable=='three':
       return 3
    elif x.variable=='four':
       return 2
    elif x.variable=='five':
       return 1
    elif x.value == 'None':
        return 0 

In [None]:
top_fake_melt.rename(columns={'value':'city_id'}, inplace=True)
top_fake_melt.set_index(['user', 'city_id'], inplace=True)
top_fake_melt

In [None]:
work_fake.reset_index(level=1, inplace=True)
work_fake.set_index('city_id', append=True, inplace=True)
work_fake

In [None]:
sim_matrix_fake = pd.merge(left=work_fake, right=top_fake_melt[['rank']], left_index=True, right_index=True, how='left')
#sim_matrix.fillna(0, inplace=True)
#sim_matrix.sort_values('user', inplace=True)
#sim_matrix.set_index('city_id', append=True, inplace=True)

In [None]:
sim_matrix_fake.drop(columns=['rank_x'], inplace=True)

In [None]:
sim_matrix_fake.fillna(0, inplace=True)

In [None]:
def add_top_city_fake(test_user):
    possible_cities = test_user['city_id'].head(50).values.tolist()
    sampled_list = random.sample(possible_cities, 5)
    test_user.loc[test_user.city_id == sampled_list[0], "rank"] = 5
    test_user.loc[test_user.city_id == sampled_list[1], "rank"] = 4
    test_user.loc[test_user.city_id == sampled_list[2], "rank"] = 3
    test_user.loc[test_user.city_id == sampled_list[3], "rank"] = 2
    test_user.loc[test_user.city_id == sampled_list[4], "rank"] = 1
    
    return test_user
    
    

In [None]:
sim_fake['rank'] = 0

In [None]:
sim_fake.groupby('user').apply(add_top_city_fake) 

#df.groupby('columnName').apply(myFunction, ('arg1')


In [None]:
np.count_nonzero(sim_fake['rank'])

In [None]:

test_user = sim_fake.xs(0, level='user', drop_level=False).copy()

In [None]:
test_user.sort_values('sum', ascending=False, inplace=True)

In [None]:
possible_city = test_user['city_id'].head(50)

In [None]:
possible_cities = test_user['city_id'].head(50).values.tolist()

In [None]:
test_user

In [None]:
sampled_list = random.sample(possible_cities, 5)
sampled_list

In [None]:

for u in sampled_list:
    print(u)
    test_user[test_use['city_id'] == u]

In [None]:
test_user['rank'] = 0

In [None]:
ok2 = add_top_city_fake(test_user)
ok2

In [None]:
ok2[ok2['rank'] == 3]

In [None]:
np.count_nonzero(test_user['rank'])

In [None]:
test_user.loc[test_user.city_id == 69, "rank"] = 1

In [None]:
test_user[test_user['rank'] == 1]
    

In [None]:
hmmfix2[hmmfix2['rank'] == 1]