# Explore Dataset

In [1]:
# import
import os
import pandas as pd
from functools import reduce
from module.config import PROJ_DIR
from IPython.display import display

In [2]:
# utils functions

## Data exploration

## Data mining
The goal is to reorganize our dataset. We classify each attribute in the dataset according to whether it is permanent or non-permanent.

For example, a user's latitude and longitude go to the criteria table, since users can ask for many recommendations at different places. A restaurant's address, however, is fixed and goes directly into the restaurant table.

- **User**
- **Criteria**
- **Restaurant**

### Users

In [3]:
# Locations of the three raw user tables
user_profile_file = os.path.join(PROJ_DIR, "data/temp/users/userprofile.csv")
user_payment_file = os.path.join(PROJ_DIR, "data/temp/users/userpayment.csv")
user_cuisine_file = os.path.join(PROJ_DIR, "data/temp/users/usercuisine.csv")

# Load each table into its own frame
user_profile_df = pd.read_csv(user_profile_file)
user_payment_df = pd.read_csv(user_payment_file)
user_cuisine_df = pd.read_csv(user_cuisine_file)

# Join profile -> payment -> cuisine on the shared userID key.
# NOTE(review): merge defaults to an inner join, so a user missing from the
# payment or cuisine table is dropped here - confirm that is intended.
user_df = (
    user_profile_df
    .merge(user_payment_df, on='userID')
    .merge(user_cuisine_df, on='userID')
)

# Show which columns each source table contributes
display(
    user_profile_df.columns,
    user_payment_df.columns,
    user_cuisine_df.columns,
)

Index(['userID', 'latitude', 'longitude', 'smoker', 'drink_level',
       'dress_preference', 'ambience', 'transport', 'marital_status', 'hijos',
       'birth_year', 'interest', 'personality', 'religion', 'activity',
       'color', 'weight', 'budget', 'height'],
      dtype='object')

Index(['userID', 'Upayment'], dtype='object')

Index(['userID', 'Rcuisine'], dtype='object')

In [4]:
# Preview two randomly chosen rows of the merged user table
display(user_df.sample(n=2))

Unnamed: 0,userID,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,...,interest,personality,religion,activity,color,weight,budget,height,Upayment,Rcuisine
347,U1135,22.170396,-100.949936,False,casual drinker,informal,family,on foot,single,kids,...,variety,hunter-ostentatious,Catholic,student,purple,66,low,1.54,cash,International
292,U1125,22.19204,-100.956935,False,casual drinker,formal,friends,public,single,independent,...,technology,hard-worker,Catholic,student,white,57,low,1.67,cash,Mexican


In [5]:
# Columns describing the user themselves (exported as the user profile)
profile_cols = [
    'userID',
    'birth_year', 'personality', 'interest', 'religion',
    'marital_status', 'activity', 'ambience',
    'smoker', 'drink_level', 'hijos',
    'color', 'weight', 'height',
]

# Columns describing what a user asks for (exported as the user criteria)
criteria_cols = [
    'userID',
    'latitude', 'longitude',
    'Upayment', 'Rcuisine',
    'budget', 'dress_preference', 'transport',
]

In [6]:
# Split the merged user table into two independent copies:
# one with the profile columns, one with the criteria columns.
profile_df = user_df.loc[:, profile_cols].copy()
criteria_df = user_df.loc[:, criteria_cols].copy()

# Preview two random rows of each
display(profile_df.sample(n=2))
display(criteria_df.sample(n=2))

Unnamed: 0,userID,birth_year,personality,interest,religion,marital_status,activity,ambience,smoker,drink_level,hijos,color,weight,height
157,U1075,1991,thrifty-protector,technology,Catholic,single,student,family,False,casual drinker,independent,white,68,1.6
373,U1135,1988,hunter-ostentatious,variety,Catholic,single,student,family,False,casual drinker,kids,purple,66,1.54


Unnamed: 0,userID,latitude,longitude,Upayment,Rcuisine,budget,dress_preference,transport
210,U1103,23.752265,-99.16859,bank_debit_cards,Mexican,medium,formal,public
157,U1075,22.167575,-100.960364,cash,Mexican,medium,informal,car owner


### Restaurants

In [7]:
# Locations of the five raw restaurant tables
resto_accepts_file = os.path.join(PROJ_DIR, "data/temp/restaurants/chefmozaccepts.csv")
resto_cuisine_file = os.path.join(PROJ_DIR, "data/temp/restaurants/chefmozcuisine.csv")
resto_hours_file = os.path.join(PROJ_DIR, "data/temp/restaurants/chefmozhours4.csv")
resto_parking_file = os.path.join(PROJ_DIR, "data/temp/restaurants/chefmozparking.csv")
resto_geoplace_file = os.path.join(PROJ_DIR, "data/temp/restaurants/geoplaces2.csv")

# Load each table: accepted payments, cuisines, opening hours, parking, places
resto_accepts_df = pd.read_csv(resto_accepts_file)
resto_cuisine_df = pd.read_csv(resto_cuisine_file)
resto_hours_df = pd.read_csv(resto_hours_file)
resto_parking_df = pd.read_csv(resto_parking_file)
# the geoplaces file is decoded as latin-1 rather than the default encoding
resto_geoplace_df = pd.read_csv(resto_geoplace_file, encoding='latin-1')

# Fold the tables together left to right on the shared placeID key.
# NOTE(review): merge defaults to an inner join, so only places present in
# all five tables remain - confirm that is intended.
dfs = [resto_accepts_df, resto_cuisine_df, resto_hours_df,
       resto_parking_df, resto_geoplace_df]
resto_df = dfs[0]
for other_df in dfs[1:]:
    resto_df = pd.merge(resto_df, other_df, on='placeID')

display(resto_df.columns)

Index(['placeID', 'Rpayment', 'Rcuisine', 'hours', 'days', 'parking_lot',
       'latitude', 'longitude', 'the_geom_meter', 'name', 'address', 'city',
       'state', 'country', 'fax', 'zip', 'alcohol', 'smoking_area',
       'dress_code', 'accessibility', 'price', 'url', 'Rambience', 'franchise',
       'area', 'other_services'],
      dtype='object')

In [8]:
# Column order for the exported restaurant table
resto_cols = [
    # identity and location
    'placeID', 'name', 'address', 'latitude', 'longitude',
    'city', 'state', 'country', 'fax', 'zip', 'the_geom_meter',
    # opening hours, payment, cuisine, parking
    'hours', 'days', 'Rpayment', 'Rcuisine', 'parking_lot',
    # remaining place attributes
    'alcohol', 'smoking_area', 'dress_code', 'accessibility',
    'price', 'Rambience', 'franchise', 'area', 'other_services',
    'url',
]

In [9]:
# Take an independent copy of the merged restaurant table with its
# columns arranged in the order given by resto_cols.
restaurant_df = resto_df.loc[:, resto_cols].copy()

# Preview two random rows
display(restaurant_df.sample(n=2))

Unnamed: 0,placeID,name,address,latitude,longitude,city,state,country,fax,zip,...,alcohol,smoking_area,dress_code,accessibility,price,Rambience,franchise,area,other_services,url
37,135086,Mcdonalds Parque Tangamanga,Lateral Salvador Nava Martinez 3145,22.141421,-101.013955,San Luis Potosi,SLP,Mexico,?,78290,...,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,t,closed,none,?
640,132572,Cafe Chaires,?,22.141647,-100.992712,San Luis Potosi,San Luis Potosi,Mexico,?,?,...,No_Alcohol_Served,not permitted,informal,completely,low,familiar,f,closed,none,?


### Ratings

In [10]:
# Load the ratings table
ratings_file = os.path.join(PROJ_DIR, "data/temp/ratings/rating_final.csv")
ratings_df = pd.read_csv(ratings_file)

# Preview two random rows
display(ratings_df.sample(n=2))

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
499,U1069,132845,0,0,0
709,U1032,135060,1,1,1


### Export

In [11]:
# profile & restaurant must be unique
# drop_duplicates returns a NEW frame, so the result has to be bound back to
# profile_df; otherwise the export cell still writes one row per
# user x payment x cuisine combination produced by the merges.
profile_df = profile_df.drop_duplicates(subset=['userID'])
assert profile_df['userID'].is_unique, "profile_df must hold one row per userID"

# TODO(review): restaurant_df is not de-duplicated here. It carries
# multi-valued columns (Rpayment, Rcuisine, hours, days, parking_lot), so
# dropping on placeID alone would discard rows - decide the intended key.

profile_df

Unnamed: 0,userID,birth_year,personality,interest,religion,marital_status,activity,ambience,smoker,drink_level,hijos,color,weight,height
0,U1001,1989,thrifty-protector,variety,none,single,student,family,false,abstemious,independent,black,69,1.77
1,U1002,1990,hunter-ostentatious,technology,Catholic,single,student,family,false,abstemious,independent,red,40,1.87
2,U1003,1989,hard-worker,none,Catholic,single,student,family,false,social drinker,independent,blue,60,1.69
3,U1004,1940,hard-worker,variety,none,single,professional,family,false,abstemious,independent,green,44,1.53
21,U1005,1992,thrifty-protector,none,Catholic,single,student,family,false,abstemious,independent,black,65,1.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,U1134,1991,hard-worker,variety,Catholic,single,student,family,false,casual drinker,independent,black,52,1.65
305,U1135,1988,hunter-ostentatious,variety,Catholic,single,student,family,false,casual drinker,kids,purple,66,1.54
408,U1136,1990,thrifty-protector,retro,Catholic,single,student,friends,true,social drinker,independent,black,50,1.60
409,U1137,1989,hard-worker,eco-friendly,Catholic,single,student,family,false,social drinker,independent,blue,72,1.78


In [12]:
# export dataset
# Make sure the output directory exists first: to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(os.path.join(PROJ_DIR, "data/temp/clean"), exist_ok=True)

# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if downstream readers do not expect it.
profile_df.to_csv(os.path.join(PROJ_DIR, "data/temp/clean/user_profile.csv"))
criteria_df.to_csv(os.path.join(PROJ_DIR, "data/temp/clean/user_criteria.csv"))
restaurant_df.to_csv(os.path.join(PROJ_DIR, "data/temp/clean/restaurant.csv"))
ratings_df.to_csv(os.path.join(PROJ_DIR, "data/temp/clean/rating.csv"))