# Explore Dataset

In [24]:
# import
import os
import pandas as pd
from functools import reduce
from module.config import PROJ_DIR
from IPython.display import display

In [3]:
# utils functions

## Data exploration

## Data mining
The goal is to reorganize our dataset. We classify each field of the dataset according to whether it is permanent or non-permanent.

For example, a user's latitude and longitude go into the criteria, because users can ask for many recommendations at different places. A restaurant's address, however, is fixed and goes directly into the restaurant table.

- **User**
- **Criteria**
- **Restaurant**

### Users

In [13]:
# Locations of the three raw user tables (profile, payment, cuisine).
user_profile_file = os.path.join(PROJ_DIR, "data/temp/users/userprofile.csv")
user_payment_file = os.path.join(PROJ_DIR, "data/temp/users/userpayment.csv")
user_cuisine_file = os.path.join(PROJ_DIR, "data/temp/users/usercuisine.csv")

# Read every table into its own frame.
user_profile_df = pd.read_csv(user_profile_file)
user_payment_df = pd.read_csv(user_payment_file)
user_cuisine_df = pd.read_csv(user_cuisine_file)

# Join profile -> payment -> cuisine on userID (pandas' default inner join).
user_df = (
    user_profile_df
    .merge(user_payment_df, on='userID')
    .merge(user_cuisine_df, on='userID')
)

# Show the column names of each source table, in the same order as they were merged.
display(user_profile_df.columns, user_payment_df.columns, user_cuisine_df.columns)

Index(['userID', 'latitude', 'longitude', 'smoker', 'drink_level',
       'dress_preference', 'ambience', 'transport', 'marital_status', 'hijos',
       'birth_year', 'interest', 'personality', 'religion', 'activity',
       'color', 'weight', 'budget', 'height'],
      dtype='object')

Index(['userID', 'Upayment'], dtype='object')

Index(['userID', 'Rcuisine'], dtype='object')

In [14]:
# Peek at two random rows of the merged user table.
user_preview = user_df.sample(2)
display(user_preview)

Unnamed: 0,userID,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,...,interest,personality,religion,activity,color,weight,budget,height,Upayment,Rcuisine
271,U1108,22.143524,-100.987562,False,abstemious,informal,solitary,public,single,independent,...,technology,thrifty-protector,Catholic,student,blue,76,medium,1.81,MasterCard-Eurocard,Seafood
347,U1135,22.170396,-100.949936,False,casual drinker,informal,family,on foot,single,kids,...,variety,hunter-ostentatious,Catholic,student,purple,66,low,1.54,cash,International


In [22]:
# Column groups used to split the merged user table.

# Columns that go to the user profile table.
profile_cols = [
    'userID',
    'birth_year',
    'personality',
    'interest',
    'religion',
    'marital_status',
    'activity',
    'ambience',
    'smoker',
    'drink_level',
    'hijos',
    'color',
    'weight',
    'height',
]

# Columns that go to the criteria table.
criteria_cols = [
    'userID',
    'latitude',
    'longitude',
    'Upayment',
    'Rcuisine',
    'budget',
    'dress_preference',
    'transport',
]

In [23]:
# Split the merged user table into independent profile / criteria copies.
profile_df, criteria_df = (
    user_df[cols].copy() for cols in (profile_cols, criteria_cols)
)

# Show two random rows from each of the new tables.
display(profile_df.sample(2), criteria_df.sample(2))

Unnamed: 0,userID,birth_year,personality,interest,religion,marital_status,activity,ambience,smoker,drink_level,hijos,color,weight,height
25,U1008,1989,hard-worker,technology,Catholic,single,student,solitary,False,social drinker,independent,green,68,1.72
397,U1135,1988,hunter-ostentatious,variety,Catholic,single,student,family,False,casual drinker,kids,purple,66,1.54


Unnamed: 0,userID,latitude,longitude,Upayment,Rcuisine,budget,dress_preference,transport
371,U1135,22.170396,-100.949936,cash,Armenian,low,informal,on foot
284,U1117,18.875641,-99.220737,cash,Turkish,medium,no preference,public


### Restaurants

In [30]:
# Locations of the raw restaurant tables: accepts, cuisine, hours, parking, geoplace.
resto_accepts_file = os.path.join(PROJ_DIR, "data/temp/restaurants/chefmozaccepts.csv")
resto_cuisine_file = os.path.join(PROJ_DIR, "data/temp/restaurants/chefmozcuisine.csv")
resto_hours_file = os.path.join(PROJ_DIR, "data/temp/restaurants/chefmozhours4.csv")
resto_parking_file = os.path.join(PROJ_DIR, "data/temp/restaurants/chefmozparking.csv")
resto_geoplace_file = os.path.join(PROJ_DIR, "data/temp/restaurants/geoplaces2.csv")

# Read every table into its own frame.
resto_accepts_df = pd.read_csv(resto_accepts_file)
resto_cuisine_df = pd.read_csv(resto_cuisine_file)
resto_hours_df = pd.read_csv(resto_hours_file)
resto_parking_df = pd.read_csv(resto_parking_file)
# The geoplace file is the only one read with an explicit latin-1 encoding.
resto_geoplace_df = pd.read_csv(resto_geoplace_file, encoding='latin-1')

# Fold the tables together left to right, joining on placeID each time.
dfs = [
    resto_accepts_df,
    resto_cuisine_df,
    resto_hours_df,
    resto_parking_df,
    resto_geoplace_df,
]
resto_df = dfs[0]
for right_df in dfs[1:]:
    resto_df = pd.merge(resto_df, right_df, on='placeID')

display(resto_df.columns)

Index(['placeID', 'Rpayment', 'Rcuisine', 'hours', 'days', 'parking_lot',
       'latitude', 'longitude', 'the_geom_meter', 'name', 'address', 'city',
       'state', 'country', 'fax', 'zip', 'alcohol', 'smoking_area',
       'dress_code', 'accessibility', 'price', 'url', 'Rambience', 'franchise',
       'area', 'other_services'],
      dtype='object')

In [31]:
# Column order for the restaurant table.
resto_cols = [
    'placeID',
    'name',
    'address',
    'latitude',
    'longitude',
    'city',
    'state',
    'country',
    'fax',
    'zip',
    'the_geom_meter',
    'hours',
    'days',
    'Rpayment',
    'Rcuisine',
    'parking_lot',
    'alcohol',
    'smoking_area',
    'dress_code',
    'accessibility',
    'price',
    'Rambience',
    'franchise',
    'area',
    'other_services',
    'url',
]

In [34]:
# Reorder the merged restaurant table and keep an independent copy of it.
restaurant_df = resto_df.loc[:, resto_cols].copy()

# Show two random rows of the result.
display(restaurant_df.sample(2))

Unnamed: 0,placeID,name,address,latitude,longitude,city,state,country,fax,zip,...,alcohol,smoking_area,dress_code,accessibility,price,Rambience,franchise,area,other_services,url
595,132732,Taqueria EL amigo,Calle Mezquite Fracc Framboyanes,23.754357,-99.171288,Cd Victoria,Tamaulipas,Mexico,?,87018,...,No_Alcohol_Served,none,casual,completely,low,familiar,f,open,none,?
572,132755,La Estrella de Dimas,Av. de los Pintores,22.153324,-101.019546,San Luis Potosi,S.L.P.,Mexico,?,?,...,No_Alcohol_Served,none,informal,partially,medium,familiar,f,closed,variety,?


In [None]:
# profile must be unique: keep a single row per user.
# The merged user table repeats each user once per payment/cuisine combination,
# so the profile columns are duplicated. drop_duplicates returns a NEW frame;
# the result has to be assigned back, otherwise the export that follows still
# writes the duplicated rows.
profile_df = profile_df.drop_duplicates(subset=['userID'])
assert profile_df['userID'].is_unique, "profile_df still has repeated userID values"

# NOTE(review): the original comment also asks for unique restaurants, but
# restaurant_df presumably carries several rows per placeID (hours, payment,
# cuisine) - confirm which columns to key on before de-duplicating it.

In [35]:
# Write each cleaned table to its CSV file under the project directory.
# to_csv is called with its defaults, so the frame index is written as well.
for frame, rel_path in (
    (profile_df, "data/temp/clean/user_profile.csv"),
    (criteria_df, "data/temp/clean/user_criteria.csv"),
    (restaurant_df, "data/temp/clean/restaurant.csv"),
):
    frame.to_csv(os.path.join(PROJ_DIR, rel_path))