In [1]:
import pandas as pd
import numpy as np
import warnings, sys
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
class DataHandle:

    @classmethod
    def get_data(self):
        """ Get data from GCS Bucket """
        print('[1/2] Getting data... ', end='')
        df1 = pd.read_csv('https://storage.googleapis.com/h3-data/listings_final.csv', sep=';')
        df2 = pd.read_csv('https://storage.googleapis.com/h3-data/price_availability.csv', sep=';')
        print('Done.')
        return [df1, df2]
    
    @classmethod
    def get_group_data(self, data):
        """ Merge both dataframes' data """
        print('[2/2] Merging data... ', end='')
        result = pd.merge(data[0], data[1].groupby('listing_id').local_price.mean('local_price'), how='inner', on='listing_id')
        print('Done.')
        return result
    
    @classmethod
    def get_process_data(self):
        """ Get & Merge data """
        print("===| DataHandle |===")
        result = self.get_group_data(self.get_data())
        print()
        return result
    

In [3]:
df = DataHandle.get_process_data()

===| DataHandle |===
[1/2] Getting data... Done.
[2/2] Merging data... Done.



In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,listing_id,name,type,city,neighborhood,latitude,longitude,person_capacity,beds,bedrooms,bathrooms,is_rebookable,is_new_listing,is_fully_refundable,is_host_highly_rated,is_business_travel_ready,pricing_weekly_factor,pricing_monthly_factor,local_price
0,0,28581061,La maison Clery,private_room,Paris,2e arrondissement,48.869292,2.348335,1,1,1,2.0,False,True,True,False,False,1.0,1.0,45.948454
1,1,661961,studio PARIS PLACE EDITH PIAF 75020,entire_home,Paris,,48.867284,2.403255,2,1,1,1.0,False,False,True,True,False,0.88,0.69,44.473684
2,2,1261705,chambre privée à louer @ paris oberkampf,private_room,Paris,,48.867894,2.375897,1,1,1,1.0,False,False,True,True,False,1.0,1.0,49.0
3,3,1318834,Appartement au coeur du Marais,entire_home,Paris,République,48.87037,2.35851,3,2,2,1.0,False,False,True,False,False,0.82,0.48,165.0
4,4,1677091,Lovely & Quiet flat,entire_home,Paris,Buttes-Chaumont - Belleville,48.874149,2.3737,2,1,1,1.0,False,False,True,True,False,0.95,0.9,69.774436


In [5]:
df.shape

(999, 20)

In [6]:
df.dtypes

Unnamed: 0                    int64
listing_id                    int64
name                         object
type                         object
city                         object
neighborhood                 object
latitude                    float64
longitude                   float64
person_capacity               int64
beds                          int64
bedrooms                      int64
bathrooms                   float64
is_rebookable                  bool
is_new_listing                 bool
is_fully_refundable            bool
is_host_highly_rated           bool
is_business_travel_ready       bool
pricing_weekly_factor       float64
pricing_monthly_factor      float64
local_price                 float64
dtype: object

In [7]:
class FeatureRecipe:
    data = None
    variable_types = None
    dropped_columns = []
    
    
    def __init__(self, data):
        self.data = data

        
    def separate_variable_types(self) -> None:
        """ Separate column variable types on lists """
        print('[1/5] Separate variable types... ', end='')
        
        discreet, continues, boolean, categorical = [], [], [], []
        for column in self.data.columns:
            if self.data[column].dtype == np.dtype('int64'):
                discreet.append(self.data[column].name)
            elif self.data[column].dtype == np.dtype('float64'):
                continues.append(self.data[column].name)
            elif self.data[column].dtype == np.dtype('bool'):
                boolean.append(self.data[column].name)
            else:
                categorical.append(self.data[column].name)    
        self.variable_types = {"discreet": discreet, "continues": continues, "boolean": boolean, "categorical": categorical}

        print("Done.")
    
    
    def drop_uselessf(self):
        """ Drop useless columns """
        print('[2/5] Dropping useless features... ', end='')
        
        if "Unnamed: 0" in self.data.columns:
            self.dropped_columns.append('Unnamed: 0')
            self.data.drop(columns=['Unnamed: 0'], inplace=True)
            
        for column in self.data.columns:
            if self.data[column].isna().sum == len(self.data[column]):
                self.dropped_columns.append(self.data[column].name)
                self.data.drop(columns=[column], inplace=True)
                
        print("Done.")
        
        
    def deal_duplicate(self):
        """ TODO : Supprimer les lignes dupliquées du dataset """
        print('[3/5] TODO ')
        pass
    
    
    def drop_nanp(self, threshold: float):
        """ TODO : Supprimer les colonnes ayant un certain pourcentage de NA dans le dataset """
        print('[4/5] TODO ')
        pass
    
    
    def deal_dtime(self):
        """ TODO : Traiter les DateTime """
        print('[5/5] TODO ')
        pass

    
    def prepare_data(self, threshold: float):
        print("===| FeatureRecipe |===")
        feature_recipe.separate_variable_types()
        feature_recipe.drop_uselessf()
        feature_recipe.deal_duplicate()
        feature_recipe.drop_nanp(threshold)
        feature_recipe.deal_dtime()
        
        print()
        print("Variable types :")
        for vtype in self.variable_types.keys():
            print("- " + str(vtype) + " : " + str(self.variable_types[vtype]))
        print()
        print("Dropped columns : " + str(self.dropped_columns))
        print()


In [8]:
feature_recipe = FeatureRecipe(df)
feature_recipe.prepare_data(5)

===| FeatureRecipe |===
[1/5] Separate variable types... Done.
[2/5] Dropping useless features... Done.
[3/5] TODO 
[4/5] TODO 
[5/5] TODO 

Variable types :
- discreet : ['Unnamed: 0', 'listing_id', 'person_capacity', 'beds', 'bedrooms']
- continues : ['latitude', 'longitude', 'bathrooms', 'pricing_weekly_factor', 'pricing_monthly_factor', 'local_price']
- boolean : ['is_rebookable', 'is_new_listing', 'is_fully_refundable', 'is_host_highly_rated', 'is_business_travel_ready']
- categorical : ['name', 'type', 'city', 'neighborhood']

Dropped columns : ['Unnamed: 0']



In [9]:
df.head()

Unnamed: 0,listing_id,name,type,city,neighborhood,latitude,longitude,person_capacity,beds,bedrooms,bathrooms,is_rebookable,is_new_listing,is_fully_refundable,is_host_highly_rated,is_business_travel_ready,pricing_weekly_factor,pricing_monthly_factor,local_price
0,28581061,La maison Clery,private_room,Paris,2e arrondissement,48.869292,2.348335,1,1,1,2.0,False,True,True,False,False,1.0,1.0,45.948454
1,661961,studio PARIS PLACE EDITH PIAF 75020,entire_home,Paris,,48.867284,2.403255,2,1,1,1.0,False,False,True,True,False,0.88,0.69,44.473684
2,1261705,chambre privée à louer @ paris oberkampf,private_room,Paris,,48.867894,2.375897,1,1,1,1.0,False,False,True,True,False,1.0,1.0,49.0
3,1318834,Appartement au coeur du Marais,entire_home,Paris,République,48.87037,2.35851,3,2,2,1.0,False,False,True,False,False,0.82,0.48,165.0
4,1677091,Lovely & Quiet flat,entire_home,Paris,Buttes-Chaumont - Belleville,48.874149,2.3737,2,1,1,1.0,False,False,True,True,False,0.95,0.9,69.774436


In [10]:
df.shape

(999, 19)