# Notebook Cleaning

In [186]:
import pandas as pd
import numpy as np
import seaborn as sns

In [187]:
url = r"..\data\raw\raw_huis_te_koop.csv"
house = pd.read_csv(url, sep=",")
url = r"..\data\raw\raw_apartement_te_koop.csv"
app = pd.read_csv(url, sep=",")

### Remove duplicates

In [188]:
def removedup_id(df):
    dup = df.duplicated(subset=["property_id"]).sum()
    print(dup)
    df.drop_duplicates(subset=["property_id"],keep="first", inplace=True)
    dup = df.duplicated(subset=["property_id"]).sum()
    print(dup)
    

In [189]:
removedup_id(house)
removedup_id(app)

2239
0
122
0


### drop streetname and nr

In [190]:
def remove_street_nr(df):
    df.drop(["street_name", "house_number"], axis="columns", inplace= True)
    return df


In [191]:
remove_street_nr(house)
remove_street_nr(app)

Unnamed: 0,property_id,locality_name,postal_code,latitude,longitude,property_type,property_subtype,price,type_of_sale,number_of_rooms,...,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_good,number_of_facades,swimming_pool,state_of_building
0,11151864,Turnhout,2300,51.316287,4.932205,HOUSE,HOUSE,175000.0,BUY_REGULAR,,...,,1,1.0,,1.0,,227.0,3.0,,TO_RENOVATE
1,11154023,Beringen,3582,51.065132,5.227231,HOUSE,HOUSE,249000.0,BUY_REGULAR,1.0,...,,0,,,,,1186.0,4.0,,TO_BE_DONE_UP
2,11141961,SAINT-JOSSE-TEN-NOODE,1210,,,APARTMENT,FLAT_STUDIO,120000.0,BUY_REGULAR,,...,,0,1.0,6.0,,,,,,TO_BE_DONE_UP
3,11150522,Gavere,9890,50.914593,3.650555,HOUSE_GROUP,HOUSE_GROUP,,BUY_REGULAR,,...,,0,,,,,,,,
4,11145864,Comblain-Fairon,4180,50.446027,5.542826,HOUSE,HOUSE,499000.0,BUY_REGULAR,21.0,...,,0,,,,,1323.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16571,10795049,Aalter,9880,51.087235,3.448468,APARTMENT,APARTMENT,378000.0,BUY_REGULAR,10.0,...,,0,1.0,42.0,,,,,,
16572,11089010,Woluwe-Saint-Lambert,1200,50.840846,4.421270,APARTMENT,DUPLEX,335000.0,BUY_REGULAR,,...,0.0,0,1.0,31.0,,,,2.0,,GOOD
16573,11150113,Boutersem,3370,50.831096,4.829753,HOUSE,VILLA,499000.0,BUY_REGULAR,,...,0.0,0,1.0,,,,1528.0,4.0,,JUST_RENOVATED
16574,11153256,Laeken,1020,50.874875,4.339214,APARTMENT,APARTMENT,208000.0,BUY_REGULAR,,...,0.0,0,,,,,,,,GOOD


### Remove app in house and house in app

In [192]:
house.groupby("property_type").count()
app.groupby("property_type").count()

Unnamed: 0_level_0,property_id,locality_name,postal_code,latitude,longitude,property_subtype,price,type_of_sale,number_of_rooms,living_area,...,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_good,number_of_facades,swimming_pool,state_of_building
property_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
APARTMENT,9980,9980,9980,8874,8874,9980,9980,9980,3015,9663,...,4045,9980,6834,5330,805,635,0,6304,3893,7593
APARTMENT_GROUP,1178,1178,1178,1043,1043,1178,0,1178,0,0,...,30,1178,0,0,0,0,0,0,0,0
HOUSE,5185,5185,5185,4245,4245,5185,5185,5185,1327,4701,...,1560,5185,2697,1231,1921,1428,5185,4240,1306,3838
HOUSE_GROUP,111,111,111,93,93,111,0,111,0,0,...,1,111,0,0,0,0,0,0,0,0


In [201]:
def strip(df, columnname):
    df['columnname'] = df['columnname'].str.strip()
    return df

In [194]:
house = strip(house)
app = strip(app)

In [195]:
## ~ this symbol means that all the things like house and house_group are exclude out of the new dataframe
def remove_house_in_app(df):
    df = df[~df["property_type"].isin(["HOUSE", "HOUSE_GROUP"])]
    return df


In [196]:
def remove_app_in_house(df):
    df = df[df["property_type"].isin(["HOUSE", "HOUSE_GROUP"])]
    return df

In [197]:
house = remove_app_in_house(house)
app = remove_house_in_app(app)


### Dataframe Info

In [198]:
house.info()
app.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13904 entries, 0 to 19970
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   property_id             13904 non-null  int64  
 1   locality_name           13904 non-null  object 
 2   postal_code             13904 non-null  object 
 3   latitude                12034 non-null  float64
 4   longitude               12034 non-null  float64
 5   property_type           13904 non-null  object 
 6   property_subtype        13904 non-null  object 
 7   price                   13302 non-null  float64
 8   type_of_sale            13904 non-null  object 
 9   number_of_rooms         3962 non-null   float64
 10  living_area             12492 non-null  float64
 11  kitchen_type            8391 non-null   object 
 12  fully_equipped_kitchen  10193 non-null  float64
 13  furnished               4589 non-null   float64
 14  open_fire               13904 non-null  int

In [199]:
house.groupby("property_type").count()


Unnamed: 0_level_0,property_id,locality_name,postal_code,latitude,longitude,property_subtype,price,type_of_sale,number_of_rooms,living_area,...,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_good,number_of_facades,swimming_pool,state_of_building
property_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOUSE,13302,13302,13302,11473,11473,13302,13302,13302,3962,12492,...,4574,13302,7598,3880,5192,4349,13302,11156,4522,10515
HOUSE_GROUP,602,602,602,561,561,602,0,602,0,0,...,15,602,0,0,0,0,0,0,0,0


In [200]:
app.groupby("property_type").count()

Unnamed: 0_level_0,property_id,locality_name,postal_code,latitude,longitude,property_subtype,price,type_of_sale,number_of_rooms,living_area,...,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_good,number_of_facades,swimming_pool,state_of_building
property_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
APARTMENT,9980,9980,9980,8874,8874,9980,9980,9980,3015,9663,...,4045,9980,6834,5330,805,635,0,6304,3893,7593
APARTMENT_GROUP,1178,1178,1178,1043,1043,1178,0,1178,0,0,...,30,1178,0,0,0,0,0,0,0,0
