# Project Housing Price

## Part 4: Feature Engineering

In this notebook, you will do the following:
1. one-hot encode flat_type
2. one-hot encode town
3. one-hot encode flat_model
4. combine the DataFrames containing the one-hot encoded categorical values with the original DataFrame
5. export the large DataFrame as CSV

In [1]:
# Step 1: import your library
import pandas as pd

In [2]:
# Step 2: Load your CSV
df = pd.read_csv('housing_data_cleaned.csv')
df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,year,real_month,remaining_lease
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,1990,1,86
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0,1990,1,86
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0,1990,1,86
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0,1990,1,86
4,1990-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0,1990,1,85
...,...,...,...,...,...,...,...,...,...,...,...,...,...
846103,2021-06,YISHUN,5 ROOM,511B,YISHUN ST 51,07 TO 09,113.0,IMPROVED,2017,593000.0,2021,6,95
846104,2021-06,YISHUN,5 ROOM,505D,YISHUN ST 51,10 TO 12,113.0,IMPROVED,2016,580000.0,2021,6,93
846105,2021-06,YISHUN,EXECUTIVE,664,YISHUN AVE 4,01 TO 03,181.0,APARTMENT,1992,868000.0,2021,6,69
846106,2021-06,YISHUN,EXECUTIVE,277,YISHUN ST 22,01 TO 03,152.0,MAISONETTE,1985,585000.0,2021,6,63


In [3]:
# Step 3: Declare a variable and store the dummified/one-hot encoded values from 'flat_type'
df_flat_type = pd.get_dummies(df['flat_type'], drop_first = True)
df_flat_type

Unnamed: 0,2 ROOM,3 ROOM,4 ROOM,5 ROOM,EXECUTIVE,MULTI GENERATION
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,1,0,0,0,0
...,...,...,...,...,...,...
846103,0,0,0,1,0,0
846104,0,0,0,1,0,0
846105,0,0,0,0,1,0
846106,0,0,0,0,1,0


In [4]:
# Declare a variable and store the dummified/one-hot encoded values from 'town'
df_town = pd.get_dummies(df['town'], drop_first = True)
df_town

Unnamed: 0,BEDOK,BISHAN,BUKIT BATOK,BUKIT MERAH,BUKIT PANJANG,BUKIT TIMAH,CENTRAL AREA,CHOA CHU KANG,CLEMENTI,GEYLANG,...,PASIR RIS,PUNGGOL,QUEENSTOWN,SEMBAWANG,SENGKANG,SERANGOON,TAMPINES,TOA PAYOH,WOODLANDS,YISHUN
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
846104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
846105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
846106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# Declare a variable and store the dummified/one-hot encoded values from 'flat_model'
df_flat_model = pd.get_dummies(df['flat_model'], drop_first = True)
df_flat_model

Unnamed: 0,ADJOINED FLAT,APARTMENT,DBSS,IMPROVED,IMPROVED-MAISONETTE,MAISONETTE,MODEL A,MODEL A-MAISONETTE,MODEL A2,MULTI GENERATION,NEW GENERATION,PREMIUM APARTMENT,PREMIUM APARTMENT LOFT,PREMIUM MAISONETTE,SIMPLIFIED,STANDARD,TERRACE,TYPE S1,TYPE S2
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846103,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
846104,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
846105,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
846106,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Step 6: concatenate all of the DataFrames together
final_df = pd.concat([df, df_flat_model, df_flat_type, df_town], axis = 1)
final_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,...,PASIR RIS,PUNGGOL,QUEENSTOWN,SEMBAWANG,SENGKANG,SERANGOON,TAMPINES,TOA PAYOH,WOODLANDS,YISHUN
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,...,0,0,0,0,0,0,0,0,0,0
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0,...,0,0,0,0,0,0,0,0,0,0
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0,...,0,0,0,0,0,0,0,0,0,0
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0,...,0,0,0,0,0,0,0,0,0,0
4,1990-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846103,2021-06,YISHUN,5 ROOM,511B,YISHUN ST 51,07 TO 09,113.0,IMPROVED,2017,593000.0,...,0,0,0,0,0,0,0,0,0,1
846104,2021-06,YISHUN,5 ROOM,505D,YISHUN ST 51,10 TO 12,113.0,IMPROVED,2016,580000.0,...,0,0,0,0,0,0,0,0,0,1
846105,2021-06,YISHUN,EXECUTIVE,664,YISHUN AVE 4,01 TO 03,181.0,APARTMENT,1992,868000.0,...,0,0,0,0,0,0,0,0,0,1
846106,2021-06,YISHUN,EXECUTIVE,277,YISHUN ST 22,01 TO 03,152.0,MAISONETTE,1985,585000.0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Step 7: Export the expanded DataFrame as CSV
final_df.to_csv('housing_data_cleaned_final.csv', index = None)