In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
import warnings
# NOTE: this silences every warning (including pandas / sklearn deprecation
# notices) for the rest of the notebook.
warnings.filterwarnings('ignore')
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old aliases were later removed, so use whichever name this installation has.
_pastel_style = 'seaborn-v0_8-pastel' if 'seaborn-v0_8-pastel' in plt.style.available else 'seaborn-pastel'
plt.style.use(_pastel_style)

In [None]:
# Load the training set straight from the project repository
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/altheaxcvii/project_2/charles/datasets/train.csv'
data = pd.read_csv(TRAIN_CSV_URL)

## Data Dictionary (I collapsed it because it is very long but it's here!)

| **Column names** | **Descriptions** |
|---|---|
| resale_price | the property's sale price in Singapore dollars. This is the target variable that you're trying to predict for this challenge. |
| Tranc_YearMonth | year and month of the resale transaction e.g. 2015-02 |
| town | HDB township where the flat is located e.g. BUKIT MERAH |
| flat_type | type of the resale flat unit e.g. 3 ROOM |
| block | block number of the resale flat e.g. 454 |
| street_name | street name where the resale flat resides e.g. TAMPINES ST 42 |
| storey_range | floor level (range) of the resale flat unit e.g. 07 TO 09 |
| floor_area_sqm | floor area of the resale flat unit in square metres |
| price_per_sqft | Price per Square Foot of the unit |
| flat_model | HDB model of the resale flat e.g. Multi Generation |
| lease_commence_date | commencement year of the flat unit's 99-year lease |
| Tranc_Year | year of resale transaction |
| Tranc_Month | month of resale transaction |
| mid_storey | median value of storey_range |
| lower | lower value of storey_range |
| 2room_rental | 2 room rental flat |
| 3room_rental | 3 room rental flat |
| 4room_rental | 4 room rental flat |
| postal | postal code |
| other_room_rental | other room rental flat |
| upper | upper value of storey_range |
| mid | middle value of storey_range |
| full_flat_type | combination of flat_type and flat_model |
| address | combination of block and street_name |
| floor_area_sqft | floor area of the resale flat unit in square feet |
| hdb_age | number of years from lease_commence_date to present year |
| max_floor_lvl | highest floor of the resale flat |
| year_completed | year which construction was completed for resale flat |
| residential | boolean value if resale flat has residential units in the same block |
| commercial | boolean value if resale flat has commercial units in the same block |
| market_hawker | boolean value if resale flat has a market or hawker centre in the same block |
| multistorey_carpark | boolean value if resale flat has a multistorey carpark in the same block |
| precinct_pavilion | boolean value if resale flat has a pavilion in the same block |
| total_dwelling_units | total number of residential dwelling units in the resale flat |
| Latitude | Latitude of the unit |
| Longitude | Longitude of the unit |
| planning_area | planning area of the unit |
| pri_sch_nearest_distance | distance of unit to the nearest primary school |
| 1room_sold | number of 1-room residential units in the resale flat |
| 2room_sold | number of 2-room residential units in the resale flat |
| 3room_sold | number of 3-room residential units in the resale flat |
| 4room_sold | number of 4-room residential units in the resale flat |
| 5room_sold | number of 5-room residential units in the resale flat |
| exec_sold | number of executive type residential units in the resale flat block |
| pri_sch_name | name of the nearest primary school |
| vacancy | vacancy of the unit |
| pri_sch_affiliation | affiliation of primary school |
| pri_sch_latitude | latitude of primary school |
| pri_sch_longitude | longitude of primary school |
| sec_sch_nearest_dist | distance to nearest secondary school |
| sec_sch_name | name of nearest secondary school |
| cutoff_point | PSLE cutoff point of nearest secondary school |
| affiliation | if there is affiliation for the nearest secondary school |
| sec_sch_latitude | latitude of secondary school |
| sec_sch_longitude | longitude of secondary school |
| multigen_sold | number of multi-generational type residential units in the resale flat block |
| mrt_nearest_distance | distance to nearest mrt |
| mrt_name | name of nearest mrt |
| bus_interchange | if there is a bus interchange |
| mrt_interchange | if there is an mrt interchange |
| mrt_latitude | latitude of mrt |
| mrt_longitude | longitude of mrt |
| bus_stop_nearest_distance | distance to nearest bus stop |
| bus_stop_name | name of bus stop |
| bus_stop_latitude | latitude of bus stop |
| bus_stop_longitude | longitude of bus stop |
| Mall_Nearest_Distance | Distance to the nearest mall |
| Mall_Within_500m | How many malls within 500m of the unit |
| Mall_Within_1km | How many malls within 1km of the unit |
| Mall_Within_2km | How many malls within 2km of the unit |
| Hawker_Nearest_Distance | Distance to nearest Hawker Center |
| Hawker_Within_500m | How many Hawker Centers within 500m of the unit |
| Hawker_Within_1km | How many Hawker Centers within 1km of the unit |
| Hawker_Within_2km | How many Hawker Centers within 2km of the unit |
| studio_apartment_sold | number of studio apartment type residential units in the resale flat block |
| 1room_rental | number of 1-room rental residential units in the resale flat block |
| hawker_food_stalls | number of stalls at nearest hawker centre |
| hawker_market_stalls | number of market stalls at nearest hawker centre |

## Data Cleaning and Preliminary EDA

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150634 entries, 0 to 150633
Data columns (total 78 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         150634 non-null  int64  
 1   Tranc_YearMonth            150634 non-null  object 
 2   town                       150634 non-null  object 
 3   flat_type                  150634 non-null  object 
 4   block                      150634 non-null  object 
 5   street_name                150634 non-null  object 
 6   storey_range               150634 non-null  object 
 7   floor_area_sqm             150634 non-null  float64
 8   flat_model                 150634 non-null  object 
 9   lease_commence_date        150634 non-null  int64  
 10  resale_price               150634 non-null  float64
 11  Tranc_Year                 150634 non-null  int64  
 12  Tranc_Month                150634 non-null  int64  
 13  mid_storey                 15

In [None]:
# Normalise every column label to snake_case: lower-case, spaces -> underscores
data.columns = [label.lower().replace(' ', '_') for label in data.columns]

In [None]:
# price_per_sqft cannot be used as a predictor, so remove it
data = data.drop(columns='price_per_sqft')

In [None]:
# A column holding a single distinct value carries no signal for a model,
# so list the distinct-value count (NaN counted as a value) of every column.
for column_name in data.columns:
    print(column_name, data[column_name].nunique(dropna=False))

id 150634
tranc_yearmonth 110
town 26
flat_type 7
block 2514
street_name 553
storey_range 25
floor_area_sqm 169
flat_model 20
lease_commence_date 54
resale_price 3295
tranc_year 10
tranc_month 12
mid_storey 22
lower 22
upper 23
mid 22
full_flat_type 43
address 9157
floor_area_sqft 169
hdb_age 54
max_floor_lvl 41
year_completed 59
residential 1
commercial 2
market_hawker 2
multistorey_carpark 2
precinct_pavilion 2
total_dwelling_units 320
1room_sold 2
2room_sold 99
3room_sold 255
4room_sold 200
5room_sold 133
exec_sold 103
multigen_sold 7
studio_apartment_sold 61
1room_rental 11
2room_rental 48
3room_rental 15
other_room_rental 3
postal 17483
latitude 9126
longitude 9098
planning_area 32
mall_nearest_distance 8983
mall_within_500m 7
mall_within_1km 16
mall_within_2km 42
hawker_nearest_distance 9126
hawker_within_500m 6
hawker_within_1km 10
hawker_within_2km 20
hawker_food_stalls 52
hawker_market_stalls 70
mrt_nearest_distance 9126
mrt_name 94
bus_interchange 2
mrt_interchange 2
mrt_lati

In [None]:
# Drop 'residential' (only one unique value) and 'id' (used purely for identification)
data = data.drop(columns=['residential', 'id'])

In [None]:
data.columns

Index(['tranc_yearmonth', 'town', 'flat_type', 'block', 'street_name',
       'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
       'resale_price', 'tranc_year', 'tranc_month', 'mid_storey', 'lower',
       'upper', 'mid', 'full_flat_type', 'address', 'floor_area_sqft',
       'hdb_age', 'max_floor_lvl', 'year_completed', 'commercial',
       'market_hawker', 'multistorey_carpark', 'precinct_pavilion',
       'total_dwelling_units', '1room_sold', '2room_sold', '3room_sold',
       '4room_sold', '5room_sold', 'exec_sold', 'multigen_sold',
       'studio_apartment_sold', '1room_rental', '2room_rental', '3room_rental',
       'other_room_rental', 'postal', 'latitude', 'longitude', 'planning_area',
       'mall_nearest_distance', 'mall_within_500m', 'mall_within_1km',
       'mall_within_2km', 'hawker_nearest_distance', 'hawker_within_500m',
       'hawker_within_1km', 'hawker_within_2km', 'hawker_food_stalls',
       'hawker_market_stalls', 'mrt_nearest_distance', 

In [None]:
data.isnull().sum().sort_values().tail(8)

max_floor_lvl                0
mall_nearest_distance      829
mall_within_2km           1940
mall_within_1km          25426
hawker_within_2km        29202
hawker_within_1km        60868
mall_within_500m         92789
hawker_within_500m       97390
dtype: int64

In [None]:
#Exploring Null Rows - looks like null value are 0 (i.e. no malls within stated distance)
data[data['mall_nearest_distance'].isnull()][['mall_nearest_distance', 'mall_within_500m', 'mall_within_1km', 'mall_within_2km']]

Unnamed: 0,mall_nearest_distance,mall_within_500m,mall_within_1km,mall_within_2km
75,,,,
321,,,,
478,,,,
643,,,,
691,,,,
...,...,...,...,...
150296,,,,
150388,,,,
150394,,,,
150479,,,,


In [None]:
# Fill missing values with 0: a missing "within X" count is read as no mall /
# hawker centre inside that radius.
# NOTE(review): mall_nearest_distance is zero-filled as well, which reads as
# "a mall at the doorstep" rather than "no mall nearby" - confirm this is intended.
col_with_null = data.columns[data.isnull().any()].to_list()
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
if col_with_null:
    # One fit/transform over all affected columns instead of one per column;
    # skipped entirely when nothing is missing (e.g. when the cell is re-run).
    data[col_with_null] = imputer.fit_transform(data[col_with_null])

In [None]:
#Since machine learning can only take numeric inputs, we will have to convert useful categorical columns to numeric form 
data.select_dtypes(include='object').columns

Index(['tranc_yearmonth', 'town', 'flat_type', 'block', 'street_name',
       'storey_range', 'flat_model', 'full_flat_type', 'address', 'commercial',
       'market_hawker', 'multistorey_carpark', 'precinct_pavilion', 'postal',
       'planning_area', 'mrt_name', 'bus_stop_name', 'pri_sch_name',
       'sec_sch_name'],
      dtype='object')

In [None]:
#Checking if tranc_yearmonth is consistent with 'tranc_year', 'tranc_month'
data[['tranc_yearmonth','tranc_year', 'tranc_month']]

Unnamed: 0,tranc_yearmonth,tranc_year,tranc_month
0,2016-05,2016,5
1,2012-07,2012,7
2,2013-07,2013,7
3,2012-04,2012,4
4,2017-12,2017,12
...,...,...,...
150629,2020-09,2020,9
150630,2017-06,2017,6
150631,2020-12,2020,12
150632,2016-05,2016,5


In [None]:
# tranc_yearmonth is kept as the separate tranc_year / tranc_month columns, so drop it
data = data.drop(columns='tranc_yearmonth')

In [None]:
#checking if storey_range column is consistent with 'mid_storey', 'lower', 'upper', 'mid'
data[['storey_range','mid_storey', 'lower', 'upper', 'mid']]

Unnamed: 0,storey_range,mid_storey,lower,upper,mid
0,10 TO 12,11,10,12,11
1,07 TO 09,8,7,9,8
2,13 TO 15,14,13,15,14
3,01 TO 05,3,1,5,3
4,01 TO 03,2,1,3,2
...,...,...,...,...,...
150629,04 TO 06,5,4,6,5
150630,04 TO 06,5,4,6,5
150631,10 TO 12,11,10,12,11
150632,07 TO 09,8,7,9,8


In [None]:
# Count the rows where mid_storey and mid agree / disagree
data['mid_storey'].eq(data['mid']).value_counts()

True    150634
dtype: int64

In [None]:
# Remove storey_range and mid_storey (mid_storey matched mid on every row above)
data = data.drop(columns=['storey_range', 'mid_storey'])

In [None]:
#checking if town == planning_area 
data[['town', 'planning_area']]

Unnamed: 0,town,planning_area
0,KALLANG/WHAMPOA,Kallang
1,BISHAN,Bishan
2,BUKIT BATOK,Bukit Batok
3,BISHAN,Bishan
4,YISHUN,Yishun
...,...,...
150629,WOODLANDS,Woodlands
150630,JURONG WEST,Jurong West
150631,BEDOK,Bedok
150632,QUEENSTOWN,Queenstown


In [None]:
# Lower-case both labels so town and planning_area can be compared directly
for label_col in ('town', 'planning_area'):
    data[label_col] = data[label_col].str.lower()

In [None]:
data[['town', 'planning_area']]

Unnamed: 0,town,planning_area
0,kallang/whampoa,kallang
1,bishan,bishan
2,bukit batok,bukit batok
3,bishan,bishan
4,yishun,yishun
...,...,...
150629,woodlands,woodlands
150630,jurong west,jurong west
150631,bedok,bedok
150632,queenstown,queenstown


In [None]:
(data['town'] == data['planning_area']).value_counts()

True     144931
False      5703
dtype: int64

In [None]:
# Rows where the town and planning_area labels disagree
data.loc[data['town'] != data['planning_area'], ['town', 'planning_area']]

Unnamed: 0,town,planning_area
0,kallang/whampoa,kallang
32,kallang/whampoa,novena
35,kallang/whampoa,kallang
55,kallang/whampoa,kallang
88,central area,rochor
...,...,...
150507,kallang/whampoa,kallang
150510,kallang/whampoa,kallang
150511,kallang/whampoa,kallang
150524,central area,outram


In [None]:
# planning_area is the more precise label and mostly repeats town, so town is removed
data = data.drop(columns='town')

In [None]:
# Count the rows where flat_type and full_flat_type hold the same string
data['flat_type'].eq(data['full_flat_type']).value_counts()

False    150634
dtype: int64

In [None]:
data[['flat_type', 'full_flat_type', 'flat_model']]

Unnamed: 0,flat_type,full_flat_type,flat_model
0,4 ROOM,4 ROOM Model A,Model A
1,5 ROOM,5 ROOM Improved,Improved
2,EXECUTIVE,EXECUTIVE Apartment,Apartment
3,4 ROOM,4 ROOM Model A,Model A
4,4 ROOM,4 ROOM Simplified,Simplified
...,...,...,...
150629,EXECUTIVE,EXECUTIVE Apartment,Apartment
150630,5 ROOM,5 ROOM Improved,Improved
150631,EXECUTIVE,EXECUTIVE Apartment,Apartment
150632,3 ROOM,3 ROOM Improved,Improved


In [None]:
# Remove full_flat_type
data = data.drop(columns='full_flat_type')

In [None]:
# Distinct-value count (NaN counted as a value) of each remaining object column
for column_name in data.select_dtypes(include='object').columns:
    print(column_name, data[column_name].nunique(dropna=False))

flat_type 7
block 2514
street_name 553
flat_model 20
address 9157
commercial 2
market_hawker 2
multistorey_carpark 2
precinct_pavilion 2
postal 17483
planning_area 32
mrt_name 94
bus_stop_name 1657
pri_sch_name 177
sec_sch_name 134


In [None]:
# These text columns have too many unique values to be useful, so remove them
high_cardinality_cols = ['block', 'street_name', 'address', 'postal', 'bus_stop_name']
data = data.drop(columns=high_cardinality_cols)

In [None]:
#Checking remaining columns
data.columns

Index(['flat_type', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
       'resale_price', 'tranc_year', 'tranc_month', 'lower', 'upper', 'mid',
       'floor_area_sqft', 'hdb_age', 'max_floor_lvl', 'year_completed',
       'commercial', 'market_hawker', 'multistorey_carpark',
       'precinct_pavilion', 'total_dwelling_units', '1room_sold', '2room_sold',
       '3room_sold', '4room_sold', '5room_sold', 'exec_sold', 'multigen_sold',
       'studio_apartment_sold', '1room_rental', '2room_rental', '3room_rental',
       'other_room_rental', 'latitude', 'longitude', 'planning_area',
       'mall_nearest_distance', 'mall_within_500m', 'mall_within_1km',
       'mall_within_2km', 'hawker_nearest_distance', 'hawker_within_500m',
       'hawker_within_1km', 'hawker_within_2km', 'hawker_food_stalls',
       'hawker_market_stalls', 'mrt_nearest_distance', 'mrt_name',
       'bus_interchange', 'mrt_interchange', 'mrt_latitude', 'mrt_longitude',
       'bus_stop_nearest_distance', 'bus_sto

In [None]:
# Remove every latitude / longitude column (unit, MRT, bus stop, primary and secondary school)
coordinate_cols = [
    f'{prefix}{axis}'
    for prefix in ('', 'mrt_', 'bus_stop_', 'pri_sch_', 'sec_sch_')
    for axis in ('latitude', 'longitude')
]
data = data.drop(columns=coordinate_cols)

In [None]:
# floor_area_sqft is removed because floor_area_sqm is kept
data = data.drop(columns='floor_area_sqft')

In [None]:
data.columns

Index(['flat_type', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
       'resale_price', 'tranc_year', 'tranc_month', 'lower', 'upper', 'mid',
       'hdb_age', 'max_floor_lvl', 'year_completed', 'commercial',
       'market_hawker', 'multistorey_carpark', 'precinct_pavilion',
       'total_dwelling_units', '1room_sold', '2room_sold', '3room_sold',
       '4room_sold', '5room_sold', 'exec_sold', 'multigen_sold',
       'studio_apartment_sold', '1room_rental', '2room_rental', '3room_rental',
       'other_room_rental', 'planning_area', 'mall_nearest_distance',
       'mall_within_500m', 'mall_within_1km', 'mall_within_2km',
       'hawker_nearest_distance', 'hawker_within_500m', 'hawker_within_1km',
       'hawker_within_2km', 'hawker_food_stalls', 'hawker_market_stalls',
       'mrt_nearest_distance', 'mrt_name', 'bus_interchange',
       'mrt_interchange', 'bus_stop_nearest_distance',
       'pri_sch_nearest_distance', 'pri_sch_name', 'vacancy',
       'pri_sch_affiliation', 

In [None]:
# hdb_age is kept, so lease_commence_date and year_completed are removed
data = data.drop(columns=['lease_commence_date', 'year_completed'])

In [None]:
data.select_dtypes(include='object').columns

Index(['flat_type', 'flat_model', 'commercial', 'market_hawker',
       'multistorey_carpark', 'precinct_pavilion', 'planning_area', 'mrt_name',
       'pri_sch_name', 'sec_sch_name'],
      dtype='object')

In [None]:
data.select_dtypes(include=('int64', 'float64')).columns

Index(['floor_area_sqm', 'resale_price', 'tranc_year', 'tranc_month', 'lower',
       'upper', 'mid', 'hdb_age', 'max_floor_lvl', 'total_dwelling_units',
       '1room_sold', '2room_sold', '3room_sold', '4room_sold', '5room_sold',
       'exec_sold', 'multigen_sold', 'studio_apartment_sold', '1room_rental',
       '2room_rental', '3room_rental', 'other_room_rental',
       'mall_nearest_distance', 'mall_within_500m', 'mall_within_1km',
       'mall_within_2km', 'hawker_nearest_distance', 'hawker_within_500m',
       'hawker_within_1km', 'hawker_within_2km', 'hawker_food_stalls',
       'hawker_market_stalls', 'mrt_nearest_distance', 'bus_interchange',
       'mrt_interchange', 'bus_stop_nearest_distance',
       'pri_sch_nearest_distance', 'vacancy', 'pri_sch_affiliation',
       'sec_sch_nearest_dist', 'cutoff_point', 'affiliation'],
      dtype='object')

### cleaning stops here

-------------------

# EDA and Features Selection
<small>Additional Guidelines for Project2:<br>
Please sign up for the project 2 kaggle competition. You will then be able to upload your predictions for the kaggle test set and get your scores.<br>
Do not use any library such as “pandas-profiling” that automate the EDA process for you. We want you to be familiar with going through EDA steps on your own.<br>
Do not include the feature “price_per_sqft” in your models as this would result in target leakage.<br>
Do not use any imputation methods that are more complex than regression or KNN imputation<br>
Do not use advanced models such as neural networks, xgboost, etc. Based on the README.md file, you would need to have 3 models (linear regression, lasso, and ridge)<br>
Do try to limit the number of features in your final model to <30 features if possible.<br>
Ensure each member has enough content to cover and time to speak</small>

### Codes

In [None]:
# Define a function to compute correlations and plot the graph
def compute_correlations_and_plot(df, category_name):
    """Plot each numeric column's correlation with ``resale_price`` as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``resale_price`` column plus the columns to compare.
    category_name : str
        Label inserted into the chart title.

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``; nothing is returned.
    """
    # Restrict to numeric columns first: DataFrame.corr() raises on string
    # columns in pandas >= 2.0 instead of silently skipping them.
    numeric_df = df.select_dtypes(include='number')
    correlations = numeric_df.corr()['resale_price'].drop('resale_price')

    # A category made up only of text columns leaves nothing to plot.
    if correlations.empty:
        print(f'No numeric columns to correlate with resale_price for {category_name}')
        return

    # Sort the correlations, strongest positive first
    sorted_correlations = correlations.sort_values(ascending=False)

    # Plot the correlations
    plt.figure(figsize=(10, 6))
    sns.barplot(x=sorted_correlations.index, y=sorted_correlations.values)
    plt.xticks(rotation=90)
    plt.xlabel('Columns')
    plt.ylabel('Correlation with Resale Price')
    plt.title(f'Correlations with Resale Price for {category_name}')
    plt.tight_layout()
    plt.show()




# Compute correlations and plot graphs for each category
# NOTE: the *_df frames used here are not created anywhere above this cell; a
# later cell builds them from `data`, so running this cell first raises
# NameError. Run it only after those frames exist.
compute_correlations_and_plot(property_details_df, 'Property Details')
compute_correlations_and_plot(sold_rental_units_df, 'Sold and Rental Units')
compute_correlations_and_plot(location_df, 'Location')
compute_correlations_and_plot(facilities_df, 'Facilities')


NameError: ignored

In [None]:

# `sorted_correlations` otherwise only exists as a local variable inside
# compute_correlations_and_plot, so build it here: the correlation of every
# numeric column of `data` with resale_price, strongest positive first.
sorted_correlations = (
    data.select_dtypes(include='number')
    .corr()['resale_price']
    .drop('resale_price')
    .sort_values(ascending=False)
)

# Plot the correlations
plt.figure(figsize=(10, 6))
barplot = sns.barplot(x=sorted_correlations.index, y=sorted_correlations.values)

# Rotate x-axis labels for better visibility
plt.xticks(rotation=90)

# Add correlation values on top of each bar
for i, value in enumerate(sorted_correlations.values):
    barplot.text(i, value, round(value, 3), ha = 'center', va='bottom', rotation=90)

plt.xlabel('Columns')
plt.ylabel('Correlation with Resale Price')
plt.title('Correlations with Resale Price')
plt.tight_layout()
plt.show()

Now, we want to split the columns based on the categories that they fall under.

In [None]:
# Map each category label to the list of columns that belong to it.
# Names follow the spelling used in the data dictionary.
column_categories = {
    'Property Details': [
        'Tranc_YearMonth', 'town', 'flat_type', 'block', 'street_name',
        'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
        'Tranc_Year', 'Tranc_Month', 'mid_storey', 'lower', 'upper', 'mid',
        'full_flat_type', 'address', 'floor_area_sqft', 'price_per_sqft',
        'hdb_age', 'max_floor_lvl', 'year_completed', 'residential',
        'commercial', 'market_hawker', 'multistorey_carpark',
        'precinct_pavilion', 'total_dwelling_units',
    ],
    'Sold and Rental Units': [
        '1room_sold', '2room_sold', '3room_sold', '4room_sold', '5room_sold',
        'exec_sold', 'multigen_sold', 'studio_apartment_sold',
        '1room_rental', '2room_rental', '3room_rental', 'other_room_rental',
    ],
    'Location': ['postal', 'Latitude', 'Longitude', 'planning_area'],
    'Schools': [
        'pri_sch_nearest_distance', 'pri_sch_name', 'vacancy',
        'pri_sch_affiliation', 'pri_sch_latitude', 'pri_sch_longitude',
        'sec_sch_nearest_dist', 'sec_sch_name', 'cutoff_point', 'affiliation',
        'sec_sch_latitude', 'sec_sch_longitude',
    ],
    'MRT and Bus': [
        'mrt_nearest_distance', 'mrt_name', 'bus_interchange',
        'mrt_interchange', 'mrt_latitude', 'mrt_longitude',
        'bus_stop_nearest_distance', 'bus_stop_name', 'bus_stop_latitude',
        'bus_stop_longitude',
    ],
    'Facilities': [
        'Mall_Nearest_Distance', 'Mall_Within_500m', 'Mall_Within_1km',
        'Mall_Within_2km', 'Hawker_Nearest_Distance', 'Hawker_Within_500m',
        'Hawker_Within_1km', 'Hawker_Within_2km', 'hawker_food_stalls',
        'hawker_market_stalls',
    ],
}

# Per-category names pointing at the very same list objects held in the dict
property_details_cols = column_categories['Property Details']
sold_rental_units_cols = column_categories['Sold and Rental Units']
location_cols = column_categories['Location']
schools_cols = column_categories['Schools']
mrt_and_bus_cols = column_categories['MRT and Bus']
facilities_cols = column_categories['Facilities']


### Creating a dataframe with all the descriptions of the data called `column_info_df`

In [None]:
# Create a dictionary with column names as keys and data descriptions as values
data_descriptions = {
    'resale_price': "the property's sale price in Singapore dollars. This is the target variable that you're trying to predict for this challenge.",
    'Tranc_YearMonth': 'year and month of the resale transaction, e.g. 2015-02',
    'town': 'HDB township where the flat is located, e.g. BUKIT MERAH',
    'flat_type': 'type of the resale flat unit, e.g. 3 ROOM',
    'block': 'block number of the resale flat, e.g. 454',
    'street_name': 'street name where the resale flat resides, e.g. TAMPINES ST 42',
    'storey_range': 'floor level (range) of the resale flat unit, e.g. 07 TO 09',
    'floor_area_sqm': 'floor area of the resale flat unit in square metres',
    'price_per_sqft': 'Price per Square Foot of the unit',
    'flat_model': 'HDB model of the resale flat, e.g. Multi Generation',
    'lease_commence_date': "commencement year of the flat unit's 99-year lease",
    'Tranc_Year': 'year of resale transaction',
    'Tranc_Month': 'month of resale transaction',
    'mid_storey': 'median value of storey_range',
    'lower': 'lower value of storey_range',
    '2room_rental': '2 room rental flat',
    '3room_rental': '3 room rental flat',
    '4room_rental': '4 room rental flat',
    'postal':'postal code',
    'other_room_rental': 'other room rental flat',
    'upper': 'upper value of storey_range',
    'mid': 'middle value of storey_range',
    'full_flat_type': 'combination of flat_type and flat_model',
    'address': 'combination of block and street_name',
    'floor_area_sqft': 'floor area of the resale flat unit in square feet',
    'hdb_age': 'number of years from lease_commence_date to present year',
    'max_floor_lvl': 'highest floor of the resale flat',
    'year_completed': 'year which construction was completed for resale flat',
    'residential': 'boolean value if resale flat has residential units in the same block',
    'commercial': 'boolean value if resale flat has commercial units in the same block',
    'market_hawker': 'boolean value if resale flat has a market or hawker centre in the same block',
    'multistorey_carpark': 'boolean value if resale flat has a multistorey carpark in the same block',
    'precinct_pavilion': 'boolean value if resale flat has a pavilion in the same block',
    'total_dwelling_units': 'total number of residential dwelling units in the resale flat',
    'Latitude':'Latitude of the unit',
    'Longitude':'Longitude of the unit',
    'planning_area':'planning area of the unit',
    'pri_sch_nearest_distance':'distance of unit to the nearest primary school',
    '1room_sold': 'number of 1-room residential units in the resale flat',
    '2room_sold': 'number of 2-room residential units in the resale flat',
    '3room_sold': 'number of 3-room residential units in the resale flat',
    '4room_sold': 'number of 4-room residential units in the resale flat',
    '5room_sold': 'number of 5-room residential units in the resale flat',
    'exec_sold': 'number of executive type residential units in the resale flat block',
    'pri_sch_name':'name of the nearest primary school',
    'vacancy':'vacancy of the unit',
    'pri_sch_affiliation':'affiliation of primary school',
    'pri_sch_latitude':'latitude of primary school',
    'pri_sch_longitude':'longitude of primary school',
    'sec_sch_nearest_dist':'distance to nearest secondary school',
    'sec_sch_name':'name of nearest secondary school',
    'cutoff_point':'PSLE cutoff point of nearest secondary school',
    'affiliation':'if there is affiliation for the nearest secondary school',
    'sec_sch_latitude':'latitude of secondary school',
    'sec_sch_longitude':'longitude of secondary school',
    'multigen_sold': 'number of multi-generational type residential units in the resale flat block',
    'mrt_nearest_distance':'distance to nearest mrt',
    'mrt_name':'name of nearest mrt',
    'bus_interchange':'if there is a bus interchange',
    'mrt_interchange':'if there is an mrt interchange',
    'mrt_latitude':'latitude of mrt',
    'mrt_longitude':'longitude of mrt',
    'bus_stop_nearest_distance':'distance to nearest bus stop',
    'bus_stop_name':'name of bus stop',
    'bus_stop_latitude':'latitude of bus stop',
    'bus_stop_longitude':'longitude of bus stop',
    'Mall_Nearest_Distance':'Distance to the nearest mall',
    'Mall_Within_500m':'How many malls within 500m of the unit',
    'Mall_Within_1km':'How many malls within 1km of the unit',
    'Mall_Within_2km':'How many malls within 2km of the unit',
    'Hawker_Nearest_Distance':'Distance to nearest Hawker Center',
    'Hawker_Within_500m':'How many Hawker Centers within 500m of the unit',
    'Hawker_Within_1km':'How many Hawker Centers within 1km of the unit',
    'Hawker_Within_2km':'How many Hawker Centers within 2km of the unit',
    'studio_apartment_sold': 'number of studio apartment type residential units in the resale flat block',
    '1room_rental':'number of 1-room rental residential units in the resale flat block', 
    'hawker_food_stalls':'number of stalls at nearest hawker centre',
    'hawker_market_stalls':'number of market stalls at nearest hawker centre',
    
}


# For every column, record the category it belongs to and its description
column_info = {
    column: {'Category': category, 'Description': data_descriptions[column]}
    for category, columns in column_categories.items()
    for column in columns
}

# Turn the mapping into a dataframe whose former index becomes a 'Column' column
column_info_df = (
    pd.DataFrame.from_dict(column_info, orient='index')
    .reset_index()
    .rename(columns={'index': 'Column'})
)

# Print the dataframe
print(column_info_df)

In [None]:
# Create a dataframe from the dictionary (one column per category, padded with NaN)
df_column_categories = pd.DataFrame({k: pd.Series(v) for k, v in column_categories.items()})

# Columns explicitly excluded from the per-category correlation frames
dropped_cols = ['residential', 'id', 'tranc_yearmonth', 'storey_range', 'mid_storey', 'town', 'flat_type',
                'block', 'street_name', 'address', 'postal', 'bus_stop_name', 'price_per_sqft', 'latitude',
                'longitude', 'mrt_latitude', 'mrt_longitude', 'bus_stop_latitude', 'bus_stop_longitude',
                'pri_sch_latitude', 'pri_sch_longitude', 'sec_sch_latitude', 'sec_sch_longitude']


def _category_columns(candidates):
    """Return the candidates still present in `data`, followed by 'resale_price'.

    The category lists may use the data-dictionary spelling (e.g. 'Tranc_Year',
    'Latitude'), whereas `data` was renamed to lower snake_case and had further
    columns removed during cleaning. Names are therefore normalised and checked
    against `data.columns`, so selecting them cannot raise a KeyError.
    """
    kept = []
    for name in candidates:
        cleaned = name.lower().replace(' ', '_')
        if cleaned == 'resale_price' or cleaned in dropped_cols or cleaned in kept:
            continue
        if cleaned in data.columns:
            kept.append(cleaned)
    # The target goes last, exactly once, so re-running the cell does not duplicate it.
    kept.append('resale_price')
    return kept


# Keep only the usable columns of each category and add 'resale_price'
property_details_cols = _category_columns(property_details_cols)
sold_rental_units_cols = _category_columns(sold_rental_units_cols)
location_cols = _category_columns(location_cols)
schools_cols = _category_columns(schools_cols)
mrt_and_bus_cols = _category_columns(mrt_and_bus_cols)
facilities_cols = _category_columns(facilities_cols)

# Create dataframes for each category
property_details_df = data[property_details_cols]
sold_rental_units_df = data[sold_rental_units_cols]
location_df = data[location_cols]
schools_df = data[schools_cols]
mrt_and_bus_df = data[mrt_and_bus_cols]
facilities_df = data[facilities_cols]

# Define a function to compute correlations and plot the graph
def compute_correlations_and_plot(df, category_name):
    """Plot each numeric column's correlation with ``resale_price`` as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``resale_price`` column plus the columns to compare.
    category_name : str
        Label inserted into the chart title.

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``; nothing is returned.
    """
    # Restrict to numeric columns first: DataFrame.corr() raises on string
    # columns in pandas >= 2.0 instead of silently skipping them.
    numeric_df = df.select_dtypes(include='number')
    correlations = numeric_df.corr()['resale_price'].drop('resale_price')

    # A category made up only of text columns leaves nothing to plot.
    if correlations.empty:
        print(f'No numeric columns to correlate with resale_price for {category_name}')
        return

    # Sort the correlations, strongest positive first
    sorted_correlations = correlations.sort_values(ascending=False)

    # Plot the correlations
    plt.figure(figsize=(10, 6))
    sns.barplot(x=sorted_correlations.index, y=sorted_correlations.values)
    plt.xticks(rotation=90)
    plt.xlabel('Columns')
    plt.ylabel('Correlation with Resale Price')
    plt.title(f'Correlations with Resale Price for {category_name}')
    plt.tight_layout()
    plt.show()

# Draw one correlation chart per feature category, in the same order as before
for category_frame, category_title in [
    (property_details_df, 'Property Details'),
    (sold_rental_units_df, 'Sold and Rental Units'),
    (location_df, 'Location'),
    (schools_df, 'Schools'),
    (mrt_and_bus_df, 'MRT and Bus'),
    (facilities_df, 'Facilities'),
]:
    compute_correlations_and_plot(category_frame, category_title)

In [None]:
# Mean resale price per nearest primary school, with the school name as a regular column
avg_resale_price = (
    data
    .groupby('pri_sch_name')['resale_price']
    .agg('mean')
    .reset_index()
)

In [None]:
# Bin the schools into three equal-sized groups (1 = lowest, 3 = highest) by average resale price.
# labels=False returns plain integer bin codes 0-2; adding 1 keeps the 1-3 labelling but as
# ordinary numbers rather than a Categorical, so a later fillna(0) on the feature matrix
# does not fail on "0 is not a category".
avg_resale_price['pri_sch_price_category'] = pd.qcut(avg_resale_price['resale_price'], 3, labels=False) + 1

# Drop any copy left by an earlier run of this cell; otherwise the merge would create
# pri_sch_price_category_x / _y and the lookup below would raise KeyError.
data = data.drop(columns='pri_sch_price_category', errors='ignore')

# Merge this DataFrame with the original DataFrame to create the new column
data = pd.merge(data, avg_resale_price[['pri_sch_name', 'pri_sch_price_category']], on='pri_sch_name', how='left')
print(data['pri_sch_price_category'].unique())

# NOTE(review): this feature is computed from resale_price over all rows, i.e. before any
# train/test split, so it carries target information into the features - confirm this is intended.


In [None]:
# Print each object-dtype column with its number of distinct values (NaN counts as a value)
object_column_names = data.select_dtypes(include='object').columns
for x in object_column_names:
    distinct_values = data[x].unique()
    print(x, len(distinct_values))

Columns to get_dummies:

In [None]:
# Categorical columns that will be one-hot encoded
col = [
    'flat_type',
    'flat_model',
    'market_hawker',
    'commercial',
    'precinct_pavilion',
    'planning_area',
    'mrt_name',
    'town',
]

In [None]:
# One-hot encode the chosen columns; drop_first omits the first level of each column
encoded_data = pd.get_dummies(
    data,
    columns=col,
    drop_first=True,
)

Remove the remaining text (object) columns, e.g. street_name, address, block and bus_stop_name (town was already one-hot encoded above).

In [None]:
# Columns still stored as text (object dtype) after encoding
object_cols = encoded_data.select_dtypes(include=['object']).columns

# Remove them so that only non-text columns remain
encoded_data = encoded_data.drop(columns=object_cols)

# One row per remaining column, with its number of distinct non-null values
columns_df = pd.DataFrame({'Column Name': encoded_data.columns})
columns_df['Unique Values'] = [
    encoded_data[column_name].nunique() for column_name in columns_df['Column Name']
]

# Show the summary table
print(columns_df)


In [None]:
# Column names of the encoded frame as a plain Python list
selected_col = list(encoded_data.columns)
print(selected_col)

In [None]:
encoded_data_new = encoded_data[selected_col]

# The feature matrix must not contain the target itself, and price_per_sqft is a per-area
# restatement of the sale price; leaving either in X lets the model read the answer and
# makes every score below meaningless. errors='ignore' keeps this safe if a column is absent.
leaky_cols = ['resale_price', 'price_per_sqft']
X = encoded_data_new.drop(columns=leaky_cols, errors='ignore')
y = data[['resale_price']]

# Fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)


In [None]:
# Replace missing feature values with 0 in both splits
X_train, X_test = (split.fillna(0) for split in (X_train, X_test))

In [None]:
# Fit an ordinary least-squares model on the training split and show its parameters
lr = LinearRegression()
lr.fit(X_train, y_train)
print(lr.intercept_)
print(lr.coef_)
print('Linear Regression score:', lr.score(X_train, y_train))

# Error metrics on the held-out split (the MSE is computed once and reused for the RMSE)
lrpred = lr.predict(X_test)
test_mse = mean_squared_error(y_test, lrpred)
print('Mean Squared Error: ', test_mse)
print('Mean Absolute Error: ', mean_absolute_error(y_test, lrpred))
print('Root Mean Squared Error: ', np.sqrt(test_mse))
print('Test Score: ', lr.score(X_test, y_test))

In [None]:
# R^2 on the test predictions, then the estimator's own train/test scores
test_r2 = r2_score(y_test, lrpred)
print("R2 Score:", test_r2)
train_score = lr.score(X_train, y_train)
print("Train Score:", train_score)
test_score = lr.score(X_test, y_test)
print("Test Score:", test_score)

# Mean score over 20 cross-validation folds of the training split
fold_scores = cross_val_score(lr, X_train, y_train, cv=20)
print("Cross Validation Score:", fold_scores.mean())

EDA for Nearest Sec School Columns

In [None]:
# Resale-price statistics (count, mean, median, std) per nearest secondary school,
# listed from the least to the most variable school
sec_sch = data.pivot_table(
    values='resale_price',
    index='sec_sch_name',
    aggfunc=(len, np.mean, np.median, np.std),
)
sec_sch.sort_values(by='std')

In [None]:
# One list of resale prices per nearest secondary school, in order of first appearance
school_names = data['sec_sch_name'].unique()
sec_sch_list = [
    data.loc[data['sec_sch_name'] == school, 'resale_price'].to_list()
    for school in school_names
]

# Horizontal box plot, one box per school; the figure is tall so every label stays readable
plt.figure(figsize=(20, 48))
plt.boxplot(sec_sch_list, vert=False)

# Replace the default numeric tick labels with the school names
tick_positions, _ = plt.yticks()
plt.yticks(tick_positions, school_names.tolist())
plt.xlabel('Price (SGD$ million)')
plt.ylabel('Sec Sch Name')
plt.title('Resale Price vs Nearest Sec Sch')

In [None]:
# Resale-price statistics (count, mean, median, std) per cutoff point
data.pivot_table(
    values='resale_price',
    index='cutoff_point',
    aggfunc=(len, np.mean, np.median, np.std),
)

In [None]:
# Mean resale price for each cutoff point (the pivot's index supplies the x values)
mean_price_by_cutoff = data.pivot_table(index='cutoff_point', values='resale_price', aggfunc=np.mean)
plt.scatter(x=mean_price_by_cutoff.index, y=mean_price_by_cutoff.iloc[:, 0])
plt.title('Mean Resale Price vs Cutoff Point of the Nearest Secondary School')
plt.xlabel('Cutoff Point')
plt.ylabel('Mean Resale Price (SGD$)')

# Dashed guide lines splitting the plot into regions
guide_style = dict(linestyles='dashed', colors='lightpink')
plt.hlines(510000, xmin=190, xmax=260, **guide_style)
plt.vlines(240, ymin=350000, ymax=650000, **guide_style)
plt.hlines(350000, xmin=190, xmax=260, **guide_style)

In [None]:
# Idea: separate the cutoff points into 5 groups based on the dashed guide lines above; this may improve model accuracy (untested).

In [None]:
# Median resale price for each cutoff point (the pivot's index supplies the x values)
median_price_by_cutoff = data.pivot_table(index='cutoff_point', values='resale_price', aggfunc=np.median)
plt.scatter(x=median_price_by_cutoff.index, y=median_price_by_cutoff.iloc[:, 0])
plt.title('Median Resale Price vs Cutoff Point of the Nearest Secondary School')
plt.xlabel('Cutoff Point')
plt.ylabel('Median Resale Price (SGD$)')

# Same dashed guide lines as on the mean plot
guide_style = dict(linestyles='dashed', colors='lightpink')
plt.hlines(510000, xmin=190, xmax=260, **guide_style)
plt.vlines(240, ymin=350000, ymax=650000, **guide_style)
plt.hlines(350000, xmin=190, xmax=260, **guide_style)

In [None]:
# 2x2 Pearson correlation matrix between cutoff point and resale price
np.corrcoef(
    data['cutoff_point'],
    data['resale_price'],
)

In [None]:
# Resale-price statistics (count, mean, median, std) per affiliation flag
data.pivot_table(
    values='resale_price',
    index='affiliation',
    aggfunc=(len, np.mean, np.median, np.std),
)

In [None]:
# Use the affiliation values in sorted order rather than first-appearance order, so each box
# is always paired with the right tick label. Missing values are left out because they
# match no rows and would only add an empty, unlabeled box.
affiliation_values = sorted(data['affiliation'].dropna().unique())
affiliation_list = [
    data.loc[data['affiliation'] == value, 'resale_price'].to_list()
    for value in affiliation_values
]

# NOTE(review): assumes affiliation is coded 0 = not affiliated, 1 = affiliated - confirm
# against the data dictionary. Any other value falls back to its own text.
affiliation_names = {0: 'No', 1: 'Yes'}

plt.boxplot(affiliation_list)
locs, labels = plt.xticks()
plt.xticks(locs, [affiliation_names.get(value, str(value)) for value in affiliation_values])
plt.ylabel('Price (SGD$ million)')
plt.xlabel('Nearest Sec School - Affiliated')
plt.title('Resale Price vs Sec Sch Affiliation')

The affiliation column can be dropped, since there is no significant difference in resale price between flats whose nearest secondary school is affiliated and those whose nearest school is not.

### kbest code

In [None]:
# Candidate predictors: every numeric/boolean column of the encoded frame except the target
# and price_per_sqft (a per-area restatement of the target). Missing values are zero-filled,
# matching the imputation used for the regression model, because f_regression rejects NaN.
X = (
    encoded_data
    .select_dtypes(include=['number', 'bool'])
    .drop(columns=['resale_price', 'price_per_sqft'], errors='ignore')
    .fillna(0)
)
y = data[['resale_price']]

# Initialize SelectKBest with the f_regression scoring function and keep the top 30 features
# (or all of them when fewer than 30 candidates exist, since k may not exceed the column count)
selector = SelectKBest(score_func=f_regression, k=min(30, X.shape[1]))

# Fit SelectKBest to the data; the target is flattened because the scorer expects a 1-D y
selector.fit(X, y.values.ravel())

# Retrieve selected features
X_selected = selector.transform(X)
selected_indices = selector.get_support(indices=True)
selected_columns = X.columns[selector.get_support()]

In [None]:
# Replace the transformed array with the names of the selected predictors and display them
X_selected = X.columns[selected_indices]
X_selected