In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder, RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV,RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error, mean_squared_error, make_scorer, mean_absolute_percentage_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn import set_config; set_config(display='diagram')
#import geopandas as gpd

%load_ext autoreload
%autoreload 2

In [41]:
from data_cleaning import clean_data

In [46]:
data = pd.read_csv('housing_data_full.csv')
data.shape

(26304, 42)

In [47]:
data = clean_data(data)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [5]:
data.columns

Index(['Listing_Id', 'Price', 'Size', 'N_Bedrooms', 'N_Bathrooms', 'Type',
       'Available From', 'Agency Commission', 'Floor', 'Furnished',
       'Main Window Facing', 'District', 'Area', 'Compound', 'Metro',
       'Latitude', 'Longtitude', 'Agent', 'Description', 'First_post',
       'Refresh', 'Balcony', 'Landlord lives in Shanghai', 'Oven',
       'Recently renovated', 'Air Filter', 'English Speaking Landlord',
       'Fitness Centers', 'Floor Heating', 'Garden', 'Historic Building',
       'Large Storage Room', 'Parking', 'Playground', 'Pool', 'Tennis Courts',
       'Wall heating', 'Water Filter', 'Pets_allowed'],
      dtype='object')

In [13]:
data['District'].value_counts()

Jing'an      7156
Xuhui        7006
Changning    5888
Huangpu      3415
Putuo        1241
Pudong        937
Minhang       165
Hongkou        86
Qingpu         64
-              38
Baoshan        35
Songjiang      18
Yangpu          6
Jiading         3
Chongming       1
Name: District, dtype: int64

In [12]:
data['Area'].value_counts()

-                           11838
Old Xuhui                    5293
Jing'an Temple               2571
Zhongshan Park               2294
Xujiahui                      915
Gubei                         913
Nanjing Xi Lu                 839
People's Square               601
Lujiazui                      224
Laoximen                      204
Hongqiao CBD                  121
Century Park                  106
West Bund                      37
Nanjing Dong Lu                28
Huaihai Lu                     23
Bund Area                      21
Jinqiao                        18
North Bund                     10
Hongkou Football Stadium        3
Name: Area, dtype: int64

#This is to draw the geometry map

downtown = gpd.read_file('sh-towns.geojson')

data = gpd.GeoDataFrame(data=data, geometry=gpd.points_from_xy(data.Longtitude,data.Latitude))

ax = downtown['geometry'].plot(figsize=(10,10), color='none', edgecolor='gainsboro', zorder=3);

data['geometry'].plot(markersize=1,ax=ax)

ax.set_xlim((121,121.8))

ax.set_ylim((31,31.5))

# Feature Engineering

In [64]:
y = data['Price']
X = data.drop(columns=['Price','Available From','First_post','Agency Commission'])

In [65]:
num_transformer = Pipeline([
    ('standard_scaler', StandardScaler())
])

oe_transformer = OrdinalEncoder(categories=[['Unfurnished','Furnished']])

ohe_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer,X.select_dtypes(include='number').columns),
    ('oe_transformer', oe_transformer, ['Furnished']),
    ('ohe_transformer', ohe_transformer,['Metro','District'])],
    remainder='passthrough'
)
preprocessor

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [67]:
X_train_new = preprocessor.fit_transform(X_train)

In [68]:
X_test_new = preprocessor.transform(X_test)

In [69]:
rmse = make_scorer(mean_squared_log_error,greater_is_better=True, squared = False)

# Base Models

In [70]:
base_model = DecisionTreeRegressor()
results = cross_validate(base_model, X_train_new, y_train, cv=5, scoring=rmse)
results['test_score'].mean()

0.17777759756209197

In [71]:
base_model.fit(X_train_new, y_train)

In [72]:
y_base = base_model.predict(X_test_new)
mean_absolute_error(y_test,y_base)

1530.077422066294

In [18]:
model_RFR = RandomForestRegressor()
results = cross_validate(model_RFR, X_train_new, y_train, cv=5, scoring=rmse)
results['test_score'].mean()

0.13964284410908512

In [19]:
model_RFR.fit(X_train_new, y_train)

In [20]:
y_RFR = model_RFR.predict(X_test_new)
mean_absolute_error(y_test,y_RFR)

1364.4092781205463

In [21]:
model_XGB = XGBRegressor(n_estimators=100,
                         max_depth=4,
                        learning_rate=0.1)
results = cross_validate(model_XGB, X_train_new, y_train, cv=5, scoring=rmse)
results['test_score'].mean()

0.17210739127288283

In [22]:
model_XGB.fit(X_train_new, y_train)
y_pred = model_XGB.predict(X_test_new)

In [23]:
mean_absolute_error(y_test,y_pred)

2086.1630322877695

In [77]:
model_tuning = RandomForestRegressor()

grid = {
    'n_estimators': [100,150,200], 
    'max_depth': [7, 10, 15],
    'learning_rate': [0.01, 0.02]
}

search = RandomizedSearchCV(
    model_XGB,
    grid, 
    scoring = rmse,
    cv = 5,
    n_jobs=-1
) 

search.fit(X_train_new, y_train)

In [84]:
search.best_params_

{'n_estimators': 150, 'max_depth': 15, 'learning_rate': 0.001}

In [78]:
search.best_score_

1.9695969556441715

In [79]:
model=search.best_estimator_

In [80]:
y_pred = model.predict(X_test_new)

In [81]:
mean_absolute_error(y_test,y_pred)

13670.808474704352

In [82]:
y_pred

array([2668.069 , 4792.544 , 4309.6104, ..., 3691.6658, 3447.169 ,
       1118.6322], dtype=float32)

In [83]:
y_test

15150    22800
2769     32000
5624     22000
20165     6300
20753    27000
         ...  
9966      8200
6168      5000
12439    20800
10972    24000
1312      7500
Name: Price, Length: 6871, dtype: int32

# Try only 10 columns 

In [None]:
oe_features = ['Furnished']
oe = OrdinalEncoder(categories=[['Unfurnished', 'Furnished']],handle_unknown='error')
data['Furnished'] = oe.fit_transform(data[oe_features])

In [None]:
ohe_feature = ['Metro','District']
ohe = OneHotEncoder(handle_unknown='error',sparse=False)
ohe.fit(data[ohe_feature])
ohe_new = pd.DataFrame(ohe.transform(data[ohe_feature]),
         columns=ohe.get_feature_names_out())
data.drop(columns=ohe_feature, inplace=True)
data = pd.concat([data,ohe_new],axis=1,join='inner')

In [47]:
data.corr()['Price'].sort_values(ascending=False)[1:31]

Size                         0.865318
N_Bedrooms                   0.777417
N_Bathrooms                  0.748154
Pool                         0.492255
Fitness Centers              0.487969
Playground                   0.481926
Floor Heating                0.461390
Tennis Courts                0.409063
Parking                      0.405901
Floor                        0.358734
Oven                         0.325565
Balcony                      0.318123
Air Filter                   0.271002
Recently renovated           0.263364
English Speaking Landlord    0.254386
Water Filter                 0.247372
Large Storage Room           0.228751
Pets_allowed                 0.210268
Garden                       0.180584
Longtitude                   0.169995
District_Pudong              0.151676
Metro_Shangcheng Rd          0.146712
Wall heating                 0.144796
Metro_Lujiazui               0.106178
Metro_Lantian Road           0.086597
Metro_Xintiandi              0.081667
Metro_Laoxim

In [10]:
data.columns

Index(['Price', 'Size', 'N_Bedrooms', 'N_Bathrooms', 'Available From', 'Floor',
       'Furnished', 'District', 'Metro', 'Latitude', 'Longtitude',
       'First_post', 'Balcony', 'Landlord lives in Shanghai', 'Oven',
       'Recently renovated', 'Air Filter', 'English Speaking Landlord',
       'Fitness Centers', 'Floor Heating', 'Garden', 'Historic Building',
       'Large Storage Room', 'Parking', 'Playground', 'Pool', 'Tennis Courts',
       'Wall heating', 'Water Filter', 'Pets_allowed'],
      dtype='object')

In [48]:
#Taking only the top 10 to train
top_10_features = data.corr()['Price'].sort_values(ascending=False)[1:31].index.tolist()

In [49]:
top_10_features

['Size',
 'N_Bedrooms',
 'N_Bathrooms',
 'Pool',
 'Fitness Centers',
 'Playground',
 'Floor Heating',
 'Tennis Courts',
 'Parking',
 'Floor',
 'Oven',
 'Balcony',
 'Air Filter',
 'Recently renovated',
 'English Speaking Landlord',
 'Water Filter',
 'Large Storage Room',
 'Pets_allowed',
 'Garden',
 'Longtitude',
 'District_Pudong',
 'Metro_Shangcheng Rd',
 'Wall heating',
 'Metro_Lujiazui',
 'Metro_Lantian Road',
 'Metro_Xintiandi',
 'Metro_Laoximen',
 'Metro_Panlong Rd',
 'Metro_Fangdian Rd',
 'Metro_West Nanjing Rd']

In [50]:
X = data[top_10_features]
y = data['Price']

In [51]:
X_train_10, X_test_10, y_train_10, y_test_10 = train_test_split(X, y, test_size=0.3)

In [52]:
scalar = MinMaxScaler()
X_train_10_new, X_test_10_new = scalar.fit_transform(X_train_10), scalar.transform(X_test_10)

In [53]:
model_DTR = DecisionTreeRegressor()
results = cross_validate(model_DTR, X_train_10_new, y_train_10, cv=5, scoring=rmse)
results['test_score'].mean()

0.19513057678874385



0.6459091966150202

In [57]:
model_DTR.fit(X_train_10_new, y_train_10)
y_DTR = model_DTR.predict(X_test_10_new)
mean_absolute_error(y_test_10,y_DTR)

1618.3758380913025