In [2]:
import pandas as pd
#Dataimport
data_link="https://raw.githubusercontent.com/Nicki-Bladal/Estimating_Japan_Real_Estate_Pricing_Machine_Learning/master/01.csv"
data_df = pd.read_csv(data_link,parse_dates=True,index_col="No")

In [34]:
# Investigating "Cardinality" 
categorical_features = [cname for cname in data_df.columns if data_df[cname].dtype == "object"]

# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: data_df[col].nunique(), categorical_features))
d = dict(zip(categorical_features, object_nunique))

# Print number of unique entries by column, in ascending order
print("Listed Categorical variables and Cardinity")
sorted(d.items(), key=lambda x: x[1])


Listed Categorical variables and Cardinity


[('Prefecture', 1),
 ('Renovation', 2),
 ('Region', 4),
 ('Type', 5),
 ('Purpose', 6),
 ('LandShape', 9),
 ('Direction', 9),
 ('Classification', 14),
 ('CityPlanning', 16),
 ('Remarks', 17),
 ('Structure', 22),
 ('FloorPlan', 26),
 ('TimeToNearestStation', 34),
 ('Period', 57),
 ('Use', 115),
 ('Municipality', 188),
 ('NearestStation', 461),
 ('DistrictName', 4641)]

Choosing Features for model
-

In [50]:
#TimeToNearestStation and MaxTimeToNearestStation are removed not to have 3 time factors 
#Municipality is dropped as Municipality code will suffice
#Period is removed as Year + Quarter contain same info
# TotalFloorAreaIsGreaterFlag and FrontageIsGreaterFlag are always True and is therefore removed
#PricePerTsubo is directly related to Price and is therefore a DataLeakage source which must be removed

X=data_df.drop(['Region','MaxTimeToNearestStation','TimeToNearestStation','Municipality','Period', 'BuildingYear','TotalFloorAreaIsGreaterFlag','FrontageIsGreaterFlag','PricePerTsubo', 'Prefecture', 'Classification', 'TotalFloorArea', 'Quarter', 'Breadth', 'FloorPlan', 'AreaIsGreaterFlag', 'NearestStation', 'LandShape', 'Purpose', 'Frontage', 'UnitPrice', 'CoverageRatio', 'PrewarBuilding', 'CityPlanning', 'Structure', 'FloorAreaRatio', 'Renovation', 'TradePrice', 'Remarks', 'Use', 'DistrictName', 'Direction'], axis=1)

y=data_df.TradePrice

In [51]:
categorical_features = [cname for cname in X.columns if X[cname].dtype == "object"]
# As the cardinality spans from 2 to more than 4500 we cannot use HotPot imputation for all strings - therefor we stick to object cols

# Numericalcolumns
numeric_features = [cname for cname in X.columns if data_df[cname].dtype in ['int64', 'float64']]

X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186238 entries, 1 to 186238
Data columns (total 5 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Type                     186238 non-null  object 
 1   MunicipalityCode         186238 non-null  int64  
 2   MinTimeToNearestStation  144490 non-null  float64
 3   Area                     186238 non-null  int64  
 4   Year                     186238 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 8.5+ MB


Setting up trainingdata (for internal test)
-

In [57]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Adressing missing values
-

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('label', OneHotEncoder())])

In [59]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

Fitting the classifiers
-

In [60]:
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(n_estimators=10))])


In [61]:
rf.fit(X_train, y_train.values)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

Testing the accuracy
-

In [62]:
from sklearn.metrics import mean_absolute_error

# Preprocessing of validation data, get predictions
preds = rf.predict(X_test)

# Evaluate the model
score = mean_absolute_error(y_test, preds)

print('Mean TradePrice', y.mean())
print('MAE:', score)
print('Relative Error', (y.mean()-score)/y.mean())

Mean TradePrice 16827899.950869318
MAE: 11502729.9154317
Relative Error 0.31644887662661214
