In [52]:
import pandas as pd
#Dataimport
data_link="https://raw.githubusercontent.com/Nicki-Bladal/Estimating_Japan_Real_Estate_Pricing_Machine_Learning/master/01.csv"
data_df = pd.read_csv(data_link,parse_dates=True,index_col="No")

Removing obvious problematic features
-

In [53]:
#TimeToNearestStation and MaxTimeToNearestStation are removed not to have 3 time factors 
#Municipality is dropped as Municipality code will suffice
#Period is removed as Year + Quarter contain same info
# TotalFloorAreaIsGreaterFlag and FrontageIsGreaterFlag are always True and is therefore removed
#PricePerTsubo is directly related to Price and is therefore a DataLeakage source which must be removed

X=data_df.drop(['MaxTimeToNearestStation','TimeToNearestStation','Municipality','Period','TotalFloorAreaIsGreaterFlag','FrontageIsGreaterFlag','PricePerTsubo','TradePrice'], axis=1)

y=data_df.TradePrice

In [54]:
#Checking datatypes and non-null counts
print(X.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186238 entries, 1 to 186238
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Type                     186238 non-null  object 
 1   Region                   133069 non-null  object 
 2   MunicipalityCode         186238 non-null  int64  
 3   Prefecture               186238 non-null  object 
 4   DistrictName             183567 non-null  object 
 5   NearestStation           145929 non-null  object 
 6   MinTimeToNearestStation  144490 non-null  float64
 7   FloorPlan                20486 non-null   object 
 8   Area                     186238 non-null  int64  
 9   AreaIsGreaterFlag        186238 non-null  int64  
 10  UnitPrice                66260 non-null   float64
 11  LandShape                132674 non-null  object 
 12  Frontage                 125257 non-null  float64
 13  TotalFloorArea           62490 non-null   float64
 14  Buil

Sorting strings and numbers
-

In [55]:
# Investigating "Cardinality" 
categorical_features = [cname for cname in X.columns if X[cname].dtype == "object"]

# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X[col].nunique(), categorical_features))
d = dict(zip(categorical_features, object_nunique))

# Print number of unique entries by column, in ascending order
print("Listed Categorical variables and Cardinity")
sorted(d.items(), key=lambda x: x[1])

Listed Categorical variables and Cardinity


[('Prefecture', 1),
 ('Renovation', 2),
 ('Region', 4),
 ('Type', 5),
 ('Purpose', 6),
 ('LandShape', 9),
 ('Direction', 9),
 ('Classification', 14),
 ('CityPlanning', 16),
 ('Remarks', 17),
 ('Structure', 22),
 ('FloorPlan', 26),
 ('Use', 115),
 ('NearestStation', 461),
 ('DistrictName', 4641)]

In [56]:
# As the cardinality spans from 2 to more than 4500 we cannot use HotPot imputation for all strings - therefor we stick to object cols

# Numericalcolumns
numeric_features = [cname for cname in X.columns if data_df[cname].dtype in ['int64', 'float64']]

# Storing the columnnames
my_cols = categorical_features + numeric_features

Setting up trainingdata (for internal test)
-

In [57]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Adressing missing values
-

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('label', LabelEncoder())])

In [59]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

Fitting the classifiers
-

In [60]:
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

In [61]:
rf.fit(X_train, y_train)

TypeError: fit_transform() takes 2 positional arguments but 3 were given