In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import pandas_profiling
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
CATEGORICAL_COLUMNS = ["bldg_ctgy", "bldg_cls_s", "bldg_cls_p", "tax_cls_p", "tax_cls_s"]
LABEL_COLUMN = "price"
SECOND_TEST_COLUMNS = ["price", "bldg_cls_s", "borough", "lot", "tot_sqft"]
FIRST_TEST_COLUMNS = [*SECOND_TEST_COLUMNS, "long", "lat", "block", "lot", "yr_built", "tot_unit", "bldg_ctgy"]

In [3]:
def encode_column(data, column):
    encoded_columns = pd.get_dummies(data[column], prefix=column)
    return pd.concat([data, encoded_columns], axis=1).drop(column, axis=1)

In [4]:
# raw_data = pd.read_csv("sales_data_2015.csv")
# pd.set_option("display.max_columns", None)

# # Some pre-processing
# data = raw_data[(raw_data.usable == 'Y') & 
#                 (raw_data.price > 1000) &
#                 (raw_data.tot_sqft > 0) &
#                 (raw_data.yr_built > 0) &
#                 (raw_data.tot_unit > 0)]
# data = data.drop(columns=["Unnamed: 0", "year", "apt", "easmnt", "usable", "address", "bbl_id", "Sale_id", "sale_date"])
# data.isna().sum().sort_values(ascending=False)
data = pd.read_csv("ny_housing.csv")

features = {}
features[2] = list(data.drop(LABEL_COLUMN, axis=1).columns)
features[0] = [f for f in features[2] if f not in FIRST_TEST_COLUMNS]
features[1] = [f for f in features[2] if f not in SECOND_TEST_COLUMNS]

# for category in CATEGORICAL_COLUMNS:
#     data = encode_column(data, category)

# display(data)
# data.to_csv("ny_housing.csv")

# data = pd.read_csv("ny_housing.csv")

In [5]:
data["price_class"] = np.where(data["price"] > 500000, 1, 0)


In [6]:
records = []

for train_size in [0.2, 0.5, 0.8]:

    train, test = train_test_split(data, test_size=1 - train_size)
    print(len(train), 'train examples')
    print(len(test), 'test examples')
    
    for features_version in [0, 1, 2]:
        
        print("Train size: ", train_size, "Features version: ", features_version)
        
        columns_X = [col for col in data if any([col.startswith(feature) for feature in features[features_version]])]
        columns_Y = [LABEL_COLUMN]
         
        train_ds = train[[*columns_X, *columns_Y, "price_class"]]
        test_ds = test[[*columns_X, *columns_Y, "price_class"]]
        
        train_X = train_ds[columns_X]
        train_Y = train_ds[columns_Y]
        test_X = test_ds[columns_X]
        test_Y = test_ds[columns_Y]
        
        time_before = time.perf_counter()
#         model = LinearRegression().fit(train_X, train_Y)
        model = RandomForestClassifier(n_estimators=10,max_depth=16).fit(train_X, train_Y)

        time_taken = time.perf_counter() - time_before
        
        train_score = accuracy_score(train_ds["price_class"], np.where(model.predict(train_X) > 500000, 1, 0))
        test_score = accuracy_score(test_ds["price_class"], np.where(model.predict(test_X) > 500000, 1, 0))
                                    
        records.append([time_taken, train_score, test_score])

5577 train examples
22309 test examples
Train size:  0.2 Features version:  0




Train size:  0.2 Features version:  1




Train size:  0.2 Features version:  2




13943 train examples
13943 test examples
Train size:  0.5 Features version:  0




Train size:  0.5 Features version:  1




Train size:  0.5 Features version:  2




22308 train examples
5578 test examples
Train size:  0.8 Features version:  0




Train size:  0.8 Features version:  1




Train size:  0.8 Features version:  2




In [13]:
results = []
i = 0
for train_size in [0.2, 0.5, 0.8]:
    for features_version in [0, 1, 2]:
        results.append({
            'split': train_size, 
            'features': features_version,
            'time': round(records[i][0], 3) ,
            'train_acc': round(records[i][1], 3),
            'test_acc': round(records[i][2], 3)
        })
        
        i = i+1

In [14]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,split,features,time,train_acc,test_acc
0,0.2,0,0.478,0.828,0.723
1,0.2,1,0.726,0.857,0.741
2,0.2,2,0.817,0.884,0.742
3,0.5,0,2.183,0.846,0.763
4,0.5,1,2.75,0.849,0.779
5,0.5,2,2.914,0.864,0.775
6,0.8,0,3.967,0.829,0.77
7,0.8,1,5.708,0.852,0.79
8,0.8,2,5.675,0.864,0.774


In [15]:
results_df.to_csv('RF_ny_housing_result.csv')

In [16]:
print(results_df.to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &  split &  features &   time &  train\_acc &  test\_acc \\
\midrule
0 &    0.2 &         0 &  0.478 &      0.828 &     0.723 \\
1 &    0.2 &         1 &  0.726 &      0.857 &     0.741 \\
2 &    0.2 &         2 &  0.817 &      0.884 &     0.742 \\
3 &    0.5 &         0 &  2.183 &      0.846 &     0.763 \\
4 &    0.5 &         1 &  2.750 &      0.849 &     0.779 \\
5 &    0.5 &         2 &  2.914 &      0.864 &     0.775 \\
6 &    0.8 &         0 &  3.967 &      0.829 &     0.770 \\
7 &    0.8 &         1 &  5.708 &      0.852 &     0.790 \\
8 &    0.8 &         2 &  5.675 &      0.864 &     0.774 \\
\bottomrule
\end{tabular}

