In [22]:
import os
import pandas as pd
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import numpy as np
from preprocessing.data_processing import combined_encoding, generate_features, clean_data, process_data
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
# Route DataFrame.plot calls through plotly instead of the default backend.
pd.set_option("plotting.backend", "plotly")

In [23]:
# Directory holding the prepared CSV splits, relative to the working directory.
DATA_DIR = './data'
train_final, test_final = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('train_final.csv', 'test_final.csv')
)

# Read each split into its own DataFrame.
train_df = pd.read_csv(train_final)
test_df = pd.read_csv(test_final)

In [24]:
# Numeric model inputs; the order here is the column order of the design matrix.
features = [
    'floor_area_sqm',
    'age',
    'town_psqm',
    'regional_psqm',
    'nearest_mrt_dist',
    'near_mrt_count_1.0',
    'near_mall_count_1.0',
    'near_school_count_1.0',
    'nearest_mall_dist',
    'nearest_school_dist',
    'date',
]

# Categorical columns handed to the encoding step.
cat_features = [
    'flat_type',
    'flat_model',
    'region',
    'subzone',
    'planning_area',
    'town',
]

# Column the regression is fitted against.
target = 'monthly_rent'

# Optional suffix appended to every numeric feature name; empty disables it.
feature_norm = ''

In [25]:
# Preprocess the raw frames, encode the categorical columns and fit a
# standardised linear regression on numeric + categorical features.
#
# NOTE: this cell rebinds `train_df`, `test_df` and `features`, so it is not
# idempotent; re-run the data-loading and feature-list cells before running it
# a second time.

# `features` is extended with one-hot column names further down, so keep
# handles on the original lists for the coefficient analysis in later cells.
num_features = features
ori_cat_features = cat_features

train_df_cleaned = clean_data(train_df)
test_df_cleaned = clean_data(test_df)
processed_train = process_data(train_df_cleaned)
processed_test = process_data(test_df_cleaned, mode='test')

# Scaling lives inside the pipeline, so the fitted coefficients refer to
# standardised inputs.
model = make_pipeline(StandardScaler(), LinearRegression())
print(model)

train_df, test_df = generate_features(processed_train, processed_test)
print(len(train_df), len(test_df))

if len(cat_features) != 0:
    joined_df = combined_encoding(train_df, test_df, cat_features=cat_features, num_features=num_features)
    # Split back into train/test only when encoding actually ran: `joined_df`
    # does not exist otherwise, and splitting outside this branch raised a
    # NameError for an empty `cat_features`.
    train_df = joined_df[joined_df['split'] == 'train'].reset_index(drop=True)
    test_df = joined_df[joined_df['split'] == 'test'].reset_index(drop=True)

# For every categorical feature, record the half-open (start, end) slice its
# one-hot columns occupy in the final `features` list, and collect the
# corresponding column names.
cat_interval_map = {}
cat_one_hot_cols = []
curr_start = len(features)  # categorical columns follow the numeric ones
for cat_feat in cat_features:
    cat_values = joined_df[cat_feat].unique()
    curr_end = curr_start + len(cat_values)
    cat_interval_map[cat_feat] = (curr_start, curr_end)
    curr_start = curr_end
    # Assumes combined_encoding names its one-hot columns '<feature>_<value>'
    # and that the category values are strings -- TODO confirm.
    cat_one_hot_cols.extend(cat_feat + '_' + value for value in cat_values)

# Optionally switch the numeric columns to their suffixed variants.
if len(feature_norm) != 0:
    features = [name + '_' + feature_norm for name in features]

features = features + cat_one_hot_cols

X = train_df[features].to_numpy()
y = train_df[target].to_numpy()
model.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
60000 30000


In [29]:
# Pull the fitted weights out of the regression step; they are expressed on the
# standardised feature scale because the regressor follows StandardScaler.
linear_step = model.named_steps['linearregression']
coefficients = linear_step.coef_


In [48]:
# Pair each numeric feature with its fitted coefficient.
# The design matrix lists the numeric columns first, so the leading
# len(num_features) entries of `coefficients` line up with `num_features`.
# Coefficients of the one-hot categorical columns (whose positions are recorded
# in `cat_interval_map`) are intentionally left out of this summary.
coef_dict = {
    "feat_name": list(num_features),
    "coef": list(coefficients[:len(num_features)]),
}

In [49]:
# Show the feature-name / coefficient mapping as the cell output.
coef_dict

{'feat_name': ['floor_area_sqm',
  'age',
  'town_psqm',
  'regional_psqm',
  'nearest_mrt_dist',
  'near_mrt_count_1.0',
  'near_mall_count_1.0',
  'near_school_count_1.0',
  'nearest_mall_dist',
  'nearest_school_dist',
  'date'],
 'coef': [76.31924614124934,
  -139.67780795069825,
  318.1615603083678,
  138.16985770660867,
  -54.22574068794301,
  7.8045378633448195,
  -2.5534120388359085,
  -11.866072102426891,
  -36.235445362679485,
  -1.0318740912845357,
  42.238063840042855]}

In [52]:
# Horizontal bar chart of the numeric-feature coefficients: one bar per
# feature name, bar length given by the coefficient value.
coef_df = pd.DataFrame(coef_dict)
coef_df.plot.bar(
    x='coef',
    y='feat_name',
    orientation='h',
    height=400,
    width=800,
)