In [45]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import missingno as msno
import plotly.express as px # plotting geo data
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import cross_val_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from Own_imputers import PriorityGroupImputer

# Train

## Data

In [46]:
# Load the raw listings and discard the columns that are not used downstream.
data = pd.read_csv("train.csv", header=0).drop(
    columns=['is_promoted', 'sticker', 'price_drop_date']
)

# Chronological hold-out: rows are ordered by `added_time` and, because
# shuffle=False, the final 20% of that ordering becomes `test`.
train, test = train_test_split(
    data.sort_values('added_time'),
    test_size=0.2,
    shuffle=False,
)

# reset_index() keeps the previous row labels as an `index` column.
train = train.reset_index().astype(
    {
        'energy_label': 'category',
        'new_building': 'bool',
        'postcode': 'category',
    }
)
# train.dtypes

In [47]:
# Share of missing values per column, in percent, rounded to two decimals.
missing_values = (train.isna().sum() / len(train) * 100).round(2)

# 0/1 missingness indicators, keeping only the columns whose rounded share of
# missing values is non-zero, on a fresh positional index.
missing_checker = (
    (train.isna() * 1)
    .loc[:, (missing_values != 0).to_numpy()]
    .reset_index(drop=True)
)

In [48]:
# Bucket the bedroom count into string labels; intervals are right-closed, so
# (-1, 0] -> '0', (0, 1] -> '1', ... and everything above 6 -> '7+'.
train['bedrooms_cat'] = pd.cut(
    train['bedrooms'],
    bins=[-1, 0, 1, 2, 3, 4, 5, 6, float('inf')],
    labels=['0', '1', '2', '3', '4', '5', '6', '7+'],
)

In [49]:
# Coarse property families. Keys are the family names written to `house_type`;
# values list the raw `subtype` labels that belong to each family.
house_map = {
    'Apartments & Flats': [
        'Appartement',
        'Gelijkvloers app.',
        'Duplex',
        'Triplex',
        'Dakappartement',
        'Penthouse',
        'Serviceflat',
        'Assistentie-appartement',
        'Studio',
        'Studio met slaaphoek',
        'App. vrij beroep',
        'Appartementsgebouw',
    ],
    'Single-Family Houses': [
        'Eengezinswoning',
        'Woning',
        'Villa',
        'Villa-landhuis',
        'Moderne villa',
        'Cottage',
        'Bungalow',
        'Koppelwoning',
        'Koppelvilla',
        'Hoekwoning',
        'Rijwoning',
        'Bel-étage',
        'Burgerswoning',
    ],
    'Historical & Luxurious Homes': [
        'Herenhuis',
        'Herenwoning',
        'Uitzonderlijke woning',
        'Kasteel',
        'Pastorijwoning',
    ],
    'Farm & Rural Houses': [
        'Hoeve',
        'Boerderij',
        'Fermette',
        'Chalet',
    ],
    'Mixed-Use & Unique Properties': [
        'Gemengd gebruik',
        'Arbeiderswoning',
        'Kangoeroewoning',
        'Woonboot',
        'Loft',
        'Split-level',
        'Patio woning',
        'Buitenverblijf',
        'Vakantiewoning',
    ],
}


def categorize_house(house_type):
    """Return the family in `house_map` that lists `house_type`.

    Families are checked in dictionary order and the first match wins. Any
    value that is not listed in any family (including None/NaN) yields 'Other'.
    """
    matching_families = (
        family for family, members in house_map.items() if house_type in members
    )
    return next(matching_families, 'Other')


In [50]:
# Derive the coarse `house_type` family from the raw `subtype`. This runs before
# `subtype` is imputed, so rows whose subtype is still missing end up as 'Other'.
train['house_type'] = train['subtype'].apply(categorize_house)

In [51]:
# Fill missing `subtype` values. Presumably this takes the most frequent subtype
# within each `bedrooms_cat` group; the exact semantics live in Own_imputers.
subtype_imputer = PriorityGroupImputer(
    target_cols=['subtype'],
    priority_groups={1: ['bedrooms_cat']},
    strategy='mode',
)
train = subtype_imputer.fit_transform(train)

In [52]:
# Postcode -> coordinate lookup built from the tab-separated BE.txt file:
# columns 1, 9 and 10 are taken as postcode, latitude and longitude, and the
# coordinates are averaged per postcode.
postcode_list = (
    pd.read_csv("BE.txt", sep="\t", header=None)[[1, 9, 10]]
    .rename(columns={1: 'postcode', 9: 'lat', 10: 'lon'})
    .astype({'postcode': 'category'})
    .groupby('postcode', observed=False)[['lat', 'lon']]
    .mean()
)

In [53]:
# Fill missing coordinates from the postcode lookup table. Presumably the
# 'external' strategy reads lat/lon for each row's postcode from `data_source`;
# confirm against Own_imputers.
geo_imputer = PriorityGroupImputer(
    target_cols=['lat', 'lon'],
    priority_groups={1: ['postcode']},
    strategy='external',
    data_source=postcode_list,
)
train = geo_imputer.fit_transform(train)

In [54]:
# Flag implausible coordinates with a robust Gaussian envelope fitted on
# (lat, lon). The robust covariance estimate starts from random subsets, so the
# seed is pinned to make the set of flagged rows reproducible across runs.
clf_geo = EllipticEnvelope(contamination=0.0002, support_fraction=0.9, random_state=0)
clf_geo.fit(train[['lat', 'lon']])
outliers_geo = clf_geo.predict(train[['lat', 'lon']])  # +1 inlier, -1 outlier

# Blank out the rejected coordinates (where() fills with NaN by default) ...
train['lat'] = train['lat'].where(outliers_geo == 1)
train['lon'] = train['lon'].where(outliers_geo == 1)

# ... and let the already-fitted geo imputer fill them again.
train = geo_imputer.transform(train)


In [55]:
# fig = px.scatter_geo(train, lat='lat', lon='lon', scope='europe')
# fig.update_geos(showcountries=True, showcoastlines=True)
# fig.show()

In [56]:
# Flag extreme `area` values with a robust envelope fitted on the rows whose
# area is known. The seed is pinned so the flagged rows are reproducible.
clf_area = EllipticEnvelope(contamination=0.0002, support_fraction=0.9, random_state=0)

area_known = train['area'].notna().to_numpy()
outliers_area = clf_area.fit_predict(train.loc[area_known, ['area']])  # +1 inlier, -1 outlier

# Write the verdicts back through a positional row mask. This does not rely on
# `id` being unique or non-missing, unlike a lookup by id.
area_is_outlier = np.zeros(len(train), dtype=bool)
area_is_outlier[area_known] = outliers_area == -1
train['area'] = train['area'].mask(area_is_outlier)
print(train['area'].max())

1350.0


In [57]:
# Fill missing `area`. Presumably the group median is taken over
# (bedrooms_cat, house_type) first and over bedrooms_cat alone as a fallback;
# the priority semantics live in Own_imputers.
area_imputer = PriorityGroupImputer(
    target_cols=['area'],
    priority_groups={
        1: ['bedrooms_cat', 'house_type'],
        2: ['bedrooms_cat'],
    },
    strategy='median',
)
train = area_imputer.fit_transform(train)

In [58]:
# First pass for `energy_value`: presumably the median within each
# (energy_label, province) group; confirm against Own_imputers.
ev_imputer_median = PriorityGroupImputer(
    target_cols=['energy_value'],
    priority_groups={1: ['energy_label', 'province']},
    strategy='median',
)
train = ev_imputer_median.fit_transform(train)

In [59]:
# Second pass for `energy_value`: whatever is still missing after the grouped
# median is filled from the 10 nearest neighbours (distance-weighted).
ev_imputer = KNNImputer(n_neighbors=10, weights='distance')
imputed_data = ev_imputer.fit_transform(
    train[['energy_value', 'area', 'is_appartment', 'new_building', 'lat', 'lon']]
)

# Column 0 of the imputer output is `energy_value`. Assign it positionally:
# wrapping the array in a new DataFrame would align on a fresh 0..n-1 index and
# silently produce NaN for any row of `train` whose label is not in that range.
train['energy_value'] = np.asarray(imputed_data)[:, 0]

In [60]:
# plt.scatter(x=train['energy_value'],y=train['area'],c=missing_checker['energy_value'])
# plt.xscale('log')
# plt.yscale('log')
# plt.xlim()

In [61]:
# Missing advertisers get their own 'Other' level before the column is made categorical.
train['advertiser'] = train['advertiser'].fillna('Other').astype('category')

In [62]:
# Turn the 0/1 indicators recorded before imputation into boolean
# `<column>_miss` flags. The names are derived from the indicator columns
# themselves, so a flag can never be attached to the wrong source column if the
# set of columns with missing values changes.
missing_checker = missing_checker.add_suffix('_miss') == 1
train = pd.concat([train, missing_checker], axis=1)

# Categorical dtype is what lets the XGBoost DMatrix below treat these as categories.
train['house_type'] = train['house_type'].astype('category')
train['province'] = train['province'].astype('category')

In [63]:
# Inspect the column dtypes after preprocessing.
train.dtypes

index                   int64
id                     object
is_appartment            bool
area                  float64
added_time              int64
bedrooms              float64
new_building             bool
postcode             category
lat                   float64
lon                   float64
advertiser           category
foto_amount           float64
subtype                object
energy_value          float64
energy_label         category
province             category
price                 float64
bedrooms_cat         category
house_type           category
area_miss                bool
lat_miss                 bool
lon_miss                 bool
advertiser_miss          bool
subtype_miss             bool
energy_value_miss        bool
energy_label_miss        bool
dtype: object

In [67]:
# Model inputs, in the column order handed to XGBoost.
features = [
    # property description
    'is_appartment',
    'house_type',
    'area',
    'bedrooms',
    'new_building',
    # listing and location
    'advertiser',
    'lat',
    'lon',
    'foto_amount',
    'energy_value',
    'province',
    # "was missing before imputation" flags
    'area_miss',
    'lat_miss',
    'advertiser_miss',
    'subtype_miss',
    'energy_value_miss',
]

In [68]:
# Show the final feature list.
features

['is_appartment',
 'house_type',
 'area',
 'bedrooms',
 'new_building',
 'advertiser',
 'lat',
 'lon',
 'foto_amount',
 'energy_value',
 'province',
 'area_miss',
 'lat_miss',
 'advertiser_miss',
 'subtype_miss',
 'energy_value_miss']

## Model

### KNN

In [640]:
# X_train, y_train = train[['is_appartment','area','lat','lon','foto_amount','energy_value','house_type']].copy(),train['price'].copy()

# X_train_encoded = pd.get_dummies(X_train, columns=['house_type'])

# scaler = StandardScaler()
# X_train_encoded[['area','lat','lon','energy_value','foto_amount']] = scaler.fit_transform(X_train_encoded[['area','lat','lon','energy_value','foto_amount']])

In [641]:
# scores_KNN = []
# for i in range(2,60,2):
#     neigh_model = KNeighborsRegressor(n_neighbors=i, weights='distance')
#     scores_KNN.append(cross_val_score(neigh_model, X_train_encoded, y_train, cv=5, scoring='neg_mean_absolute_percentage_error')*-1)

# scores_KNN = pd.DataFrame(scores_KNN)
# plt.scatter((scores_KNN.index+1)*2, scores_KNN.apply(np.mean, axis=1))

In [642]:
# neigh_model_final = KNeighborsRegressor(n_neighbors=14, weights='distance')
# neigh_model_final.fit(X_train_encoded,y_train)

### XGBoost

In [77]:
# Validation split without shuffling: the last 20% of the rows of `train`
# (which were sorted by `added_time` before the first split) form the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    train[features],
    train['price'],
    test_size=0.2,
    shuffle=False,
)

In [71]:
# Wrap both splits for the native XGBoost API; enable_categorical=True lets the
# pandas `category` columns be passed without manual encoding.
dtrain = xgb.DMatrix(
    X_train,
    label=y_train,
    enable_categorical=True,
)
dval = xgb.DMatrix(
    X_val,
    label=y_val,
    enable_categorical=True,
)

In [72]:
# Evaluation set as a list of (DMatrix, name) pairs. Note: the training call in
# the next cell does not pass this as `evals`; it builds its own list for eval_set().
evallist = [(dval, 'val')]

In [73]:
# Booster settings for the validation run: squared-error regression with
# L1 and L2 regularisation.
param_xgb = dict(
    max_depth=6,
    eta=0.1,
    objective='reg:squarederror',
    reg_lambda=10,
    reg_alpha=10,
)

# Fit a fixed 500 boosting rounds on the training split ...
xgb_regressor = xgb.train(param_xgb, dtrain, num_boost_round=500)

# ... and report the metric on the validation split (last expression = cell output).
xgb_regressor.eval_set([(dval, 'val')])

'[0]\tval-rmse:114141.57217356117325835'

In [74]:
# Refit on the complete `train` frame (training + validation rows) with the
# same settings. This overwrites X_train, y_train and dtrain from the validation run.
X_train = train[features].copy()
y_train = train['price'].copy()
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

param_xgb = dict(
    max_depth=6,
    eta=0.1,
    objective='reg:squarederror',
    reg_lambda=10,
    reg_alpha=10,
)

xgb_regressor_final = xgb.train(param_xgb, dtrain, num_boost_round=500)

# Test

## Data

In [25]:
# test = pd.read_csv("test.csv", header=0)
# `test` is the chronological hold-out produced by the first train_test_split;
# the unwanted columns were already dropped from `data` before that split.
# test = test.drop(columns=['is_promoted','sticker','price_drop_date'])

# reset_index() keeps the previous row labels as an `index` column.
test = test.reset_index().astype(
    {
        'energy_label': 'category',
        'new_building': 'bool',
        'postcode': 'category',
    }
)

# Same derived columns as for `train`.
test['bedrooms_cat'] = pd.cut(
    test['bedrooms'],
    bins=[-1, 0, 1, 2, 3, 4, 5, 6, float('inf')],
    labels=['0', '1', '2', '3', '4', '5', '6', '7+'],
)
test['house_type'] = test['subtype'].apply(categorize_house)

In [26]:
# Share of missing values per column, in percent (kept for inspection).
missing_values = round(test.isna().sum()/len(test)*100,2)

# Build the `<column>_miss` flags from an explicit list of source columns rather
# than from "whichever columns happen to contain NaN in this frame". The
# hold-out can have a different set of incomplete columns than `train`; with
# positional renaming that either raises a length mismatch or attaches a flag
# to the wrong column, and a flag the model expects may not exist at all.
missing_checker = (
    test[['area', 'lat', 'lon', 'advertiser', 'subtype', 'energy_value', 'energy_label']]
    .isna()
    .add_suffix('_miss')
    .reset_index(drop=True)
)
test = pd.concat([test, missing_checker], axis=1)

In [27]:
# Reuse the categorical dtype learned on `train` instead of inferring a new one
# from the hold-out. A freshly inferred dtype numbers its categories
# independently, and a DMatrix built with enable_categorical=True consumes
# those integer codes (XGBoost versions without automatic re-coding), so the
# same advertiser could be fed to the model under a different code than it was
# trained with. Advertisers unseen in `train` become NaN (missing).
test['advertiser'] = test['advertiser'].fillna('Other').astype(train['advertiser'].dtype)

In [None]:
# Apply the imputers and outlier detectors that were fitted on `train`;
# nothing is re-fitted in this cell.
test = subtype_imputer.transform(test)

test = geo_imputer.transform(test)

# Blank out coordinates rejected by the fitted envelope (-1), then impute again.
outliers_geo_test = clf_geo.predict(test[['lat','lon']])
test['lat'] = test['lat'].where(outliers_geo_test==1, np.nan)
test['lon'] = test['lon'].where(outliers_geo_test==1, np.nan)

test = geo_imputer.transform(test)

# Same for `area`: score only the rows with a known area and write the verdicts
# back through a positional row mask, which does not depend on `id` being
# unique or non-missing.
area_known = test['area'].notna().to_numpy()
outliers_area = clf_area.predict(test.loc[area_known, ['area']])
area_is_outlier = np.zeros(len(test), dtype=bool)
area_is_outlier[area_known] = outliers_area == -1
test['area'] = test['area'].mask(area_is_outlier)

test = area_imputer.transform(test)

test = ev_imputer_median.transform(test)
imputed_data = ev_imputer.transform(test[['energy_value','area','is_appartment','new_building', 'lat','lon']])
# Column 0 is `energy_value`; assign positionally so the result cannot be
# misaligned by the row labels of `test`.
test['energy_value'] = np.asarray(imputed_data)[:, 0]

# Reuse the categorical dtypes learned on `train`. Categories inferred from the
# hold-out alone can be numbered differently, and the DMatrix consumes those
# integer codes (XGBoost versions without automatic re-coding). Levels unseen
# in `train` become NaN (missing).
test['house_type'] = test['house_type'].astype(train['house_type'].dtype)
test['province'] = test['province'].astype(train['province'].dtype)

## KNN

In [None]:
# X_test = test[['is_appartment','area','lat','lon','foto_amount','energy_value','house_type']].copy()

# X_test_encoded = pd.get_dummies(X_test, columns=['house_type'])

# scaler = StandardScaler()
# X_test_encoded[['area','lat','lon','energy_value','foto_amount']] = scaler.fit_transform(X_test_encoded[['area','lat','lon','energy_value','foto_amount']])
# X_test_encoded.isna().apply(sum)
# y_test = test['price'].copy()
# y_pred_test = neigh_model_final.predict(X_test_encoded)

## XGBoost

In [None]:
# Score the hold-out with the model refitted on all of `train`.
X_test = test[features].copy()
y_test = test['price'].copy()
dtest = xgb.DMatrix(X_test, enable_categorical=True)

y_pred_test = xgb_regressor_final.predict(dtest)

## Error bars

In [615]:
# Shared pieces: relative error per listing, absolute error in price units, and sqrt(n).
rel_err = y_pred_test / y_test - 1
abs_err = abs(y_pred_test - y_test)
root_n = len(y_pred_test) ** (1 / 2)

# Each bound is 1.96 x (sample standard deviation / sqrt(n)).

# 1) signed relative error (bias)
mean_error_1 = rel_err.mean()
error_bound_1 = rel_err.std() / root_n * 1.96

# 2) absolute relative error
mean_error_2 = abs(rel_err).mean()
error_bound_2 = abs(rel_err).std() / root_n * 1.96

# 3) root mean squared error, with the bound taken from the absolute errors
mean_error_3 = ((y_pred_test - y_test) ** 2).mean() ** (1 / 2)
error_bound_3 = abs_err.std() / root_n * 1.96

# Submission metrics

In [510]:
# Variant 1: shift the prediction by the mean signed relative error and put a
# band of +/- error_bound_1 around the shifted value.
y_pred_final = pd.DataFrame(
    {
        'id': test['id'],
        'lower': y_pred_test * (1 - mean_error_1 - error_bound_1),
        'upper': y_pred_test * (1 - mean_error_1 + error_bound_1),
        'pred': y_pred_test * (1 - mean_error_1),
    }
)

y_pred_final.to_csv("xgb_simple_1.csv", index=False)

In [513]:
# Variant 2: keep the raw prediction and widen it symmetrically by the mean
# absolute relative error plus its bound.
y_pred_final = pd.DataFrame(
    {
        'id': test['id'],
        'lower': y_pred_test * (1 - mean_error_2 - error_bound_2),
        'upper': y_pred_test * (1 + mean_error_2 + error_bound_2),
        'pred': y_pred_test,
    }
)

y_pred_final.to_csv("xgb_simple_2.csv", index=False)

In [512]:
# Variant 3: keep the raw prediction and add an absolute band of RMSE plus its
# bound on either side; the lower edge is clipped at 0.
y_pred_final = pd.DataFrame(
    {
        'id': test['id'],
        'lower': np.maximum(y_pred_test - mean_error_3 - error_bound_3, 0),
        'upper': y_pred_test + mean_error_3 + error_bound_3,
        'pred': y_pred_test,
    }
)

y_pred_final.to_csv("xgb_simple_3.csv", index=False)