In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as LR, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures as PF, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

import pickle

### To test this solution, set the file names for X_test and Y_test in the test-loading cell near the end, then run the whole notebook
# 
# Let's load the pipeline and the models
# 
### **Warning:**  *the Encoders and Scalers should be fitted only on training data*

In [2]:
# Numeric scores substituted for the raw category labels inside housing_pipeline.
building_type_dict = {'monolit': 2, 'panel': 1, 'stone': 1.44, 'other': 1.68}
condition_dict = {'good': 1, 'newly repaired': 1.4, 'zero condition': 1.3}

# Shared transformers: both emit pandas DataFrames so column names survive the
# transform step. They are fitted inside housing_pipeline when fit_enc=True.
poly = PF(degree=5, include_bias=False).set_output(transform='pandas')
ohe = OneHotEncoder(
    min_frequency=10,
    sparse_output=False,
    handle_unknown='infrequent_if_exist',
).set_output(transform='pandas')

def housing_pipeline(df_orig, fit_enc=True):
    """Build the model feature matrix from a raw listings frame.

    The result contains, in this order:
      1. degree-5 polynomial features of the numeric columns plus three derived
         numeric columns (``url_num``, ``building_type_val``, ``condition_val``);
      2. one-hot columns for building type, condition, district and street;
      3. three flags: ``english_seller``, ``is_elite``, ``big_ceiling``;
      4. for every column from groups 2 and 3, its product with ``area^3``,
         ``area^4``, ``ceiling_height^2``, ``max_floor^3`` and ``max_floor^4``.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Raw listings. The code reads the columns ``url``, ``building_type``,
        ``condition``, ``district`` and ``street`` and expects numeric columns
        named ``area``, ``ceiling_height`` and ``max_floor``. A ``price``
        column, if present, is excluded from the features. The frame is not
        modified.
    fit_enc : bool, default True
        When True, the module-level ``poly`` and ``ohe`` transformers are
        (re)fitted on this frame before transforming. Pass False for data that
        must be encoded with the already-fitted transformers (e.g. test data).

    Returns
    -------
    pandas.DataFrame
        Feature matrix indexed like ``df_orig``.

    Raises
    ------
    KeyError
        If ``building_type`` or ``condition`` contains a label that is missing
        from ``building_type_dict`` / ``condition_dict``.
    """
    # Copy so that the column edits below can never write into (a view of) the
    # caller's frame. `_get_numeric_data` is kept on purpose: swapping it for
    # another selector could change which columns are picked and therefore the
    # feature layout the saved models were trained on.
    df1 = df_orig._get_numeric_data().copy()
    if 'price' in df1.columns:
        df1 = df1.drop('price', axis=1)

    # Integer taken from the URL: characters from offset 31 up to the next '/'.
    # NOTE(review): presumably offset 31 skips a fixed-length site prefix and
    # the integer is the listing id - confirm against the actual URL format.
    df1['url_num'] = df_orig.url.map(lambda x: x[31:]).map(lambda x: int(x[:x.find('/')]))
    # Explicit dict lookups (not Series.map(dict)) so unknown labels raise
    # KeyError instead of silently becoming NaN.
    df1['building_type_val'] = df_orig.building_type.map(lambda x: building_type_dict[x])
    df1['condition_val'] = df_orig.condition.map(lambda x: condition_dict[x])

    ohe_feats = ['building_type', 'condition', 'district', 'street']
    if fit_enc:
        poly.fit(df1)
        ohe.fit(df_orig[ohe_feats])
    df1 = poly.transform(df1)
    df_ohe = ohe.transform(df_orig[ohe_feats])

    # Everything appended after this position takes part in the interactions.
    len_temp = df1.shape[1]
    df1 = pd.concat([df1, df_ohe], axis=1)

    # URL tail starting at the first '-' found after offset 34. It is kept in a
    # local Series instead of being written back into df_orig.
    url_script = df_orig.url.map(lambda x: x[34:]).map(lambda x: x[x.find('-'):])
    df1['english_seller'] = (url_script == '-for-sale-in-Yerevan')
    df1['is_elite'] = (
        (df1['area'] >= 115)
        & (df1['ceiling_height'] >= 2.9)
        & (df_orig['building_type'] == 'monolit')
        & (df_orig['condition'] == 'newly repaired')
    ) * 1
    df1['big_ceiling'] = ((df1['ceiling_height'] > 2.8) & (df1['ceiling_height'] < 3.3)) * 1

    # Interaction terms. The pieces are collected and concatenated once at the
    # end; the column order matches appending five columns per feature in turn.
    base_terms = ['area^3', 'area^4', 'ceiling_height^2', 'max_floor^3', 'max_floor^4']
    pieces = [df1]
    for i in list(df1.columns[len_temp:]):
        temp_df = pd.concat([df1[term] * df1[i] for term in base_terms], axis=1)
        temp_df.columns = [f'{term}*{i}' for term in base_terms]
        pieces.append(temp_df)

    return pd.concat(pieces, axis=1)

In [3]:
# Feature scaler returns a DataFrame so column names stay available for
# selecting the Ridge feature subset; the target scaler works on plain arrays.
x_scaler = MinMaxScaler().set_output(transform='pandas')
y_scaler = MinMaxScaler()

In [4]:
# Listings priced at or above this value are dropped before the scalers are fitted.
PRICE_CAP = 400000

# Read the training file once and split it into target and raw features.
train_raw = pd.read_csv('houses_train.csv', index_col='Unnamed: 0')
Y = train_raw['price'].values.reshape(-1, 1)
X = housing_pipeline(train_raw.drop(['price'], axis=1))

# 1-D row mask, so the same mask indexes both the DataFrame and the (n, 1) array.
keep_rows = (Y < PRICE_CAP).ravel()
X = X[keep_rows]
Y = Y[keep_rows]

# The scalers are fitted on the (filtered) training data only.
X = x_scaler.fit_transform(X)
Y = y_scaler.fit_transform(Y)

# SECURITY: pickle.load can execute arbitrary code while unpickling.
# Only load model files that come from a trusted source.
with open('reg_l.pkl', 'rb') as fh:
    reg_l = pickle.load(fh)
with open('reg_r.pkl', 'rb') as fh:
    reg_r = pickle.load(fh)

# The saved Lasso model must line up with the regenerated feature columns,
# otherwise the name -> coefficient mapping below would be meaningless.
assert len(reg_l.coef_) == X.shape[1], 'reg_l does not match the pipeline feature count'

# Features with a non-zero Lasso coefficient, mapped to |coefficient|.
# The keys are used later to select the columns passed to reg_r.
coefs = {name: weight for name, weight in zip(X.columns, np.abs(reg_l.coef_)) if weight > 0}


In [5]:
# Sanity check: number of entries in `coefs`, i.e. features whose Lasso
# coefficient has a non-zero absolute value. Should be 190.
len(coefs)

190

In [6]:
print('mae and rmse of Lasso in train data')
# Undo the target scaling so both errors are reported in original price units.
lasso_train_true = y_scaler.inverse_transform(Y)
lasso_train_pred = y_scaler.inverse_transform(reg_l.predict(X).reshape(-1, 1))
mae(lasso_train_true, lasso_train_pred), np.sqrt(mse(lasso_train_true, lasso_train_pred))

mae and rmse of Lasso in train data


(13534.100709361946, 20112.81324238748)

In [7]:
print('mae and rmse of Ridge in train data')
# Ridge only receives the columns that Lasso kept (the keys of `coefs`).
# Undo the target scaling so both errors are reported in original price units.
ridge_train_true = y_scaler.inverse_transform(Y)
ridge_train_pred = y_scaler.inverse_transform(reg_r.predict(X[coefs.keys()]).reshape(-1, 1))
mae(ridge_train_true, ridge_train_pred), np.sqrt(mse(ridge_train_true, ridge_train_pred))

mae and rmse of Ridge in train data


(13324.13847568569, 19641.5233146517)

# 
# Now we need to load the test data; the file names in the next cell must match your test files
# 

In [8]:
# Test inputs - change these two names to point at your own files.
TEST_FEATURES_CSV = 'houses_test.csv'
TEST_ANSWERS_CSV = 'houses_test_answers.csv'

Y_test = pd.read_csv(TEST_ANSWERS_CSV, index_col='Unnamed: 0').values

# fit_enc=False: reuse the encoders fitted on the training data, then apply the
# already-fitted feature scaler.
X_test = x_scaler.transform(
    housing_pipeline(pd.read_csv(TEST_FEATURES_CSV, index_col='Unnamed: 0'), fit_enc=False)
)

In [9]:
# Each model predicts in scaled target space; map straight back to price units.
# pred1: Lasso on all features. pred2: Ridge on the Lasso-selected columns.
pred1 = y_scaler.inverse_transform(reg_l.predict(X_test).reshape(-1, 1))
pred2 = y_scaler.inverse_transform(reg_r.predict(X_test[coefs.keys()]).reshape(-1, 1))

# Average of the two predictions, rounded to the nearest 1000:
# (pred1 + pred2) / 2000 is the mean expressed in thousands.
ans = np.rint((pred1 + pred2) / 2000) * 1000

# 
# comparing the results
# 

In [10]:
# Root-mean-squared error between the test answers (Y_test) and the
# predictions in `ans`.
np.sqrt(mse(Y_test, ans))

19353.44332321951