In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as LR, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures as PF, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [129]:
building_type_dict = dict([('monolit', 2), ('panel', 1), ('stone', 1.44), ('other', '1.68')])
condition_dict = dict([('good', 1), ('newly repaired', 1.4), ('zero condition', 1.3)])

poly = PF(5, include_bias = False)
ohe = OneHotEncoder(min_frequency=10, sparse_output=False, handle_unknown='infrequent_if_exist')
poly.set_output(transform = 'pandas')
ohe.set_output(transform='pandas')

def housing_pipeline(df_orig, fit_enc=True):
    #df_orig.drop(df_orig[df_orig['price']>360000].index, inplace = True)
    #df_orig.index = np.arange(len(df_orig))
    df1 = df_orig._get_numeric_data()
    #df1.drop('price', axis = 1, inplace = True)
    df1['url_num'] = df_orig.url.map(lambda x: x[31:]).map(lambda x: int(x[:x.find('/')]))
    df1['building_type'] = df_orig.building_type.map(lambda x: building_type_dict[x])
    df1['condition'] = df_orig.condition.map(lambda x: condition_dict[x])
    
    
    ohe_feats = ['building_type', 'condition', 'district', 'street']
    if fit_enc:
        poly.fit(df1)
        ohe.fit(df_orig[ohe_feats])
    df1 = poly.transform(df1)
    df_ohe = ohe.transform(df_orig[ohe_feats])
    
    len_temp = df1.shape[1]
    df1 = pd.concat([df1, df_ohe], axis=1)
    df_orig['url_script'] = df_orig.url.map(lambda x: x[34:]).map(lambda x: x[x.find('-'):])
    df1['english_seller'] = (df_orig['url_script'] == '-for-sale-in-Yerevan')*1
    df1['is_elite'] = ((df1['area']>=115) & (df1['ceiling_height']>=2.9) & (df_orig['building_type']=='monolit') & (df_orig['condition'] == 'newly repaired'))*1
    df1['big_ceiling'] = ((df1['ceiling_height']>2.8) & (df1['ceiling_height']<3.3))*1

    for i in df1.copy().iloc[:, len_temp:].columns:
        temp_df = pd.concat([df1['area^3']*df1[i],
                             df1['area^4']*df1[i],
                             df1['ceiling_height^2']*df1[i],
                             df1['max_floor^3']*df1[i],
                             df1['max_floor^4']*df1[i]],
                           axis=1)
        temp_df.columns = [f'area^3*{i}', f'area^4*{i}', f'ceiling_height^2*{i}', f'max_floor^3*{i}', f'max_floor^4*{i}']
        df1 = pd.concat([df1, temp_df], axis=1)
    
    return df1

In [22]:
def reset_indx(df_to_reset):
    df_to_reset.index = np.arange(len(df_to_reset))

In [23]:
x_scaler = MinMaxScaler()
x_scaler.set_output(transform='pandas')
y_scaler = MinMaxScaler()

In [5]:
1/0

ZeroDivisionError: division by zero

In [84]:
poly_deg = 1

def kf_testing(x, y, features_kf, splits=10, alpha_l=1e-6, alpha_r = 0.1):
    y=y.reshape(-1,1)
    reg_l_kf = Lasso(alpha = alpha_l, max_iter=int(1e9))
    reg = Ridge(alpha = alpha_r, max_iter = int(1e9))
    cv = KFold(n_splits=splits, random_state=1, shuffle=True)
    train_score_l=[]
    test_score_l=[]
    train_score=[]
    test_score=[]
    sm_lasso = 0
    sm_combined = 0
    
    for train_index, test_index in cv.split(x):
        x_train, x_test, y_train, y_test = x.loc[train_index, :], x.loc[test_index, :], y[train_index], y[test_index]
        print('a')
        
        x_train = housing_pipeline(x_train)
        x_test = housing_pipeline(x_test, fit_enc=False)
        
        x_train = x_scaler.fit_transform(x_train)
        y_train = y_scaler.fit_transform(y_train)
        x_test = x_scaler.transform(x_test)
        
        reg_l_kf.fit(x_train, y_train)
        y_pred1_l = reg_l_kf.predict(x_train).reshape(-1,1)
        y_pred2_l = reg_l_kf.predict(x_test).reshape(-1,1)
        y_train = y_scaler.inverse_transform(y_train)
        #y_test = np.e**y_test
        y_pred1_l = y_scaler.inverse_transform(y_pred1_l)
        y_pred2_l = y_scaler.inverse_transform(y_pred2_l)      
        train_score_l.append(np.array([mae(y_train, np.rint(y_pred1_l/1000)*1000), np.sqrt(mse(y_train, np.rint(y_pred1_l/1000)*1000))]))
        test_score_l.append(np.array([mae(y_test, np.rint(y_pred2_l/1000)*1000), np.sqrt(mse(y_test, np.rint(y_pred2_l/1000)*1000))]))
        print(train_score_l[-1][1], test_score_l[-1][1], end=' | ')
        
        y_train = y_scaler.transform(y_train)
        coefs_kf = dict((x_train.columns[ind], i) for ind, i in enumerate(np.abs(reg_l_kf.coef_)) if i>0)
        reg.fit(x_train[coefs_kf.keys()], y_train)

        
        y_pred1 = reg.predict(x_train[coefs_kf.keys()]).reshape(-1,1)
        y_pred2 = reg.predict(x_test[coefs_kf.keys()]).reshape(-1,1)
        y_train = y_scaler.inverse_transform(y_train)
        #y_test = np.e**y_test
        y_pred1 = y_scaler.inverse_transform(y_pred1)
        y_pred2 = y_scaler.inverse_transform(y_pred2)
        #y_pred1 = y_scaler.inverse_transform(y_pred1)
        #y_pred2 = y_scaler.inverse_transform(y_pred2)

        train_score.append(np.array([mae(y_train, np.rint(y_pred1/1000)*1000), np.sqrt(mse(y_train, np.rint(y_pred1/1000)*1000))]))
        test_score.append(np.array([mae(y_test, np.rint(y_pred2/1000)*1000), np.sqrt(mse(y_test, np.rint(y_pred2/1000)*1000))]))
        print(train_score[-1][1], test_score[-1][1], end=' | ')
        
        sm_combined+=np.sqrt(mse(y_test, np.rint((y_pred2_l+y_pred2)/2000)*1000))
        print(np.sqrt(mse(y_test, np.rint((y_pred2_l+y_pred2)/2000)*1000)))
        
        
    print('Ridge train =', np.array(train_score).mean(axis = 0))
    print('Ridge validation =', np.array(test_score).mean(axis = 0))
    print('Lasso validation =', np.array(test_score_l).mean(axis = 0))
    print('Lasso + Ridge validation =', sm_combined/splits)

# hyperparameter selection

In [None]:
df = pd.read_csv('houses_train.csv')
X_train, X_test, Y_train, Y_test = train_test_split(df.drop(['price'], axis = 1), df['price'], test_size=0.1, random_state=1)
reset_indx(X_train)
reset_indx(X_test)
reset_indx(Y_train)
reset_indx(Y_test)

#for poly_deg in range(4, 5):
poly_deg = 4
df_x_train = housing_pipeline(X_train, deg = poly_deg)
y = Y_train

df_x_train = x_scaler.fit_transform(df_x_train)
y = y.values.reshape(-1,1)
y = y_scaler.fit_transform(y)

print(df_x_train.shape, y.shape)

for l_a in np.arange(0.00001, 0.001, 0.00001):
    print(l_a)
    kf_testing(x=df_x_train, y=y, lasso_alpha=l_a, features_kf=df_x_train.columns)

In [None]:
mn = 3e9
kk = 0
for k,v in all_train_scores.items():
    if mn > v[1]:
        kk=k
        mn=v[1]
mn, kk

# end of hyperparameter selection
# 
# testing out new models

In [95]:
df = housing_pipeline(pd.read_csv('houses_train.csv'))
Y = pd.read_csv('houses_train.csv')['price']
Y = Y[Y<=360000]
X_train, X_test, Y_train, Y_test = train_test_split(df, Y.values.reshape(-1,1),
                                                    test_size=0.1,
                                                    random_state=1)
X = x_scaler.fit_transform(X_train)
Y = y_scaler.fit_transform(Y_train)
reg = Lasso(alpha = 6e-6, max_iter = int(1e9))
reg.fit(X, Y)

In [96]:
np.sqrt(mse(y_scaler.inverse_transform(Y), y_scaler.inverse_transform(reg.predict(X).reshape(-1,1))))

23505.081057984025

In [97]:
np.sqrt(mse(Y_test, y_scaler.inverse_transform(reg.predict(x_scaler.transform(X_test)).reshape(-1,1))))

24081.37460954546

In [99]:
df = housing_pipeline(pd.read_csv('houses_train.csv'))
Y = pd.read_csv('houses_train.csv')['price']
Y = Y[Y<=360000]
X_train, X_test, Y_train, Y_test = train_test_split(x_scaler.fit_transform(df),
                                                    y_scaler.fit_transform(Y.values.reshape(-1, 1)),
                                                    test_size=0.1,
                                                    random_state=1)
reg = Lasso(alpha = 6e-6, max_iter = int(1e9))
reg.fit(X_train, Y_train)

In [100]:
np.sqrt(mse(y_scaler.inverse_transform(Y_train), y_scaler.inverse_transform(reg.predict(X_train).reshape(-1,1))))

23507.936799406376

In [101]:
np.sqrt(mse(y_scaler.inverse_transform(Y_test), y_scaler.inverse_transform(reg.predict(X_test).reshape(-1,1))))

24077.85544912366

In [None]:
df = housing_pipeline(pd.read_csv('houses_train.csv'))
Y = pd.read_csv('houses_train.csv')['price']
Y = Y[Y<=360000]

X = x_scaler.fit_transform(df)
Y = y_scaler.fit_transform(Y)
reg = Lasso(alpha = 6e-6, max_iter = int(1e9))
reg.fit(X, Y)

In [None]:
np.sqrt(mse(y_scaler.inverse_transform(Y), y_scaler.inverse_transform(reg.predict(X).reshape(-1,1))))

# kf testing the results of lasso

In [11]:
df = housing_pipeline(pd.read_csv('houses_train.csv').drop(['price'],axis=1))
Y = pd.read_csv('houses_train.csv')['price'].values.reshape(-1, 1)
#Y = Y[Y<=360000].values.reshape(-1,1)

df = x_scaler.fit_transform(df)
Y = y_scaler.fit_transform(Y)

reg_l = Lasso(alpha=3e-5, max_iter=int(1e9))
reg_l.fit(df, Y)
mae(y_scaler.inverse_transform(Y), y_scaler.inverse_transform(reg_l.predict(df).reshape(-1,1))), np.sqrt(mse(y_scaler.inverse_transform(Y), y_scaler.inverse_transform(reg_l.predict(df).reshape(-1,1))))

(13647.77161060595, 20478.01831953173)

In [12]:
coefs = dict((df.columns[ind],i) for ind, i in enumerate(np.abs(reg_l.coef_)) if i>0)

In [13]:
len(coefs)

208

In [130]:
df = pd.read_csv('houses_train.csv', index_col='Unnamed: 0').drop(['price'],axis=1)
Y = pd.read_csv('houses_train.csv')['price'].values.reshape(-1, 1)
df = df[Y<=360000]
df.index = np.arange(len(df))
Y = Y[Y<=360000]

kf_testing(df, Y, coefs.keys(), splits=10, alpha_l = 4.5e-5, alpha_r = 1e-6)

a
20006.184555142547 21314.040536744786 | 19549.836449525646 21957.839329087004 | 21467.7226552329
a
20095.37072757669 20893.074450640335 | 19629.58234802947 20742.269885429607 | 20697.858826458356
a
20165.354358061046 20441.01819871016 | 19646.55592675818 20257.730006098907 | 20191.29082054934
a
20097.28167725194 21596.850233309488 | 19639.753455323844 21455.468766727052 | 21420.334731278126
a
20064.531298545815 21354.472178023974 | 19571.336800151217 20983.485268229393 | 21051.353163205447
a
20053.899215489186 21293.342861091587 | 19542.64892973759 21125.27231067093 | 21076.39651838046
a
20032.47826835847 21347.2109888388 | 19532.19702124306 21470.491866792432 | 21283.688989505554
a
20251.149143187125 19493.557397253073 | 19718.35686415548 19710.38254321818 | 19469.627115073366
a
19737.14304452147 23732.833855391465 | 19248.111763151366 23635.2577057165 | 23552.8002752031
a
19954.71935228706 22411.482442993038 | 19492.90355427372 22186.900398672213 | 22192.17468922038
Ridge train = [

In [136]:
df = pd.read_csv('houses_train.csv', index_col='Unnamed: 0').drop(['price'],axis=1)
Y = pd.read_csv('houses_train.csv')['price'].values.reshape(-1, 1)
df = df[Y<400000]
df.index = np.arange(len(df))
Y = Y[Y<400000]

kf_testing(df, Y, coefs.keys(), splits=10, alpha_l = 4e-5, alpha_r = 1e-6)

a
19900.43971711775 21318.187540267114 | 19435.596495012465 22949.839215166627 | 21793.455898548993
a
19998.830519415038 20831.153592636198 | 19538.84084932409 20906.452592441405 | 20706.630822033796
a
20053.623215508625 20337.896277638945 | 19588.870334321928 20203.1785865492 | 20139.60339728665
a
20000.730784809133 21561.436408551264 | 19570.026691030314 21482.08416332084 | 21399.07334442312
a
19951.870747691613 21252.634613195605 | 19491.692101028995 20854.444658201763 | 20932.431058097383
a
19937.615086930586 21220.565737039153 | 19474.91432975767 21022.43968239652 | 21016.938173768318
a
19922.241106001053 21324.33391695975 | 19422.377931041825 21464.818214976804 | 21285.765689821917
a
20127.87196161585 19467.315685527883 | 19620.52478169179 20065.3427580991 | 19661.52537317489
a
19626.167224419325 23677.75716512739 | 19159.348851452312 23727.860897424678 | 23550.962436073194
a
19856.080406728586 22311.064506222054 | 19410.58475222231 22132.94835777683 | 22152.451205019763
Ridge tr

In [68]:
df = housing_pipeline(pd.read_csv('houses_train.csv'))
Y = pd.read_csv('houses_train.csv')['price'].values.reshape(-1, 1)
#Y = Y[Y<=360000]

kf_testing(df, Y, coefs.keys(), alpha = 1e-6)

a
[13432.88444444 19998.24257392] [13630.54091816 20290.97928443]
a
[13362.49766719 19887.07009115] [14218.598      21579.54753933]
a
[13464.8371473 20173.5036755] [13552.898      18788.98877007]
a
[13406.17218396 19971.74337614] [13773.         20832.27159961]
a
[13158.02066208 19721.19544019] [15562.5        22871.20296355]
a
[13344.55476561 19984.84737373] [14261.88       20611.52942409]
a
[13272.87958232 19715.36928343] [14817.7        23024.93919645]
a
[13543.47922684 19968.69081888] [13381.402      20783.49248808]
a
[13477.73828038 19781.49277452] [13496.002     22404.3488636]
a
[13326.73028216 19935.72056023] [14703.58       21179.30216509]
train = [13378.97942423 19913.78759677]
test = [14139.81009182 21236.66022943]


In [99]:
df

Unnamed: 0.1,Unnamed: 0,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,4598,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,5940,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,2302,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,5628,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,760,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4993,3585,newly repaired,Arabkir,5,Griboedov St,3,Yerevan,97.0,http://www.myrealty.am/en/item/36852/3-senyaka...,1,stone,4,2.8
4994,3291,newly repaired,Arabkir,4,Orbeli Yeghbayrner St,3,Yerevan,71.0,http://www.myrealty.am/en/item/13933/Apartment...,1,stone,4,2.8
4995,5959,zero condition,Center,5,Mashtots Ave,1,Yerevan,40.0,http://www.myrealty.am/en/item/31190/1-senyaka...,1,stone,2,3.0
4996,542,newly repaired,Center,14,Argishti St,4,Yerevan,118.0,http://www.myrealty.am/en/item/25905/4-senyaka...,2,monolit,14,3.0


In [138]:
df = pd.read_csv('houses_train.csv', index_col='Unnamed: 0')

In [139]:
X_train, X_test, Y_train, Y_test = train_test_split(df.drop(['price'], axis = 1), df['price'], test_size=0.1, random_state=1)
X_test.to_csv('houses_test.csv')
Y_test.to_csv('houses_test_answers.csv')

In [133]:
Y_test

Unnamed: 0
2813     80000.0
1813    122000.0
2652     85000.0
1254    127000.0
4189     60000.0
          ...   
3191     75000.0
3872     83000.0
1542    105000.0
2202    100000.0
6557     52000.0
Name: price, Length: 501, dtype: float64