In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import make_scorer, mean_squared_error , r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
import pickle

In [2]:
# Seed NumPy's global RNG once, up front, so later cells that draw from it
# (e.g. the shuffled train/test split) are repeatable on Restart & Run All.
SEED = 101
np.random.seed(SEED)

In [3]:
# Training data, read from a path relative to the notebook's working directory.
TRAIN_CSV = 'houses_train.csv'
df = pd.read_csv(TRAIN_CSV)

In [4]:
# Peek at the first five raw rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,price,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,4598,100000.0,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,5940,52000.0,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,2302,52000.0,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,5628,130000.0,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,760,81600.0,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0


In [5]:
# Columns removed from the numeric frame: the categorical ones that are
# one-hot encoded separately below, plus the ones the model does not use.
dropped_columns = [
    'condition',
    'district',
    'street',
    'region',
    'url',
    'building_type',
    'Unnamed: 0',
    'max_floor',
]
df1 = df.drop(columns=dropped_columns)

In [6]:
# Peek at the first five rows of the numeric-only frame.
df1.head(n=5)

Unnamed: 0,price,num_rooms,area,num_bathrooms,floor,ceiling_height
0,100000.0,3,96.0,1,4,3.0
1,52000.0,3,78.0,1,10,2.8
2,52000.0,3,97.0,1,1,2.8
3,130000.0,3,80.0,1,2,3.2
4,81600.0,3,107.0,1,9,3.0


One-Hot Encoding

In [7]:
# Categorical columns that get expanded into 0/1 indicator columns.
categorical_cols = ['condition', 'district', 'street', 'building_type']

# min_frequency=5 lumps categories seen fewer than 5 times into a single
# "infrequent" column per feature; handle_unknown='ignore' lets transform()
# accept categories that were not present when the encoder was fitted.
# NOTE(review): the encoder is fitted on the full frame before the train/test
# split, so category frequencies from the held-out rows influence the encoding.
ohe = OneHotEncoder(handle_unknown='ignore', min_frequency = 5)

# fit_transform() already fits the encoder; a separate fit() call beforehand
# only repeats the same work.
enc = ohe.fit_transform(df[categorical_cols]).astype('int').toarray()
labels = ohe.get_feature_names_out()

# Build the indicator frame on df1's index so the column-wise concat below
# pairs rows by label rather than relying on both frames having a default index.
df2 = pd.DataFrame(enc, columns = labels, index = df1.index)
encoded = pd.concat([df1, df2], axis=1)
encoded

Unnamed: 0,price,num_rooms,area,num_bathrooms,floor,ceiling_height,condition_good,condition_newly repaired,condition_zero condition,district_Achapnyak,...,street_Yekmalyan St,street_Z. Sarkavag St,street_Z.Andranik St,street_Zakyan St,street_Zavaryan St,street_infrequent_sklearn,building_type_monolit,building_type_other,building_type_panel,building_type_stone
0,100000.0,3,96.0,1,4,3.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,52000.0,3,78.0,1,10,2.8,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,52000.0,3,97.0,1,1,2.8,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
3,130000.0,3,80.0,1,2,3.2,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,81600.0,3,107.0,1,9,3.0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,70000.0,3,97.0,1,4,2.8,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4997,77000.0,3,71.0,1,4,2.8,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4998,46000.0,1,40.0,1,2,3.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4999,99000.0,4,118.0,2,14,3.0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


Train/Test Split

In [8]:
# Features are every column except the target; columns.difference() returns
# them in sorted order, which fixes the feature order used downstream.
feature_columns = encoded.columns.difference(['price'])

# Pin random_state so the split no longer depends on how much of the global
# NumPy RNG stream has been consumed before this cell runs (e.g. when the cell
# is re-executed without re-running the seeding cell).
X_train, X_test, y_train, y_test = train_test_split(
    encoded[feature_columns],
    encoded['price'],
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [9]:
from sklearn.preprocessing import PolynomialFeatures

# Degree of the polynomial feature expansion.
deg = 2
poly = PolynomialFeatures(degree = deg)

# Fit the transformer on the training features only, then reuse the fitted
# transformer for the test features. Calling fit_transform() on the test set
# would re-fit `poly` on held-out data and overwrite its fitted state.
X_Poly_train = poly.fit_transform(X_train)
X_Poly_test = poly.transform(X_test)

Regression Model

In [10]:
# Ridge regression on the polynomial features.
# NOTE(review): alpha looks like the outcome of a hyperparameter search that is
# not shown in this notebook — confirm where the value comes from.
regr = linear_model.Ridge(alpha=27.825594022071243)
regr.fit(X_Poly_train, y_train)

# Root-mean-squared error on the held-out split, in the same units as `price`.
y_pred = regr.predict(X_Poly_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Persist the fitted regressor. Only the Ridge model is saved here; the fitted
# encoder (`ohe`) and polynomial transformer (`poly`) are not part of this file.
# The context manager guarantees the file is flushed and closed after writing.
filename = 'saved_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regr, model_file)

In [11]:
# Display the RMSE computed on the held-out test split.
rmse

18868.03956761459

In [12]:
# Reload the regressor that was written to `filename`.
# SECURITY: unpickling executes arbitrary code embedded in the file — only
# load model files that come from a trusted source.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [13]:
def final_predict(final_test_df):
    """Predict prices for a raw listings frame using the fitted pipeline.

    Relies on the notebook-level objects `ohe` (fitted one-hot encoder),
    `labels` (its output column names), `poly` (fitted polynomial transformer)
    and `model` (fitted regressor). None of them is re-fitted here.

    Parameters
    ----------
    final_test_df : pandas.DataFrame
        Frame with the same raw columns as the training CSV. A `price` column
        is ignored if present. The index may be arbitrary.

    Returns
    -------
    Whatever `model.predict` returns: one prediction per input row, in input
    row order.

    Raises
    ------
    KeyError
        If one of the expected raw columns is missing from `final_test_df`.
    """
    dropped_columns = ['condition', 'district', 'street', 'region', 'url',
                       'building_type', 'Unnamed: 0', 'max_floor']
    categorical_columns = ['condition', 'district', 'street', 'building_type']

    test_df1 = final_test_df.drop(columns=dropped_columns)
    test_enc = ohe.transform(final_test_df[categorical_columns]).astype('int').toarray()

    # Reuse the input's index for the indicator frame: concat(axis=1) aligns on
    # index labels, so a default 0..n-1 index here would produce NaN-padded,
    # misaligned rows whenever the input frame is filtered or re-indexed.
    test_df2 = pd.DataFrame(test_enc, columns=labels, index=final_test_df.index)
    test_encoded = pd.concat([test_df1, test_df2], axis=1)

    # Same feature selection as at training time (sorted, target excluded).
    test_pred_data = test_encoded[test_encoded.columns.difference(['price'])]

    # transform(), not fit_transform(): inference must not re-fit the shared
    # polynomial transformer on the data being predicted.
    poly_data = poly.transform(test_pred_data)

    predictions = model.predict(poly_data)
    return predictions

In [None]:
# Load the held-out listings under their own name: rebinding `df` here would
# silently replace the training frame that earlier cells read from, so
# re-running those cells afterwards would operate on the wrong data.
test_df = pd.read_csv('houses_test.csv')
final_predict(test_df)