<img src='data/kaggle.jpg'>

In [43]:
import json
import os

import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# --- File locations ---
# Zipped CSV that is loaded with pd.read_csv.
path_data = "data/russiarealestate.zip"
# Destination of the JSON file holding the unique values of every feature.
path_unique_values = "data/unique_values.json"
# Destination of the joblib dump of the fitted pipeline.
path_model = "models/lr_pipeline.sav"

# --- Column roles ---
# Columns removed from the frame before modelling.
drop_cols = [
    "date",
    "time",
    "geo_lat",
    "geo_lon",
    "region",
]
# Columns cast to str and one-hot encoded by the preprocessor.
categorical_features = [
    "building_type",
    "object_type",
]
# Columns standardised by the preprocessor.
numeric_features = [
    "level",
    "levels",
    "rooms",
    "area",
    "kitchen_area",
]
# Not referenced by any cell visible in this notebook; "price" is used as the target.
passthrough_feats = [
    "price",
]


In [44]:
# Load the raw listings. pandas infers zip compression from the ".zip" extension;
# the archive must contain exactly one data file for this to work.
df = pd.read_csv(path_data)

In [45]:
# Preview the raw frame (pandas truncates the rendered rows).
df

Unnamed: 0,price,date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
0,6050000,2018-02-19,20:00:21,59.805808,30.376141,2661,1,8,10,3,82.6,10.8,1
1,8650000,2018-02-27,12:04:54,55.683807,37.297405,81,3,5,24,2,69.1,12.0,1
2,4000000,2018-02-28,15:44:00,56.295250,44.061637,2871,1,5,9,3,66.0,10.0,1
3,1850000,2018-03-01,11:24:52,44.996132,39.074783,2843,4,12,16,2,38.0,5.0,11
4,5450000,2018-03-01,17:42:43,55.918767,37.984642,81,3,13,14,2,60.0,10.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5477001,19739760,2021-05-01,20:13:58,55.804736,37.750898,3,1,8,17,4,93.2,13.8,11
5477002,12503160,2021-05-01,20:14:01,55.841415,37.489624,3,2,17,32,2,45.9,6.6,11
5477003,8800000,2021-05-01,20:14:04,56.283909,44.075408,2871,2,4,17,3,86.5,11.8,1
5477004,11831910,2021-05-01,20:14:12,55.804736,37.750898,3,1,8,33,2,52.1,18.9,11


In [46]:
# Discard every column listed in drop_cols; the result replaces df.
df = df.drop(labels=drop_cols, axis="columns")

In [47]:
# Cast the categorical columns to str so they are treated as labels rather than numbers.
df = df.astype({column: str for column in categorical_features})

In [48]:
# Summary statistics of the numeric columns.
# NOTE(review): the recorded min/max of price sit close to the 32-bit integer
# limits and min is negative -- presumably invalid entries; confirm against the data source.
df.describe()

Unnamed: 0,price,level,levels,rooms,area,kitchen_area
count,5477006.0,5477006.0,5477006.0,5477006.0,5477006.0,5477006.0
mean,4422029.0,6.21453,11.39892,1.726173,53.91825,10.6284
std,21507520.0,4.957419,6.535734,1.082133,33.35293,9.79238
min,-2144967000.0,1.0,1.0,-2.0,0.07,0.01
25%,1950000.0,2.0,5.0,1.0,38.0,7.0
50%,2990000.0,5.0,10.0,2.0,48.02,9.7
75%,4802000.0,9.0,16.0,2.0,63.13,12.7
max,2147484000.0,39.0,39.0,10.0,7856.0,9999.0


In [49]:
# Column dtypes and memory footprint after dropping columns and casting categoricals.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5477006 entries, 0 to 5477005
Data columns (total 8 columns):
 #   Column         Dtype  
---  ------         -----  
 0   price          int64  
 1   building_type  object 
 2   level          int64  
 3   levels         int64  
 4   rooms          int64  
 5   area           float64
 6   kitchen_area   float64
 7   object_type    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 334.3+ MB


In [50]:
# Preview the frame after the column drop and the categorical cast.
df

Unnamed: 0,price,building_type,level,levels,rooms,area,kitchen_area,object_type
0,6050000,1,8,10,3,82.6,10.8,1
1,8650000,3,5,24,2,69.1,12.0,1
2,4000000,1,5,9,3,66.0,10.0,1
3,1850000,4,12,16,2,38.0,5.0,11
4,5450000,3,13,14,2,60.0,10.0,1
...,...,...,...,...,...,...,...,...
5477001,19739760,1,8,17,4,93.2,13.8,11
5477002,12503160,2,17,32,2,45.9,6.6,11
5477003,8800000,2,4,17,3,86.5,11.8,1
5477004,11831910,1,8,33,2,52.1,18.9,11


In [51]:
# Exploratory check (disabled): shape of the frame when only prices between the
# 5th and 95th percentile are kept.
# df[df.price.between(df.price.quantile(0.05), df.price.quantile(0.95))].shape

(4950591, 8)

In [52]:
# Exploratory check (disabled): shape of the frame when only areas between the
# 1st and 99th percentile are kept.
# NOTE(review): the recorded output of this cell looks stale -- it does not agree
# with the row count obtained after the same filter in the executed cell; re-run to refresh.
# df[df.area.between(df.area.quantile(0.01), df.area.quantile(0.99))].shape

(0, 8)

In [53]:
# remove outliers
# NOTE: df is rebuilt from itself, so re-running this cell recomputes the
# percentiles on already-filtered data and trims further. Reload df first.

# Keep prices inside the 5th-95th percentile band (bounds inclusive).
price_lower = df["price"].quantile(0.05)
price_upper = df["price"].quantile(0.95)
df = df[df["price"].between(price_lower, price_upper)]

# Keep areas inside the 1st-99th percentile band; the percentiles are taken
# from the price-filtered frame, not from the full data set.
area_lower = df["area"].quantile(0.01)
area_upper = df["area"].quantile(0.99)
df = df[df["area"].between(area_lower, area_upper)]

# Discard rows whose rooms value is -2 or lower (-2 is the minimum reported by describe()).
has_valid_rooms = df["rooms"] > -2
df = df[has_valid_rooms]

In [54]:
# Rows and columns remaining after the outlier filter.
df.shape

(4858729, 8)

In [42]:
# NOTE(review): this cell carries an out-of-sequence execution count and an
# empty-frame output -- presumably a leftover from an earlier kernel session; re-run or remove.
df.shape

(0, 8)

In [22]:
# NOTE(review): out-of-sequence execution count and an empty recorded frame --
# presumably a leftover from an earlier kernel session; re-run or remove.
df

Unnamed: 0,price,building_type,level,levels,rooms,area,kitchen_area,object_type


In [55]:
# Separate the regression target from the feature columns.
target_name = "price"
y = df[target_name]
X = df.drop(columns=[target_name])

In [21]:
# NOTE(review): out-of-sequence execution count and an empty recorded Series --
# presumably a leftover from an earlier kernel session; re-run or remove.
y

Series([], Name: price, dtype: int64)

In [56]:

# Hold out 25% of the rows for evaluation; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [58]:
# create pipeline
# Numeric columns are standardised.
numeric_step = (StandardScaler(), numeric_features)
# Categorical columns are one-hot encoded with the first category of each column dropped.
# NOTE: with handle_unknown="ignore" and drop="first", a category unseen during
# fit is encoded as all zeros, i.e. exactly like the dropped first category.
categorical_step = (
    OneHotEncoder(handle_unknown="ignore", drop="first"),
    categorical_features,
)
# Columns not named in either step are dropped (make_column_transformer default).
preprocessor = make_column_transformer(numeric_step, categorical_step)

In [59]:
# Chain the preprocessing step and a linear regression into a single estimator.
regressor = LinearRegression()
clf = make_pipeline(preprocessor, regressor)



In [60]:
#train
# Fit on the training split, then predict prices for the held-out split
# (fit returns the fitted pipeline, so the calls can be chained).
y_prediction = clf.fit(X_train, y_train).predict(X_test)


In [61]:
# Mean absolute error of the predictions on the held-out split.
mae = mean_absolute_error(y_test, y_prediction)
print(mae)

1257689.4209031723


In [63]:
#Saving the model
# joblib.dump does not create missing parent folders, so make sure the target
# directory exists; otherwise a fresh checkout fails with FileNotFoundError.
model_dir = os.path.dirname(path_model)
if model_dir:  # empty when path_model has no directory component
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(clf, path_model)

#save unique values
# Map every feature column of X to the list of distinct values it contains;
# .tolist() turns NumPy scalars into plain Python values that json can serialise.
dict_unique = {key: X[key].unique().tolist() for key in X.columns}

In [None]:
# Write the per-column unique values to disk as JSON.
with open(path_unique_values, mode="w") as json_file:
    json.dump(dict_unique, json_file)