In [37]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [38]:
import sqlalchemy

Server = "localhost"
Database = "Homeprices_Baku"
Driver = "ODBC Driver 17 for SQL Server" # in order to get this info, click Win + S and write 'odbc', then open drivers menu and scroll down
engine = sqlalchemy.create_engine(f'mssql://@{Server}/{Database}?driver={Driver}')

In [39]:
tbl = pd.read_sql('Select * from Houses', engine)
tbl

Unnamed: 0,ID,Location,Rooms,Area,GroundArea,Price
0,1,8-ci kilometr q.,6.0,538.0,4.0,900000
1,2,Mərdəkan q.,4.0,160.0,3.5,240000
2,3,Mərdəkan q.,5.0,200.0,6.0,390000
3,4,Hövsan q.,9.0,280.0,5.5,150000
4,5,Badamdar q.,12.0,360.0,8.0,950000
...,...,...,...,...,...,...
9963,9964,Mərdəkan q.,5.0,160.0,5.5,310000
9964,9965,Yasamal r.,1.0,28.0,1.0,67000
9965,9966,Mərdəkan q.,10.0,700.0,10.5,1400000
9966,9967,Hövsan q.,3.0,70.0,1.0,29500


In [40]:
tbl.isna().sum()

ID              0
Location      740
Rooms         694
Area            0
GroundArea     14
Price           0
dtype: int64

In [41]:
tbl.dropna(inplace = True)

In [42]:
tbl.dtypes

ID              int64
Location       object
Rooms         float64
Area          float64
GroundArea    float64
Price           int64
dtype: object

In [43]:
df = tbl.drop(['ID'], axis = 'columns')
df

Unnamed: 0,Location,Rooms,Area,GroundArea,Price
0,8-ci kilometr q.,6.0,538.0,4.0,900000
1,Mərdəkan q.,4.0,160.0,3.5,240000
2,Mərdəkan q.,5.0,200.0,6.0,390000
3,Hövsan q.,9.0,280.0,5.5,150000
4,Badamdar q.,12.0,360.0,8.0,950000
...,...,...,...,...,...
9963,Mərdəkan q.,5.0,160.0,5.5,310000
9964,Yasamal r.,1.0,28.0,1.0,67000
9965,Mərdəkan q.,10.0,700.0,10.5,1400000
9966,Hövsan q.,3.0,70.0,1.0,29500


In [44]:
location_stats = df.groupby('Location')['Location'].agg('count').sort_values(ascending = False)
location_stats

Location
Mərdəkan q.           1882
Şüvəlan q.             518
Masazır q.             483
Badamdar q.            439
Binə q.                424
                      ... 
Ağdam r.                 1
Aşağı Güzdək q.          1
Köhnə Corat q.           1
Müşfiqabad q.            1
4-cü mikrorayon q.       1
Name: Location, Length: 101, dtype: int64

In [45]:
location_stats_less_than_10 = location_stats[location_stats <= 10]
location_stats_less_than_10

Location
Balaxanı q.           10
Günəşli q.            10
Digah q.               9
Sahil q.               9
Şimal DRES q.          9
Xocəsən q.             9
Yeni Suraxanı q.       6
7-ci mikrorayon q.     6
Pirəkəşkül q.          6
Qaradağ r.             5
Xutor q.               5
Ağ şəhər q.            4
Dədə Qorqud q.         4
Zuğulba q.             3
Sea Breeze             3
Çiçək q.               2
Astara r.              2
Puta q.                2
Ceyranbatan q.         2
Bahar q.               2
Qobustan q.            2
3-cü mikrorayon q.     2
Böyükşor q.            2
Kubinka q.             1
Ağdam r.               1
Aşağı Güzdək q.        1
Köhnə Corat q.         1
Müşfiqabad q.          1
4-cü mikrorayon q.     1
Name: Location, dtype: int64

In [46]:
df['price_per_sqft'] = df['Price'] / df['Area']
df

Unnamed: 0,Location,Rooms,Area,GroundArea,Price,price_per_sqft
0,8-ci kilometr q.,6.0,538.0,4.0,900000,1672.862454
1,Mərdəkan q.,4.0,160.0,3.5,240000,1500.000000
2,Mərdəkan q.,5.0,200.0,6.0,390000,1950.000000
3,Hövsan q.,9.0,280.0,5.5,150000,535.714286
4,Badamdar q.,12.0,360.0,8.0,950000,2638.888889
...,...,...,...,...,...,...
9963,Mərdəkan q.,5.0,160.0,5.5,310000,1937.500000
9964,Yasamal r.,1.0,28.0,1.0,67000,2392.857143
9965,Mərdəkan q.,10.0,700.0,10.5,1400000,2000.000000
9966,Hövsan q.,3.0,70.0,1.0,29500,421.428571


In [47]:
df['Location'] = df['Location'].apply(lambda x : 'other' if x in location_stats_less_than_10 else x)
df[df['Location'] == 'other']

Unnamed: 0,Location,Rooms,Area,GroundArea,Price,price_per_sqft
79,other,5.0,278.0,3.2,339000,1219.424460
221,other,4.0,160.0,3.0,180000,1125.000000
280,other,4.0,160.0,3.0,180000,1125.000000
314,other,3.0,66.0,0.1,58000,878.787879
355,other,8.0,250.0,5.3,105000,420.000000
...,...,...,...,...,...,...
9780,other,4.0,150.0,5.0,130000,866.666667
9782,other,4.0,146.0,3.5,75000,513.698630
9811,other,5.0,100.0,3.0,220000,2200.000000
9889,other,4.0,100.0,3.5,56000,560.000000


In [48]:
df['price_per_sqft'].describe()

count     8550.000000
mean      1314.273644
std       1177.004309
min         27.500000
25%        727.272727
50%       1138.732959
75%       1650.000000
max      50000.000000
Name: price_per_sqft, dtype: float64

In [49]:
[*df.groupby('Location')]

[('20-ci sahə q.',
             Location  Rooms   Area  GroundArea   Price  price_per_sqft
  389   20-ci sahə q.    3.0  100.0         4.0   80000      800.000000
  3469  20-ci sahə q.    5.0  200.0         1.5  135000      675.000000
  4015  20-ci sahə q.   10.0  210.0         3.0  160000      761.904762
  4577  20-ci sahə q.    5.0  140.0         4.5  115000      821.428571
  4711  20-ci sahə q.    4.0  110.0         1.0  150000     1363.636364
  5431  20-ci sahə q.    3.0   60.0         0.5   57000      950.000000
  6382  20-ci sahə q.    2.0   65.0         1.0   68000     1046.153846
  6446  20-ci sahə q.    5.0  205.0         0.6  135000      658.536585
  6732  20-ci sahə q.    4.0  100.0         1.0   80000      800.000000
  7473  20-ci sahə q.    3.0   75.0         1.0   85000     1133.333333
  9721  20-ci sahə q.    6.0  152.0         0.3  143000      940.789474),
 ('6-cı mikrorayon q.',
                  Location  Rooms    Area  GroundArea    Price  price_per_sqft
  102   6-cı

In [50]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('Location'):
        m = np.mean(subdf['price_per_sqft'])
        sd = np.std(subdf['price_per_sqft'])
        reduced_df = subdf[(subdf["price_per_sqft"] > (m - sd)) & (subdf["price_per_sqft"] <= (m + sd))]
        df_out = pd.concat([df_out, reduced_df], ignore_index = True)
    return df_out
df = remove_pps_outliers(df)
df.shape

(7231, 6)

In [51]:
def remove_room_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('Location'):
        room_stats = {}
        for room, room_df in location_df.groupby('Rooms'):
            room_stats[room] = {
                'mean' : np.mean(room_df['price_per_sqft']),
                'std' : np.std(room_df['price_per_sqft']),
                'count' : room_df.shape[0]
            }
        for room, room_df in location_df.groupby('Rooms'):
            stats = room_stats.get(room - 1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices, room_df[room_df['price_per_sqft'] < stats['mean']].index.values)
    return df.drop(exclude_indices, axis = 'index')

df = remove_room_outliers(df)
df.shape

(4596, 6)

In [52]:
df['Rooms'].unique()

array([ 3., 10.,  5.,  2.,  4.,  6., 11.,  8.,  9., 14.,  7., 15., 18.,
       12., 20.,  1., 13., 16.])

In [53]:
df[df['Rooms'] > 10]

Unnamed: 0,Location,Rooms,Area,GroundArea,Price,price_per_sqft
7,6-cı mikrorayon q.,11.0,330.0,6.3,460000,1393.939394
15,6-cı mikrorayon q.,11.0,330.0,6.2,460000,1393.939394
27,6-cı mikrorayon q.,11.0,330.0,6.3,460000,1393.939394
33,6-cı mikrorayon q.,14.0,800.0,12.0,1600000,2000.000000
39,6-cı mikrorayon q.,15.0,562.0,8.8,1200000,2135.231317
...,...,...,...,...,...,...
6613,other,11.0,600.0,10.0,700000,1166.666667
6821,Şüvəlan q.,12.0,450.0,10.0,590000,1311.111111
6846,Şüvəlan q.,12.0,750.0,11.0,1200000,1600.000000
7158,Şıxov q.,11.0,530.0,4.0,636000,1200.000000


In [54]:
df.drop(['price_per_sqft'], axis = 'columns', inplace = True)
df

Unnamed: 0,Location,Rooms,Area,GroundArea,Price
0,20-ci sahə q.,3.0,100.0,4.0,80000
1,20-ci sahə q.,10.0,210.0,3.0,160000
2,20-ci sahə q.,5.0,140.0,4.5,115000
3,20-ci sahə q.,3.0,60.0,0.5,57000
4,20-ci sahə q.,2.0,65.0,1.0,68000
...,...,...,...,...,...
7226,Əmircan q.,1.0,50.0,0.1,45000
7227,Əmircan q.,4.0,180.0,4.2,185000
7228,Əmircan q.,5.0,180.0,4.0,185000
7229,Əmircan q.,3.0,71.0,1.2,70000


In [55]:
df1 = pd.get_dummies(data = df, columns = ['Location'])
df1.drop(['Location_other'], axis = 'columns', inplace = True)
df1

Unnamed: 0,Rooms,Area,GroundArea,Price,Location_20-ci sahə q.,Location_6-cı mikrorayon q.,Location_8-ci kilometr q.,Location_8-ci mikrorayon q.,Location_9-cu mikrorayon q.,Location_Abşeron r.,...,Location_Yeni Ramana q.,Location_Yeni Yasamal q.,Location_Zabrat q.,Location_Zirə q.,Location_Zığ q.,Location_Şağan q.,Location_Şüvəlan q.,Location_Şıxov q.,Location_Əhmədli q.,Location_Əmircan q.
0,3.0,100.0,4.0,80000,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,10.0,210.0,3.0,160000,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,5.0,140.0,4.5,115000,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3.0,60.0,0.5,57000,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2.0,65.0,1.0,68000,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7226,1.0,50.0,0.1,45000,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7227,4.0,180.0,4.2,185000,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7228,5.0,180.0,4.0,185000,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7229,3.0,71.0,1.2,70000,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [56]:
df1.replace({True : 1, False : 0}, inplace = True)
df1

Unnamed: 0,Rooms,Area,GroundArea,Price,Location_20-ci sahə q.,Location_6-cı mikrorayon q.,Location_8-ci kilometr q.,Location_8-ci mikrorayon q.,Location_9-cu mikrorayon q.,Location_Abşeron r.,...,Location_Yeni Ramana q.,Location_Yeni Yasamal q.,Location_Zabrat q.,Location_Zirə q.,Location_Zığ q.,Location_Şağan q.,Location_Şüvəlan q.,Location_Şıxov q.,Location_Əhmədli q.,Location_Əmircan q.
0,3.0,100.0,4.0,80000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10.0,210.0,3.0,160000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.0,140.0,4.5,115000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,60.0,0.5,57000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,65.0,1.0,68000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7226,1.0,50.0,0.1,45000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7227,4.0,180.0,4.2,185000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7228,5.0,180.0,4.0,185000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7229,3.0,71.0,1.2,70000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [57]:
x = df1.drop(['Price'], axis = 'columns')
y = df1['Price']

In [58]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 10)

In [59]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
lr_clf = LinearRegression()
lr_clf.fit(x_train, y_train)
lr_clf.score(x_test, y_test)

0.8446001405536899

In [60]:
from sklearn.ensemble import RandomForestRegressor
rnd_clf = RandomForestRegressor()
rnd_clf.fit(x_train, y_train)
rnd_clf.score(x_test, y_test)

0.8763912228750887

In [61]:
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(x_train, y_train)
svm_clf.score(x_test, y_test)

0.008695652173913044

In [62]:
lasso_clf = Lasso()
lasso_clf.fit(x_train, y_train)
lasso_clf.score(x_test, y_test)

0.8446049758235619

In [63]:
ridge_clf = Ridge()
ridge_clf.fit(x_train, y_train)
ridge_clf.score(x_test, y_test)

0.8446241480495852

In [64]:
import pickle
with open('azerbaijan_house_prices_model.pickle', 'wb') as f:
    pickle.dump(rnd_clf, f)

In [65]:
# import json
# columns = {
#     'data_columns' : [col.lower() for col in x.columns]
# }
# with open('columns_house.json', 'w') as f:
#     f.write(json.dumps(columns))

In [66]:
def predict_price(location, Rooms, Area, ground_area):    
    try:
        loc_index = np.where(x.columns==f'Location_{location}')[0][0]
    except:
        loc_index = -1
        location = 'other'
    
    X = np.zeros(len(x.columns))
    X[0] = Rooms
    X[1] = Area
    X[2] = ground_area
    if loc_index >= 0:
        X[loc_index] = 1
        
    ans = rnd_clf.predict([X])[0]
    return ans

In [67]:
predict_price('8-ci kilometr q.', 6, 538, 4)

900000.0

In [68]:
x_train

Unnamed: 0,Rooms,Area,GroundArea,Location_20-ci sahə q.,Location_6-cı mikrorayon q.,Location_8-ci kilometr q.,Location_8-ci mikrorayon q.,Location_9-cu mikrorayon q.,Location_Abşeron r.,Location_Albalılıq q.,...,Location_Yeni Ramana q.,Location_Yeni Yasamal q.,Location_Zabrat q.,Location_Zirə q.,Location_Zığ q.,Location_Şağan q.,Location_Şüvəlan q.,Location_Şıxov q.,Location_Əhmədli q.,Location_Əmircan q.
3144,4.0,160.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2233,2.0,80.0,1.2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3240,4.0,240.0,9.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5105,6.0,310.0,5.3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4675,5.0,250.0,6.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3339,5.0,190.0,7.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1932,3.0,90.0,10.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5591,8.0,700.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2181,4.0,225.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
