In [None]:
import xgboost as xgb
import lightgbm as lgbm
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LarsCV
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
import pandas as pd
import numpy as np
from geopy.distance import geodesic 
import math
from sklearn.metrics import mean_absolute_error, r2_score, median_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina' 
import warnings
warnings.filterwarnings('ignore')

In [None]:
num_folds = 10
n_iter = 1000
n_estimators = 100
scoring = 'r2'

models = []
models.append(('R', Ridge()))
models.append(('L', Lasso()))
models.append(('ELN', ElasticNet()))
models.append(('LARS', LarsCV()))
models.append(('BR', BayesianRidge(n_iter=n_iter)))
models.append(('KNR', KNeighborsRegressor()))
models.append(('DTR', DecisionTreeRegressor()))
models.append(('LSVR', LinearSVR()))
models.append(('SVR', SVR()))
models.append(('ABR', AdaBoostRegressor(n_estimators=n_estimators)))
models.append(('BR', BaggingRegressor(n_estimators=n_estimators)))
models.append(('ETR', ExtraTreesRegressor(n_estimators=n_estimators)))
models.append(('GBR', GradientBoostingRegressor(n_estimators=n_estimators)))
models.append(('RFR', RandomForestRegressor(n_estimators=n_estimators)))
models.append(('XGB', xgb.XGBRegressor(n_estimators=n_estimators)))
models.append(('LGBM', lgbm.LGBMRegressor(n_estimators=n_estimators)))


In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/MaltsevaNata/FlatPrices_Vahta/master/prediction_model/flat_msk_complete.csv')

def get_azimuth(lat, lng):
 
    rad = 6372795

    llat1 = city_center_coordinates[0]
    llong1 = city_center_coordinates[1]
    llat2 = lat
    llong2 = lng

    lat1 = llat1*math.pi/180.
    lat2 = llat2*math.pi/180.
    long1 = llong1*math.pi/180.
    long2 = llong2*math.pi/180.

    cl1 = math.cos(lat1)
    cl2 = math.cos(lat2)
    sl1 = math.sin(lat1)
    sl2 = math.sin(lat2)
    delta = long2 - long1
    cdelta = math.cos(delta)
    sdelta = math.sin(delta)

    y = math.sqrt(math.pow(cl2*sdelta,2)+math.pow(cl1*sl2-sl1*cl2*cdelta,2))
    x = sl1*sl2+cl1*cl2*cdelta
    ad = math.atan2(y,x)

    x = (cl1*sl2) - (sl1*cl2*cdelta)
    y = sdelta*cl2
    z = math.degrees(math.atan(-y/x))

    if (x < 0):
        z = z+180.

    z2 = (z+180.) % 360. - 180.
    z2 = - math.radians(z2)
    anglerad2 = z2 - ((2*math.pi)*math.floor((z2/(2*math.pi))) )
    angledeg = (anglerad2*180.)/math.pi
    
    return round(angledeg, 2)

from geopy.distance import geodesic
import math
import numpy as np
from sklearn.preprocessing import LabelEncoder

city_center_coordinates = [55.7522, 37.6156]
df['distance'] = list(map(lambda x, y: geodesic(city_center_coordinates, [x, y]).meters, df['lat'], df['lng']))
df['azimuth'] = list(map(lambda x, y: get_azimuth(x, y), df['lat'], df['lng']))     

df = df.loc[(df['distance'] < 40000)] 
df['distance'] = df['distance'].round(0)
df['azimuth'] = df['azimuth'].round(0)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5181 entries, 0 to 5180
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   total_area     5181 non-null   float64
 1   living_area    5181 non-null   float64
 2   kitchen_area   5181 non-null   float64
 3   floor_number   5181 non-null   int64  
 4   total_floors   5181 non-null   int64  
 5   year           5181 non-null   float64
 6   material_type  5181 non-null   int64  
 7   lng            5181 non-null   float64
 8   lat            5181 non-null   float64
 9   underground    5181 non-null   int64  
 10  price          5181 non-null   int64  
 11  distance       5181 non-null   float64
 12  azimuth        5181 non-null   float64
dtypes: float64(8), int64(5)
memory usage: 566.7 KB


In [None]:
first_quartile = df.quantile(q=0.25)
third_quartile = df.quantile(q=0.75)
IQR = third_quartile - first_quartile
outliers = df[(df > (third_quartile + 1.5 * IQR)) | (df < (first_quartile - 1.5 * IQR))].count(axis=1)
outliers.sort_values(axis=0, ascending=False, inplace=True)

# Deleting some lines
outliers = outliers.head(100)
df.drop(outliers.index, inplace=True)

categorical_columns = df.columns[df.dtypes == 'object']
labelencoder = LabelEncoder()
for column in categorical_columns:
    df[column] = labelencoder.fit_transform(df[column])
    print(dict(enumerate(labelencoder.classes_)))

df.info()

{0: 'block', 1: 'brick', 2: 'monolith', 3: 'monolithBrick', 4: 'old', 5: 'panel', 6: 'stalin', 7: 'wireframe'}
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5181 entries, 0 to 9831
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   total_area     5181 non-null   float64
 1   living_area    5181 non-null   float64
 2   kitchen_area   5181 non-null   float64
 3   floor_number   5181 non-null   int64  
 4   total_floors   5181 non-null   int64  
 5   year           5181 non-null   float64
 6   material_type  5181 non-null   int64  
 7   lng            5181 non-null   float64
 8   lat            5181 non-null   float64
 9   underground    5181 non-null   int64  
 10  price          5181 non-null   int64  
 11  distance       5181 non-null   float64
 12  azimuth        5181 non-null   float64
dtypes: float64(8), int64(5)
memory usage: 566.7 KB


In [None]:
y = df['price']
features = [
            'total_area',
            'living_area',
            'kitchen_area',
            'floor_number',
            'total_floors',
            'year',
            'material_type',
            'underground',
            'distance',
            'azimuth'
           ]

X = df[features]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.15)

In [None]:
scores = []
names = []
results = []
predictions = []
msg_row = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=None)
    cv_results = cross_val_score(model,train_X, train_y, cv=kfold, scoring=scoring)
    names.append(name)
    results.append(cv_results)
    m_fit = model.fit(train_X, train_y)
    m_predict = model.predict(val_X)
    predictions.append(m_predict)
    m_score = model.score(val_X, val_y)
    scores.append(m_score)
    msg = "%s: train = %.3f (%.3f) / test = %.3f" % (name, cv_results.mean(), cv_results.std(), m_score)
    msg_row.append(msg)
    print(msg)

R: train = 0.370 (0.179) / test = 0.574
L: train = 0.370 (0.179) / test = 0.574
ELN: train = 0.370 (0.179) / test = 0.574
LARS: train = 0.365 (0.172) / test = 0.569
BR: train = 0.371 (0.178) / test = 0.574
KNR: train = 0.453 (0.205) / test = 0.567
DTR: train = -0.249 (0.922) / test = -0.541
LSVR: train = 0.135 (0.030) / test = 0.191
SVR: train = -0.082 (0.026) / test = -0.116
ABR: train = -0.149 (1.058) / test = 0.377
BR: train = 0.373 (0.365) / test = 0.668
ETR: train = 0.263 (0.646) / test = 0.635
GBR: train = 0.214 (0.544) / test = 0.505
RFR: train = 0.352 (0.422) / test = 0.645
XGB: train = 0.182 (0.488) / test = 0.583
LGBM: train = 0.376 (0.329) / test = 0.651


Best result: BR, ETR, RFR, LGBM

Improving learning parameters for this methods:

In [None]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

#Вычисляет медианную абсолютную процентную ошибку
def median_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.median(np.abs((y_true - y_pred) / y_true)) * 100

#Печатает рассчитанные значения коэффициента детерминации, средней и медианной абсолютных ошибок
def print_metrics(prediction, val_y):
    val_mae = mean_absolute_error(val_y, prediction)
    median_AE = median_absolute_error(val_y, prediction)
    r2 = r2_score(val_y, prediction)

    print('')
    print('R\u00b2: {:.2}'.format(r2))
    print('')
    print('Mean absolute error: {:.3} %'.format(mean_absolute_percentage_error(val_y, prediction)))
    print('Median absolute error: {:.3} %'.format(median_absolute_percentage_error(val_y, prediction)))

In [None]:
rf_model = RandomForestRegressor(n_estimators=100, 
                                 n_jobs=-1,  
                                 bootstrap=True,
                                 criterion='mse',
                                 max_features=3,
                                 random_state=7,
                                 max_depth=55,
                                 min_samples_split=2
                                 )

rf_model.fit(train_X, train_y)

rf_prediction = rf_model.predict(val_X).round(0)

print_metrics(rf_prediction, val_y)


R²: 0.67

Mean absolute error: 1.41e+02 %
Median absolute error: 7.93 %


In [None]:
br_model = BaggingRegressor(n_estimators=2000,
                            max_samples=0.6,
                            max_features=1.0)

br_model.fit(train_X, train_y)

br_prediction = br_model.predict(val_X).round(0)

print_metrics(br_prediction, val_y)




R²: 0.71

Mean absolute error: 1.44e+02 %
Median absolute error: 8.65 %


In [None]:
type(br_model)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=0.6, n_estimators=2000,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False)

In [None]:
etr_model = ExtraTreesRegressor(n_estimators=1000,
                                max_depth=50,
                                min_samples_split=5,
                                max_samples=0.7)
etr_model.fit(train_X, train_y)

etr_prediction = etr_model.predict(val_X).round(0)

print_metrics(etr_prediction, val_y)


R²: 0.67

Mean absolute error: 1.45e+02 %
Median absolute error: 7.24 %


In [None]:
lgbm_model = lgbm.LGBMRegressor(boosting_type='dart',
                                     num_leaves=50,
                                     max_depth=-1,
                                     n_estimators=100)
lgbm_model.fit(train_X, train_y)

lgbm_prediction = lgbm_model.predict(val_X).round(0)

print_metrics(lgbm_prediction, val_y)


R²: 0.7

Mean absolute error: 1.31e+02 %
Median absolute error: 11.6 %


In [None]:
import pickle
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [77]:
with open('/content/gdrive/My Drive/pickle_model.pkl', 'wb') as f:
    pickle.dump(br_model, f)

# Load from file
with open('/content/gdrive/My Drive/pickle_model.pkl', 'rb') as f:
    pickle_model = pickle.load(f)
    
# Calculate the accuracy score and predict target values
score = pickle_model.score(val_X, val_y)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(val_X)

Test score: 70.51 %


In [None]:
from sklearn.externals import joblib

In [78]:
with open('/content/gdrive/My Drive/moscow_model.pkl', 'wb') as f:
  joblib.dump(br_model, f)

# Load from file
with open('/content/gdrive/My Drive/moscow_model.pkl', 'rb') as f:
  joblib_model = joblib.load(f)

# Calculate the accuracy and predictions
score = joblib_model.score(val_X, val_y)
print("Test score: {0:.2f} %".format(100 * score))


Test score: 70.51 %
