# Составление лучших моделей для регрессии

In [86]:
import streamlit as st
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor

# Path to the house-sales CSV, relative to the notebook's working directory.
DATASET_PATH = './kc_house_data.csv'

# Load the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)

# Bare expression at the end of the cell so the notebook renders the frame.
df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1,1180,5650,1,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2,2570,7242,2,0,0,3,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,180000.0,2,1,770,10000,1,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3,1960,5000,1,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2,1680,8080,1,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17645,360000.0,3,2,1530,1131,3,0,0,3,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
17646,400000.0,4,2,2310,5813,2,0,0,3,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
17647,402101.0,2,0,1020,1350,2,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
17648,400000.0,3,2,1600,2388,2,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [87]:
# Column the models are trained to predict.
TARGET_NAME = 'price'

# Columns used as model inputs.
BASE_FEATURES = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'zipcode',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

# Feature matrix and target vector taken from the loaded frame.
X, y = df[BASE_FEATURES], df[TARGET_NAME]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def display_metrics(y_test, y_pred, dataset_name='y_test'):
    """Print MSE, MAE, RMSE and R2 for a set of predictions.

    Args:
        y_test: True target values. Despite the parameter name this can be any
            split; the notebook also passes the training targets.
        y_pred: Predicted values, aligned element-wise with ``y_test``.
        dataset_name: Label inserted into every printed line to say which
            split the numbers describe. The default, ``'y_test'``, reproduces
            the text this function has always printed.

    Returns:
        None. The four metrics are written to stdout, one per line.
    """
    # (label, metric function) pairs, in the order they are printed.
    metric_rows = (
        ('MSE', mean_squared_error),
        ('MAE', mean_absolute_error),
        ('RMSE', root_mean_squared_error),
        ('R2 score', r2_score),
    )
    for title, metric in metric_rows:
        print(f'{title} for predicted values on {dataset_name} : {metric(y_test, y_pred)}')

## DecisionTreeRegressor

In [88]:
# Hyperparameters for the decision tree, collected in one place.
dt_params = {
    'max_depth': 9,
    'max_features': 18,
    'min_samples_leaf': 7,
    'min_samples_split': 27,
    'max_leaf_nodes': 80,
    'min_impurity_decrease': 0.001,
    'ccp_alpha': 0.05,
}
optimal_dt = DecisionTreeRegressor(**dt_params)

# fit() returns the estimator itself, so training and prediction are chained:
# train on the training split, predict the held-out split.
y_pred = optimal_dt.fit(X_train, y_train).predict(X_test)

display_metrics(y_test, y_pred)

MSE for predicted values on y_test : 8676854271.830341
MAE for predicted values on y_test : 67117.5163824548
RMSE for predicted values on y_test : 93149.63377185303
R2 score for predicted values on y_test : 0.7821424088103205


## GradientBoostingRegressor

In [89]:
# Hyperparameters for the gradient-boosting regressor.
gbr_params = {
    'max_depth': 5,
    'max_leaf_nodes': 160,
    'max_features': 6,
    'n_estimators': 150,
    'ccp_alpha': 50,
    'learning_rate': 0.027351263125863064,
    'min_samples_split': 12,
    'min_samples_leaf': 16,
}
optimal_gbr = GradientBoostingRegressor(**gbr_params)

# Train on the training split, then predict the held-out split in one chain
# (fit() returns the estimator itself).
y_pred = optimal_gbr.fit(X_train, y_train).predict(X_test)

display_metrics(y_test, y_pred)

MSE for predicted values on y_test : 6253043935.534187
MAE for predicted values on y_test : 56130.93578888569
RMSE for predicted values on y_test : 79076.19069943992
R2 score for predicted values on y_test : 0.8429991968608519


In [90]:
# Score the gradient-boosting model on the data it was trained on, to compare
# against its hold-out metrics.
#
# The model fitted in the previous cell is reused instead of being rebuilt:
# the estimator is created without a random_state while sub-sampling features
# (max_features=6), so a second fit is not guaranteed to reproduce the model
# whose test metrics were just reported, and it costs a full extra training run.
#
# NOTE: display_metrics labels its lines "y_test"; the numbers printed by this
# cell describe the training split.
y_pred = optimal_gbr.predict(X_train)

display_metrics(y_train, y_pred)

MSE for predicted values on y_test : 5267482894.915397
MAE for predicted values on y_test : 51514.75684718883
RMSE for predicted values on y_test : 72577.42689649032
R2 score for predicted values on y_test : 0.8641687873097621


## CatBoostRegressor

In [91]:
# Hyperparameters for the CatBoost regressor; verbose=False silences the
# per-iteration training log.
catboost_params = dict(
    iterations=400,
    learning_rate=0.07518469735486523,
    depth=6,
    l2_leaf_reg=10,
    subsample=0.8,
    colsample_bylevel=0.7599054732685009,
    verbose=False,
)
optimal_catboost_regressor = CatBoostRegressor(**catboost_params)

# Train on the training split.
optimal_catboost_regressor.fit(X_train, y_train)

# Predict the held-out split and report the metrics.
y_pred = optimal_catboost_regressor.predict(X_test)
display_metrics(y_test, y_pred)

MSE for predicted values on y_test : 4778821600.685454
MAE for predicted values on y_test : 49020.90052679581
RMSE for predicted values on y_test : 69129.02140697099
R2 score for predicted values on y_test : 0.8800138241308821


In [92]:
# Score the CatBoost model on the data it was trained on, to compare against
# its hold-out metrics.
#
# The model fitted in the previous cell is reused rather than constructing and
# training an identical one again, so the train and test metrics describe one
# and the same fitted model and no second training run is needed.
#
# NOTE: display_metrics labels its lines "y_test"; the numbers printed by this
# cell describe the training split.
y_pred = optimal_catboost_regressor.predict(X_train)

display_metrics(y_train, y_pred)

MSE for predicted values on y_test : 3698993934.112263
MAE for predicted values on y_test : 43639.086434386
RMSE for predicted values on y_test : 60819.354930090005
R2 score for predicted values on y_test : 0.9046150045804044


## Bagging

In [93]:
# Hyperparameters for the random forest (the bagging model of this notebook).
rf_params = {
    'max_depth': 15,
    'max_leaf_nodes': 100,
    'max_features': 6,
    'n_estimators': 200,
    'ccp_alpha': 70,
    'min_samples_split': 16,
    'min_samples_leaf': 3,
}
optimal_randomForestRegressor = RandomForestRegressor(**rf_params)

# Train on the training split, then predict the held-out split in one chain
# (fit() returns the estimator itself).
y_pred = optimal_randomForestRegressor.fit(X_train, y_train).predict(X_test)

display_metrics(y_test, y_pred)

MSE for predicted values on y_test : 6885568344.955077
MAE for predicted values on y_test : 59420.53239430743
RMSE for predicted values on y_test : 82979.32480416479
R2 score for predicted values on y_test : 0.8271178371090254


In [94]:
# Score the random forest on the data it was trained on, to compare against
# its hold-out metrics.
#
# The forest fitted in the previous cell is reused instead of being rebuilt:
# it is created without a random_state, so a second fit would bootstrap and
# sub-sample features differently and yield a different model from the one
# whose test metrics were just reported.
#
# NOTE: display_metrics labels its lines "y_test"; the numbers printed by this
# cell describe the training split.
y_pred = optimal_randomForestRegressor.predict(X_train)

display_metrics(y_train, y_pred)

MSE for predicted values on y_test : 5866502530.709727
MAE for predicted values on y_test : 55361.62010406196
RMSE for predicted values on y_test : 76593.09714791358
R2 score for predicted values on y_test : 0.8487220236128644


## Stacking

In [95]:
# Linear regression on degree-2 polynomial features.
poly = PolynomialFeatures(degree=2)

# The expansion is fitted on the training split only and then applied
# unchanged to the held-out split.
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

lr2 = LinearRegression()
lr2.fit(X_train_poly, y_train)

y_pred = lr2.predict(X_test_poly)

# display_metrics already computes and prints MSE and R2, so the separate
# mse / r2 variables that were computed here and never read have been dropped.
display_metrics(y_test, y_pred)

MSE for predicted values on y_test : 10749878741.655386
MAE for predicted values on y_test : 74709.8737267578
RMSE for predicted values on y_test : 103681.62200532641
R2 score for predicted values on y_test : 0.7300931172900562


In [96]:
# Base learners of the stack as (name, estimator) pairs. Each name now matches
# the model it labels; previously the linear regression was called 'svm' and
# the random forest 'lr'. New names were chosen instead of reusing 'lr', so
# any stale lookup by the old names fails loudly rather than silently
# returning a different model.
estimators = [
    ('boost', optimal_catboost_regressor),
    ('linear', LinearRegression()),
    ('forest', optimal_randomForestRegressor),
]

# The decision tree acts as the meta-model that combines the base learners'
# predictions.
stackingRegressor = StackingRegressor(estimators=estimators, final_estimator=optimal_dt)

stackingRegressor.fit(X_train, y_train)

y_pred = stackingRegressor.predict(X_test)

display_metrics(y_test, y_pred)

MSE for predicted values on y_test : 4934263034.8717
MAE for predicted values on y_test : 49974.29787948697
RMSE for predicted values on y_test : 70244.30962627294
R2 score for predicted values on y_test : 0.876111016112909


In [97]:
# Score the stacked model on the data it was trained on, to compare against
# its hold-out metrics.
#
# The stack fitted in the previous cell is reused: rebuilding it here retrained
# every base learner and the meta-model just to score the training split, and
# because the random forest inside it is unseeded the refit was not guaranteed
# to match the model whose test metrics were just reported.
#
# NOTE: display_metrics labels its lines "y_test"; the numbers printed by this
# cell describe the training split.
y_pred = stackingRegressor.predict(X_train)

display_metrics(y_train, y_pred)

MSE for predicted values on y_test : 3875436761.8529744
MAE for predicted values on y_test : 44701.726579527276
RMSE for predicted values on y_test : 62253.00604672014
R2 score for predicted values on y_test : 0.9000651192289684


# Итоговые модели

In [98]:
# Bare expression: the notebook renders the training feature matrix.
X_train

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
4013,3,2,2220,10275,2,0,0,3,9,1640,580,1980,0,98027,47.5304,-122.055,2300,9975
8156,3,1,1120,8443,1,0,0,3,7,1120,0,1953,0,98133,47.7715,-122.336,1450,8433
1043,4,2,2963,5797,2,0,0,3,9,2963,0,2006,0,98030,47.3831,-122.185,2665,6119
4800,4,2,3190,7869,2,0,2,3,9,3190,0,2001,0,98065,47.5317,-121.866,2630,6739
9189,3,1,910,11117,1,0,0,3,7,910,0,1955,0,98003,47.3432,-122.309,1490,8416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,3,1,940,7980,1,0,0,3,6,940,0,1961,0,98146,47.5107,-122.345,1050,7980
11964,3,1,1190,4500,1,0,0,3,6,1190,0,1922,2012,98116,47.5576,-122.388,1820,4500
5390,2,1,1530,6450,1,0,0,4,7,1530,0,1920,0,98117,47.6833,-122.398,1530,5000
860,2,1,1500,8645,1,0,0,4,7,1500,0,1963,0,98042,47.3643,-122.115,1220,8645


In [99]:
import os

# Directory the serialized models are written to. The load cell that follows
# reads './models/decision_tree.pkl', so the artifacts are saved there instead
# of the working directory; the folder is created on first run.
# NOTE(review): catboost.cbm is written to the same folder on the assumption
# that all model files are consumed from one place - confirm against the
# code that loads it.
MODELS_DIR = './models'
os.makedirs(MODELS_DIR, exist_ok=True)

# --- Final models: same hyperparameters as the exploratory cells above, -----
# --- each trained on the training split. ------------------------------------

final_classic = DecisionTreeRegressor(
    max_depth=9,
    max_features=18,
    min_samples_leaf=7,
    min_samples_split=27,
    max_leaf_nodes=80,
    min_impurity_decrease=0.001,
    ccp_alpha=0.05,
)
final_classic.fit(X_train, y_train)

final_gradient_boost = GradientBoostingRegressor(
    max_depth=5,
    max_leaf_nodes=160,
    max_features=6,
    n_estimators=150,
    ccp_alpha=50,
    learning_rate=0.027351263125863064,
    min_samples_split=12,
    min_samples_leaf=16,
)
final_gradient_boost.fit(X_train, y_train)

final_boost_catboost = CatBoostRegressor(
    iterations=400,
    learning_rate=0.07518469735486523,
    depth=6,
    l2_leaf_reg=10,
    subsample=0.8,
    colsample_bylevel=0.7599054732685009,
    verbose=False,
)
final_boost_catboost.fit(X_train, y_train)

final_bagging = RandomForestRegressor(
    max_depth=15,
    max_leaf_nodes=100,
    max_features=6,
    n_estimators=200,
    ccp_alpha=70,
    min_samples_split=16,
    min_samples_leaf=3,
)
final_bagging.fit(X_train, y_train)

# The stack is built from this cell's own estimators, so the cell no longer
# depends on the exploratory optimal_* objects. StackingRegressor fits clones
# of the estimators it receives, so only their hyperparameters matter, and
# those are the same as before. Each name now matches the model it labels
# (previously the linear regression was 'svm' and the random forest 'lr').
final_stacking = StackingRegressor(
    estimators=[
        ('boost', final_boost_catboost),
        ('linear', LinearRegression()),
        ('forest', final_bagging),
    ],
    final_estimator=final_classic,
)
final_stacking.fit(X_train, y_train)

# scikit-learn models are pickled, keyed by target file name.
models = {
    "decision_tree.pkl": final_classic,
    "gradient_boost.pkl": final_gradient_boost,
    "random_forest.pkl": final_bagging,
    "stacking.pkl": final_stacking,
}

for filename, model in models.items():
    with open(os.path.join(MODELS_DIR, filename), "wb") as file:
        pickle.dump(model, file)

# CatBoost is stored with its own save_model() rather than pickle.
final_boost_catboost.save_model(os.path.join(MODELS_DIR, "catboost.cbm"))

In [101]:
# Read the pickled decision tree back from the models directory.
tree_path = "./models/decision_tree.pkl"
with open(tree_path, "rb") as tree_file:
    loaded_tree = pickle.load(tree_file)

# Преобразование Датасета!

Удаляем столбец с датой

In [None]:
# Disabled one-off preprocessing step, kept for reference: it selects the
# FEATURES columns from df and writes them to 'kc_house_data.csv' without the
# index column.
# FEATURES = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
#                  'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']

# new_df = df[FEATURES]
# new_df.to_csv('kc_house_data.csv', index=False)

**ЗАМЕТКА:** сделать опцию ручного ввода данных, организовать вкладки со страницами.

In [2]:
import pandas as pd

# Load the CSV from the working directory into its own frame.
kc_house = pd.read_csv(filepath_or_buffer='kc_house_data.csv')

# Bare expression at the end of the cell so the notebook renders the frame.
kc_house

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1,1180,5650,1,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2,2570,7242,2,0,0,3,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,180000.0,2,1,770,10000,1,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3,1960,5000,1,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2,1680,8080,1,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17645,360000.0,3,2,1530,1131,3,0,0,3,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
17646,400000.0,4,2,2310,5813,2,0,0,3,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
17647,402101.0,2,0,1020,1350,2,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
17648,400000.0,3,2,1600,2388,2,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287
