In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from catboost import Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor



In [4]:
# Directory holding the competition CSV files. Defined once so the input
# location can be changed in a single place instead of in four literals.
DATA_DIR = '/kaggle/input/sber-predict5'

# `timestamp` is parsed as a datetime in every file that is read with it.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv', parse_dates=['timestamp'])
df_test = pd.read_csv(f'{DATA_DIR}/test.csv', parse_dates=['timestamp'])
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
macro = pd.read_csv(f'{DATA_DIR}/macro.csv', parse_dates=['timestamp'])

In [42]:
# Row/column counts of the training and test frames as loaded.
df_train.shape, df_test.shape

((30471, 292), (7662, 291))

In [5]:
# Turn the parsed datetime column into a plain int64 column in both frames so it
# is treated as an ordinary numeric column by the later cells.
for frame in (df_train, df_test):
    frame['timestamp'] = frame['timestamp'].astype('int64')

In [46]:
# Print the index range, column count, dtype tally and memory use of each frame.
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30471 entries, 0 to 30470
Columns: 292 entries, id to price_doc
dtypes: float64(119), int64(158), object(15)
memory usage: 67.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7662 entries, 0 to 7661
Columns: 291 entries, id to market_count_5000
dtypes: float64(116), int64(160), object(15)
memory usage: 17.0+ MB


In [47]:
# Count missing values per training column and order the counts from the
# largest to the smallest.
train_nan_counts = df_train.isnull().sum()
missing_sum1 = train_nan_counts.sort_values(ascending=False)

# Show the 40 columns with the most missing values.
missing_sum1.head(40)

hospital_beds_raion                      14441
build_year                               13605
state                                    13559
cafe_sum_500_min_price_avg               13281
cafe_sum_500_max_price_avg               13281
cafe_avg_price_500                       13281
kitch_sq                                  9572
material                                  9572
max_floor                                 9572
num_room                                  9572
preschool_quota                           6688
school_quota                              6685
cafe_sum_1000_min_price_avg               6524
cafe_sum_1000_max_price_avg               6524
cafe_avg_price_1000                       6524
life_sq                                   6383
build_count_foam                          4991
build_count_slag                          4991
build_count_panel                         4991
build_count_monolith                      4991
build_count_1971-1995                     4991
build_count_b

In [48]:
# Count missing values per test column and order the counts from the largest
# to the smallest.
test_nan_counts = df_test.isnull().sum()
missing_sum2 = test_nan_counts.sort_values(ascending=False)

# Show the 40 columns with the most missing values.
missing_sum2.head(40)

hospital_beds_raion                      3418
cafe_sum_500_min_price_avg               3159
cafe_sum_500_max_price_avg               3159
cafe_avg_price_500                       3159
preschool_quota                          1596
school_quota                             1595
cafe_sum_1000_min_price_avg              1222
cafe_avg_price_1000                      1222
cafe_sum_1000_max_price_avg              1222
build_count_slag                         1218
build_count_mix                          1218
raion_build_count_with_builddate_info    1218
build_count_before_1920                  1218
build_count_panel                        1218
build_count_1921-1945                    1218
build_count_foam                         1218
build_count_1946-1970                    1218
build_count_1971-1995                    1218
build_count_after_1995                   1218
build_count_monolith                     1218
build_count_frame                        1218
build_count_wood                  

In [6]:
# The 20 training columns with the most missing values (descending counts).
train_nan_sorted = df_train.isnull().sum().sort_values(ascending=False)
missing_remained1 = train_nan_sorted.head(20)
missing_remained1

hospital_beds_raion            14441
build_year                     13605
state                          13559
cafe_sum_500_min_price_avg     13281
cafe_sum_500_max_price_avg     13281
cafe_avg_price_500             13281
kitch_sq                        9572
material                        9572
max_floor                       9572
num_room                        9572
preschool_quota                 6688
school_quota                    6685
cafe_sum_1000_min_price_avg     6524
cafe_sum_1000_max_price_avg     6524
cafe_avg_price_1000             6524
life_sq                         6383
build_count_foam                4991
build_count_slag                4991
build_count_panel               4991
build_count_monolith            4991
dtype: int64

In [7]:
# The 20 test columns with the most missing values (descending counts).
test_nan_sorted = df_test.isnull().sum().sort_values(ascending=False)
missing_remained2 = test_nan_sorted.head(20)
missing_remained2

hospital_beds_raion                      3418
cafe_sum_500_min_price_avg               3159
cafe_sum_500_max_price_avg               3159
cafe_avg_price_500                       3159
preschool_quota                          1596
school_quota                             1595
cafe_sum_1000_min_price_avg              1222
cafe_avg_price_1000                      1222
cafe_sum_1000_max_price_avg              1222
build_count_slag                         1218
build_count_mix                          1218
raion_build_count_with_builddate_info    1218
build_count_before_1920                  1218
build_count_panel                        1218
build_count_1921-1945                    1218
build_count_foam                         1218
build_count_1946-1970                    1218
build_count_1971-1995                    1218
build_count_after_1995                   1218
build_count_monolith                     1218
dtype: int64

In [8]:
# Replace missing values in the training frame: 0 for numeric columns and the
# literal string "None" for object columns.
train_numeric_cols = df_train.select_dtypes("number").columns
df_train[train_numeric_cols] = df_train[train_numeric_cols].fillna(0)

train_object_cols = df_train.select_dtypes("object").columns
df_train[train_object_cols] = df_train[train_object_cols].fillna("None")

In [9]:
# Replace missing values in the test frame: 0 for numeric columns and the
# literal string "None" for object columns.
test_numeric_cols = df_test.select_dtypes("number").columns
df_test[test_numeric_cols] = df_test[test_numeric_cols].fillna(0)

test_object_cols = df_test.select_dtypes("object").columns
df_test[test_object_cols] = df_test[test_object_cols].fillna("None")

In [10]:
# Sanity check: the ten largest per-column NaN counts left in the training frame.
df_train.isnull().sum().sort_values(ascending=False).head(10)

price_doc                      0
id                             0
timestamp                      0
full_sq                        0
cafe_sum_3000_min_price_avg    0
cafe_sum_3000_max_price_avg    0
cafe_avg_price_3000            0
cafe_count_3000_na_price       0
cafe_count_3000_price_500      0
cafe_count_3000_price_1000     0
dtype: int64

In [11]:
# Sanity check: the ten largest per-column NaN counts left in the test frame.
df_test.isnull().sum().sort_values(ascending=False).head(10)

market_count_5000              0
id                             0
timestamp                      0
cafe_count_3000                0
cafe_sum_3000_min_price_avg    0
cafe_sum_3000_max_price_avg    0
cafe_avg_price_3000            0
cafe_count_3000_na_price       0
cafe_count_3000_price_500      0
cafe_count_3000_price_1000     0
dtype: int64

In [12]:
# One-hot encode the training frame with pandas' default get_dummies settings;
# the frame is rebound to the encoded result.
df_train = pd.get_dummies(df_train)
# Peek at the target column after encoding.
df_train['price_doc'].head()

0     5850000
1     6000000
2     5700000
3    13100000
4    16331452
Name: price_doc, dtype: int64

In [73]:
# Column count and dtype tally of the encoded training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30471 entries, 0 to 30470
Columns: 454 entries, id to ecology_satisfactory
dtypes: bool(177), float64(119), int64(158)
memory usage: 69.5 MB


In [14]:
# One-hot encode the test frame with pandas' default get_dummies settings.
# NOTE(review): train and test are encoded independently, so a category present
# in only one of them would yield different dummy columns — confirm the two
# column sets match before predicting.
df_test = pd.get_dummies(df_test)

In [75]:
# Column count and dtype tally of the encoded test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7662 entries, 0 to 7661
Columns: 453 entries, id to ecology_satisfactory
dtypes: bool(177), float64(116), int64(160)
memory usage: 17.4 MB


In [15]:
# Number of target values available in the training frame.
df_train['price_doc'].shape

(30471,)

In [16]:
# Target: the `price_doc` column, modelled on a natural-log scale.
target = df_train['price_doc']
y = np.log(target)

# Features: every other column of the encoded training frame.
X = df_train.drop(columns='price_doc')

In [77]:
# First 40 values of the log-transformed target.
y.head(40)

0     15.581952
1     15.607270
2     15.555977
3     16.388123
4     16.608603
5     16.023785
6     15.520259
7     14.508658
8     15.483217
9     14.508658
10    15.352378
11    15.384126
12    15.444751
13    15.464169
14    15.424948
15    14.430696
16    15.656060
17    15.590463
18    15.882373
19    15.464169
20    15.919645
21    15.464169
22    15.648092
23    15.564710
24    15.607270
25    13.864301
26    15.424948
27    15.363073
28    15.926257
29    15.590463
30    15.640060
31    16.722412
32    15.404746
33    15.297115
34    15.176487
35    15.394489
36    15.555977
37    15.274126
38    15.319588
39    14.508658
Name: price_doc, dtype: float64

In [17]:
# Correlation of every numeric column with the raw (not log-transformed)
# `price_doc`, sorted from the most positive to the most negative.
df_train.corr(numeric_only=True)['price_doc'].sort_values(ascending=False).to_frame()

Unnamed: 0,price_doc
price_doc,1.000000
full_sq,0.341840
num_room,0.335201
sport_count_5000,0.294864
sport_count_3000,0.290651
...,...
ttk_km,-0.272620
bulvar_ring_km,-0.279158
kremlin_km,-0.279249
sadovoe_km,-0.283622


In [18]:
# Re-check the structure of the training frame before splitting.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30471 entries, 0 to 30470
Columns: 454 entries, id to ecology_satisfactory
dtypes: bool(177), float64(119), int64(158)
memory usage: 69.5 MB


In [19]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable.
split_options = dict(train_size=0.8, test_size=0.2, random_state=2)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [21]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between `y` and `y_pred`.

    Args:
        y: Reference values.
        y_pred: Predicted values, in the same order as `y`.

    Returns:
        The square root of scikit-learn's `mean_squared_error(y, y_pred)`.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [22]:
# CatBoost regressor: fit on the training split, score on the validation split.
cat_params = dict(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=5,
    verbose=False,
    model_size_reg=0.7,
    subsample=0.44,
    random_strength=1.5,
    loss_function='RMSE',
    random_state=2,
)
cat_model = CatBoostRegressor(**cat_params)
cat_model.fit(X_train, y_train)

cat_pred = cat_model.predict(X_valid)
# RMSE is computed on the log-scale target, since y is np.log(price_doc).
cat_score = rmse(y_valid, cat_pred)
cat_score

0.441503641474752

In [23]:
# scikit-learn gradient boosting: fit on the training split, score on validation.
gbr_params = dict(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=5,
    min_samples_split=2,
    max_features=35,
    random_state=2,
)
gbr_model = GradientBoostingRegressor(**gbr_params)
gbr_model.fit(X_train, y_train)

gbr_pred = gbr_model.predict(X_valid)
# RMSE is computed on the log-scale target, since y is np.log(price_doc).
gbr_score = rmse(y_valid, gbr_pred)
gbr_score

0.4438888402741703

In [24]:
# LightGBM regressor: fit on the training split, score on the validation split.
lgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)
lgbm_model = LGBMRegressor(**lgbm_params)
lgbm_model.fit(X_train, y_train)

lgbm_pred = lgbm_model.predict(X_valid)
# RMSE is computed on the log-scale target, since y is np.log(price_doc).
lgbm_score = rmse(y_valid, lgbm_pred)
lgbm_score

0.44300771838510766

In [86]:
# Ordinary least-squares baseline with default settings.
ln_model = LinearRegression()
ln_model.fit(X_train, y_train)

ln_pred = ln_model.predict(X_valid)
# RMSE is computed on the log-scale target, since y is np.log(price_doc).
ln_score = rmse(y_valid, ln_pred)
ln_score

0.9073217369944856

In [87]:
# Single decision tree: fit on the training split, score on the validation split.
dt_params = dict(
    criterion='squared_error',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=3,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    splitter="best",
    random_state=2,
)
dt_model = DecisionTreeRegressor(**dt_params)
dt_model.fit(X_train, y_train)

dt_pred = dt_model.predict(X_valid)
# RMSE is computed on the log-scale target, since y is np.log(price_doc).
dt_score = rmse(y_valid, dt_pred)
dt_score

0.5970534506284373

In [88]:
# XGBoost regressor: fit on the training split, score on the validation split.
xgb_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict(X_valid)
# RMSE is computed on the log-scale target, since y is np.log(price_doc).
xgb_score = rmse(y_valid, xgb_pred)
xgb_score

0.4447666962977773

In [92]:
# Validation RMSE of every fitted model, one "LABEL : score" pair per line.
# Each pair is printed with its own print() call: passing "\n" as a separate
# argument to a single print() makes the default separator add a stray space
# at the end of each line and at the start of the next one.
model_scores = {
    "CAT": cat_score,
    "GBR": gbr_score,
    "XGB": xgb_score,
    "LGBM": lgbm_score,
    "LN": ln_score,
    "DT": dt_score,
}
for label, score in model_scores.items():
    print(f"{label} : {score}")

CAT : 0.441503641474752 
 GBR : 0.4438888402741703 
 XGB : 0.4447666962977773 
 LGBM : 0.44300771838510766 
 LN : 0.9073217369944856 
 DT : 0.5970534506284373


In [98]:
# Compare the encoded frames: the training frame carries one extra column,
# consistent with `price_doc` being present only there.
df_test.shape, df_train.shape

((7662, 453), (30471, 454))

In [None]:
# The models were fitted on X, which does not contain `price_doc`, so the test
# frame must not get a placeholder target column. Instead, restrict the test
# frame to exactly the training feature columns, in the same order. A dummy
# column missing from the test data is created and filled with 0, i.e. the
# category is absent. Re-running this cell leaves the frame unchanged.
df_test = df_test.reindex(columns=X.columns, fill_value=0)

# Show only the first rows rather than dumping the whole frame.
df_test.head()

In [105]:
# Predict on exactly the feature columns (and column order) the model was
# fitted on, regardless of any extra columns df_test may carry at this point.
# The model outputs log prices (y = np.log(price_doc)), so np.exp maps the
# predictions back to the price scale.
test_features = df_test.reindex(columns=X.columns, fill_value=0)
cat_predictions = np.exp(cat_model.predict(test_features))

In [111]:
# Intentionally no code here: `output` is first created in the following cell,
# so shifting its ids at this point fails with a NameError on a fresh kernel
# (Restart & Run All). The id offset is applied once, after `output` exists.

In [113]:
# Assemble the submission table and write it out. At this point `Id` holds the
# row labels of df_test; a later cell shifts them and rewrites the same file.
submission_columns = {'Id': df_test.index, 'price_doc': cat_predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('cat_submissionq.csv', index=False)
output.head()

Unnamed: 0,Id,price_doc
0,0,5333063.0
1,1,8372401.0
2,2,5317217.0
3,3,5879872.0
4,4,5230894.0


In [114]:
# Shift the row labels of df_test by a fixed offset to obtain the submission
# ids. The ids are recomputed from df_test.index instead of using `+=`, so
# re-running this cell cannot shift them a second time.
# NOTE(review): 30474 is presumably the first id of the test set — confirm
# against df_test['id'] / sample_submission.
TEST_ID_OFFSET = 30474
output['Id'] = df_test.index.to_numpy() + TEST_ID_OFFSET

In [115]:
# Check the first rows of the submission table after the id shift.
output.head()

Unnamed: 0,Id,price_doc
0,30474,5333063.0
1,30475,8372401.0
2,30476,5317217.0
3,30477,5879872.0
4,30478,5230894.0


In [116]:
# Write the submission table (with the shifted ids) to disk without the
# DataFrame index, overwriting the file written earlier under the same name.
output.to_csv('cat_submissionq.csv', index=False)