In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [2]:
df = pd.read_csv("../dataset/train.csv")
y_train = df['price']
x_train = df.drop(columns=['price'])
x_train.shape, y_train.shape

((15035, 20), (15035,))

In [27]:
df_test = pd.read_csv("../dataset/test.csv")

### 기본 선형모델(feature를 추가하지 않은)
- feature drop
    - id, date, zipcode (drop)
- OLS

In [7]:
model_ols = sm.OLS.from_formula("np.log1p(price) ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + C(waterfront) \
                    + view + condition + grade + sqft_above + sqft_basement + yr_built \
                    + yr_renovated + lat + long + sqft_living15 + sqft_lot15 - 1", df)
result = model_ols.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log1p(price)   R-squared:                       0.769
Model:                            OLS   Adj. R-squared:                  0.769
Method:                 Least Squares   F-statistic:                     3131.
Date:                Tue, 02 Apr 2019   Prob (F-statistic):               0.00
Time:                        19:13:29   Log-Likelihood:                -689.49
No. Observations:               15035   AIC:                             1413.
Df Residuals:                   15018   BIC:                             1542.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
C(waterfront)[0]   -52.4150      2.389  

- price보다 np.log1p(price)를 해주는게 R square가 높다..
- condition number가 너무 크다.

1. std err : 표준오차(standard error) ; 
2. t : (w_hat - w) / std_err = t
3. p>|t| : "w가 0이다"라는 귀무가설의 유의확률; t의 유의확률
4. F statistics : 


---
### EDA를 통해 사용해볼 features
- 47.5~47.8도 사이에 위치한 zipcode --> 중심지와 교외지역
- 부지면적(sqft_lot) 대비 거주면적(sqft_living) 비율
- 거주면적(sqft_living) 대비 지상면적(sqft_above) 비율
- 재건축 여부 (yr_renovated)
- 선거연도에 지어졌는지 유무(election_year)

In [3]:
# 먼저 독립변수들간의 상관관계를 조금 줄여보자
# sqft_above, sqft_basement drop
model_ols = sm.OLS.from_formula("np.log1p(price) ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + C(waterfront) \
                    + view + condition + grade + yr_built \
                    + yr_renovated + lat + long + sqft_living15 + sqft_lot15 - 1", df)
result = model_ols.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log1p(price)   R-squared:                       0.769
Model:                            OLS   Adj. R-squared:                  0.769
Method:                 Least Squares   F-statistic:                     3337.
Date:                Fri, 05 Apr 2019   Prob (F-statistic):               0.00
Time:                        22:33:55   Log-Likelihood:                -693.64
No. Observations:               15035   AIC:                             1419.
Df Residuals:                   15019   BIC:                             1541.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
C(waterfront)[0]   -53.9802      2.327  

- condition number가 많이 줄었다
- scaling을 해주자

In [5]:
model_ols = sm.OLS.from_formula("np.log1p(price) ~ bedrooms + bathrooms + scale(sqft_living) + scale(sqft_lot) + floors + C(waterfront) \
                    + view + condition + grade + scale(lat) + scale(long) + scale(sqft_living15) + scale(sqft_lot15) - 1", df)
result = model_ols.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log1p(price)   R-squared:                       0.752
Model:                            OLS   Adj. R-squared:                  0.751
Method:                 Least Squares   F-statistic:                     3496.
Date:                Fri, 05 Apr 2019   Prob (F-statistic):               0.00
Time:                        22:37:40   Log-Likelihood:                -1246.8
No. Observations:               15035   AIC:                             2522.
Df Residuals:                   15021   BIC:                             2628.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
C(waterfront)[0]        11.5416 

- lat/long까지 scaling해줬더니 conditional number가 급감하였다.

In [3]:
df2 = df.copy()

In [4]:
# 평당가격 (실제론 피트당 가격)
# sqft_living
df2['per_price'] = df2['price'] / df2['sqft_living']
price_per_zipcode = df2.groupby(['zipcode'])['per_price'].agg({'zipprice_mean' : 'mean', 'zipprice_std' : np.std}).reset_index()
price_per_zipcode.tail()

price_per_zipcode['zipprice_mean'].describe()

# 평당가격이 317 이상되면 중심부가 아닐까?

# merge df2 and price_per_zipcode
df2 = df2.merge(price_per_zipcode, how='left', on='zipcode')
df2.tail()

is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,per_price,zipprice_mean,zipprice_std
15030,15030,20141014T000000,610685.0,4,2.5,2520,6023,2.0,0,0,...,2014,0,98056,47.5137,-122.167,2520,6023,242.335317,215.244097,64.696581
15031,15031,20150326T000000,1007500.0,4,3.5,3510,7200,2.0,0,0,...,2009,0,98136,47.5537,-122.398,2050,6200,287.037037,336.554976,96.87564
15032,15032,20140521T000000,360000.0,3,2.5,1530,1131,3.0,0,0,...,2009,0,98103,47.6993,-122.346,1530,1509,235.294118,371.46015,97.427992
15033,15033,20150223T000000,400000.0,4,2.5,2310,5813,2.0,0,0,...,2014,0,98146,47.5107,-122.362,1830,7200,173.160173,223.377191,85.597885
15034,15034,20141015T000000,325000.0,2,0.75,1020,1076,2.0,0,0,...,2008,0,98144,47.5941,-122.299,1020,1357,318.627451,319.292642,91.391157


In [5]:
idx = df2[(df2.zipprice_mean > 317.) & (df2.lat >= 47.5) & (df2.lat < 47.8)].index
df2['center_region'] = 0
df2.loc[idx, 'center_region'] = 1
df2.center_region

0        0
1        0
2        0
3        0
4        0
5        0
6        1
7        0
8        0
9        1
10       0
11       0
12       1
13       0
14       0
15       0
16       1
17       0
18       0
19       0
20       1
21       1
22       0
23       1
24       1
25       0
26       1
27       1
28       0
29       0
        ..
15005    0
15006    0
15007    0
15008    1
15009    0
15010    1
15011    1
15012    1
15013    0
15014    1
15015    1
15016    0
15017    1
15018    0
15019    1
15020    0
15021    0
15022    0
15023    0
15024    0
15025    1
15026    0
15027    1
15028    0
15029    0
15030    0
15031    1
15032    1
15033    0
15034    1
Name: center_region, Length: 15035, dtype: int64

In [6]:
model_ols = sm.OLS.from_formula("np.log1p(price) ~ bedrooms + bathrooms + scale(sqft_living) + scale(sqft_lot) + floors + C(waterfront) \
                    + view + condition + grade + scale(lat) + scale(long) + scale(sqft_living15) + scale(sqft_lot15) + C(center_region) - 1", df2)
result = model_ols.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log1p(price)   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.826
Method:                 Least Squares   F-statistic:                     5106.
Date:                Tue, 09 Apr 2019   Prob (F-statistic):               0.00
Time:                        19:46:04   Log-Likelihood:                 1445.4
No. Observations:               15035   AIC:                            -2861.
Df Residuals:                   15020   BIC:                            -2747.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
C(waterfront)[0]         11.71

- conditional number는 큰 변동없음.(25이하까지 떨어뜨려야하는 것으로 알고 있음.)
- R square 값이 올랐다 (0.75 --> 0.82)

In [9]:
df2['is_renovated'] = df2['yr_renovated'].map(lambda x : 0 if x == 0 else 1)
df2['is_election_year'] = df2['yr_built'].map(lambda x : 1 if x % 4 == 0 else 0)

In [18]:
model_ols = sm.OLS.from_formula("np.log1p(price) ~ bedrooms + bathrooms + scale(sqft_living) + scale(sqft_lot) + floors + C(waterfront) \
                    + I(view / condition) + grade + scale(lat) + scale(long) + scale(sqft_living15) \
                    + scale(sqft_lot15) + C(center_region) + C(is_renovated) + C(is_election_year) - 1", df2)
result = model_ols.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log1p(price)   R-squared:                       0.819
Model:                            OLS   Adj. R-squared:                  0.819
Method:                 Least Squares   F-statistic:                     4531.
Date:                Tue, 09 Apr 2019   Prob (F-statistic):               0.00
Time:                        19:54:23   Log-Likelihood:                 1134.3
No. Observations:               15035   AIC:                            -2237.
Df Residuals:                   15019   BIC:                            -2115.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
C(waterfront)[0]        

In [24]:
model_ols = sm.OLS.from_formula("np.log1p(price) ~ bedrooms + bathrooms + scale(sqft_living) + I(scale(sqft_living / sqft_living15)) + scale(sqft_lot) + floors + C(waterfront) \
                    + I(view / condition) + grade + scale(lat) + scale(long) \
                    + scale(sqft_lot15) + C(center_region) + C(is_renovated) + C(is_election_year) - 1", df2)
result = model_ols.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log1p(price)   R-squared:                       0.815
Model:                            OLS   Adj. R-squared:                  0.814
Method:                 Least Squares   F-statistic:                     4401.
Date:                Tue, 09 Apr 2019   Prob (F-statistic):               0.00
Time:                        19:58:42   Log-Likelihood:                 955.22
No. Observations:               15035   AIC:                            -1878.
Df Residuals:                   15019   BIC:                            -1757.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

In [26]:
# data preprocessing
# test도 형태를 맞춰준다.
df_train = df2.drop(columns=['id','date',''])

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'per_price',
       'zipprice_mean', 'zipprice_std', 'center_region', 'is_renovated',
       'is_election_year'],
      dtype='object')

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import RidgeCV

param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.015,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4950}

y_reg = df_train['price']

#prepare fit model with cross-validation
folds = KFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

#run model
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train)):
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][train_columns], label=y_reg.iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][train_columns], label=y_reg.iloc[val_idx])#, categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    #feature importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = train_columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    #predictions
    predictions += clf.predict(df_test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits
    
cv = np.sqrt(mean_squared_error(oof, y_reg))
print(cv)