In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



House Prices: Regression - written by Miky

1. Data Analysis

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Load the training CSV; keep the raw frame untouched and do all later edits on the copy.
house_df_org = pd.read_csv('/kaggle/input/aiffel-ds-3-house-prices/train.csv')
hs_df = house_df_org.copy()
hs_df.head()


In [None]:
# Column names, non-null counts and dtypes of the working frame.
hs_df.info()

In [None]:
# Summary statistics of the working frame's columns.
hs_df.describe()

2. Data Preprocessing

In [None]:
# Null Data: report the frame's shape, its dtype mix, and every column that has missing values.

print('Dataset Shape:',hs_df.shape)

print('\n Featrues Types \n',hs_df.dtypes.value_counts())

# Per-column null counts; only columns with at least one null are printed, largest count first.
isnull_series = hs_df.isnull().sum()
print("\nNull Coulumn's Counts:\n", isnull_series[isnull_series>0].sort_values(ascending=False))

In [None]:
# Remove six columns from hs_df in place instead of imputing them.
# NOTE(review): presumably chosen for their high null counts in the report above — confirm.
hs_df.drop(['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu'], axis=1, inplace=True)

In [None]:
# Outliers of Feature Data
# Scatter GrLivArea against SalePrice so unusual points can be spotted visually.

plt.scatter(x=hs_df['GrLivArea'], y = hs_df['SalePrice'])
plt.ylabel('SalePrice',fontsize=15)
plt.xlabel('GrLivArea',fontsize=15)
plt.show()

In [None]:
# Flag rows with GrLivArea above 4000 whose SalePrice is below 500000.
# SalePrice is still on its original scale here; the log transform is applied in a later cell.
cond1 = hs_df['GrLivArea'] > 4000
cond2 = hs_df['SalePrice'] < 500000
outlier_index = hs_df[cond1&cond2].index

print('Outliers Record Index:', outlier_index.values)
print('Before Dropping Outliers, hs_df shape:', hs_df.shape)

In [None]:
# Remove the flagged rows from hs_df in place and show the resulting shape.
hs_df.drop(outlier_index, axis=0, inplace=True)
print('After Dropping Outliers, hs_df shape',hs_df.shape )

In [None]:
# Visualize the target (SalePrice) distribution --> it needs a transformation toward a normal distribution

plt.title('Original Sale Price Historgram')
plt.xticks(rotation=45)
sns.histplot(hs_df['SalePrice'], kde=True)
# plt.show must be called; the bare attribute only echoes the function object as cell output.
plt.show()


In [None]:
# Log Transformation Visualization --> np.log1p() / expm1()
# Plot the histogram of log1p(SalePrice); hs_df itself is not modified in this cell.

plt.title('Log Transformed Sale Price Historgram')
log_SalePrice = np.log1p(hs_df['SalePrice'])
sns.histplot(log_SalePrice, kde=True)
plt.show()

In [None]:
# Log Transformation of Target Dataset
# Keep a handle on the untransformed prices, then overwrite the column with log1p(SalePrice).
# The column is read and overwritten in place, so re-running this cell applies log1p a second time.

original_SalePrice = hs_df['SalePrice']
hs_df['SalePrice'] = np.log1p(hs_df['SalePrice'])


In [None]:
# Log Transformation of Feature Dataset 

from scipy.stats import skew

# Compute the skewness of every non-object column and keep those whose skew exceeds 1.
# SalePrice is still a column of hs_df at this point, so it is part of this computation.
features_index = hs_df.dtypes[hs_df.dtypes != 'object'].index
skew_features = hs_df[features_index].apply(lambda x : skew(x))
skew_features_top = skew_features[skew_features>1]
print(skew_features_top.sort_values(ascending=False))

In [None]:
# Overwrite the highly skewed columns with their log1p values.
# The columns are overwritten in place, so re-running this cell transforms them again.
hs_df[skew_features_top.index]= np.log1p(hs_df[skew_features_top.index])

In [None]:
# One-hot encoding: build a new frame (hs_df is left untouched) with dummy columns from pd.get_dummies.

hs_df_ohe = pd.get_dummies(hs_df)

In [None]:
# Fill Null Data: replace remaining nulls with the mean of their own column, in place.

hs_df_ohe.fillna(hs_df_ohe.mean(), inplace=True)

In [None]:
# Sanity check: list the dtypes of any columns that still contain nulls after the fill.
null_column_count = hs_df_ohe.isnull().sum()[hs_df_ohe.isnull().sum()>0]

print("## Null Feature Type :\n", hs_df_ohe.dtypes[null_column_count.index])

In [None]:
# 3. Linear (Ridge/Lasso) Model Fit/Predict/Evaluation

In [None]:
# Defining RMSE Evaluation Function  

def get_rmse(model):
    """Print and return a fitted model's RMSE on the held-out split.

    The model predicts on the notebook-level ``X_test`` and is scored against
    ``y_test``; both must exist before this function is called.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    The unrounded RMSE (the printed value is rounded to 3 decimals).
    """
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(model.__class__.__name__, 'Log Transfromed RMSE:', np.round(rmse,3))
    return rmse

def get_rmses(models):
    """Run ``get_rmse`` on each model and return the RMSEs in the same order as ``models``."""
    return [get_rmse(model) for model in models]

In [None]:
# Linear Regression Model Fit/Predict/Evaluation

from sklearn.linear_model import LinearRegression, Ridge, Lasso 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Separate the target column from the encoded features, then hold out 20% of the rows for testing.
y_target = hs_df_ohe['SalePrice']
X_features = hs_df_ohe.drop('SalePrice', axis=1, inplace=False)
X_train,X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=156)

# Fit the three linear baselines with their default hyperparameters.
lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)
lasso_reg = Lasso()
lasso_reg.fit(X_train, y_train)

# Print and collect each model's RMSE on the held-out split.
models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)

In [None]:
# Hyperparameter Optimization

from sklearn.model_selection import GridSearchCV

def print_best_params(model, params):
    """Grid-search ``params`` for ``model`` with 5-fold CV and print the best result.

    The search is fitted on the notebook-level ``X_features`` / ``y_target`` and
    scored with negative MSE; the best score is converted to an RMSE for printing.
    Nothing is returned.
    """
    search = GridSearchCV(
        model,
        param_grid=params,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    search.fit(X_features, y_target)

    # best_score_ is a negative MSE, so flip the sign before taking the root.
    best_rmse = np.sqrt(-1 * search.best_score_)
    model_name = model.__class__.__name__
    message = '{0} 5CV Optimalized RMSE:{1}, Optimalized alpha:{2}'.format(
        model_name, np.round(best_rmse, 4), search.best_params_
    )
    print(message)

# Candidate alpha values searched for each regularized model.
ridge_params = {'alpha': [0.05, 0.1, 1, 5, 8, 10, 12,15, 20]}
lasso_params = {'alpha':[0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5, 1, 5, 10]}

# Run the 5-fold grid search for each model and print its best RMSE and alpha.
print_best_params(ridge_reg, ridge_params)
print_best_params(lasso_reg, lasso_params)

In [None]:
# Modeling with Optimized Hyperparameters
# NOTE(review): alpha=15 and alpha=0.001 are hard-coded; presumably copied from the grid-search
# output of the previous cell — confirm they still match after any data change.

lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)
ridge_reg = Ridge(alpha=15)
ridge_reg.fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)

# Re-evaluate all three models on the held-out split.
models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)

4. Tree Model Fit/Predict/Evaluation

In [None]:
from xgboost import XGBRegressor

# The grid holds a single candidate, so this call only reports the 5-fold CV RMSE of this configuration.
xgb_params = {'n_estimators':[1000]}
xgb_reg = XGBRegressor(n_estimators=1000, learning_rate=0.05, colsample_bytree=0.5, subsample=0.8)
print_best_params(xgb_reg, xgb_params)

In [None]:
# Fit XGBoost on the training split and keep its hold-out predictions for the later blend.
xgb_reg.fit(X_train, y_train)
xgb_pred =xgb_reg.predict(X_test)

# Print the hold-out RMSE of the fitted model.
model = xgb_reg
get_rmse(model)

In [None]:
from lightgbm import LGBMRegressor

# The grid holds a single candidate, so the call below only reports the 5-fold CV RMSE of this configuration.
lgbm_params = {'n_estimators':[1000]}
# reg_lambda (L2 regularization) was previously misspelled, so the regularization was never applied.
# NOTE(review): learning_rate=0.5 is 10x the rate used for the XGBoost model above — confirm it is intentional.
lgbm_reg = LGBMRegressor(n_estimators=1000, learning_rate=0.5, num_leaves=4, subsample=0.6, colsample_bytree=0.4, reg_lambda=10, n_jobs=1)

print_best_params(lgbm_reg, lgbm_params)

In [None]:
# Fit LightGBM on the training split and keep its hold-out predictions for the later blend.
lgbm_reg.fit(X_train, y_train)
lgbm_pred =lgbm_reg.predict(X_test)

# Print the hold-out RMSE of the fitted model.
model = lgbm_reg
get_rmse(model)

5. Model Mixing through Ensemble : Final Prediction with Lasso & Ridge Models

In [None]:
def get_rmse_pred(preds):
    """Print the RMSE of each named prediction array against the notebook-level ``y_test``.

    Parameters
    ----------
    preds : mapping of a display label to predictions for the held-out split.
    """
    for label, predicted in preds.items():
        rmse = np.sqrt(mean_squared_error(y_test, predicted))
        print("{0} Model's RMSE:{1}".format(label, rmse))

In [None]:
# Refit Ridge and Lasso with fixed alphas on the training split.
ridge_reg = Ridge(alpha=15)
ridge_reg.fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)

# Hold-out predictions used by the blend in the next cell.
ridge_pred = ridge_reg.predict(X_test)
lasso_pred= lasso_reg.predict(X_test)

In [None]:
# Weighted blend of the two linear models: 40% Ridge, 60% Lasso.
pred = 0.4*ridge_pred + 0.6*lasso_pred
preds ={'Final Mixed':pred,
       'Ridge': ridge_pred,
       'Lasso':lasso_pred}

# Print the RMSE of the blend next to each individual model.
get_rmse_pred(preds) 

6. Model Mixing through Ensemble : Final Prediction with XGBM & LGBM Models

In [None]:
# Rebuild both tree models; reg_lambda (L2 regularization) was previously misspelled and therefore never applied.
xgb_reg = XGBRegressor(n_estimators=1000, learning_rate=0.05, colsample_bytree=0.5, subsample=0.8)
lgbm_reg = LGBMRegressor(n_estimators=1000, learning_rate=0.5, num_leaves=4, subsample=0.6, colsample_bytree=0.4, reg_lambda=10, n_jobs=1)

xgb_reg.fit(X_train, y_train)
xgb_pred =xgb_reg.predict(X_test)

lgbm_reg.fit(X_train, y_train)
lgbm_pred =lgbm_reg.predict(X_test)

# Weighted blend of the two tree models: 60% XGBoost, 40% LightGBM.
pred = 0.6*xgb_pred + 0.4*lgbm_pred
# 'Final Mixed' must hold the blended predictions; it previously pointed at lgbm_pred,
# so the reported "mixed" RMSE was just the LightGBM score.
preds ={'Final Mixed':pred,
       'XGBM': xgb_pred,
       'LGBM':lgbm_pred}

get_rmse_pred(preds) 


Final Prediction Result 

**Mixed Model with Ridge & Lasso Models: 0.11554510823948153**


In [None]:
# Build the submission frame and write it to submission.csv without the index column.
# NOTE(review): `test` and `y_pred` are not defined in any cell visible here, so this cell
# raises NameError on a fresh Restart & Run All unless they are created elsewhere — confirm.
# NOTE(review): verify the 'id' / 'Class' column names against the competition's sample
# submission, and whether predictions must be passed through np.expm1 to undo the log1p
# transform applied to SalePrice during training.
submission = pd.DataFrame({'id': test['id'], 'Class': y_pred})
submission.to_csv('submission.csv', index=False)

submission