In [1]:
import pandas_profiling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBClassifier
from hyperopt import STATUS_OK

def scaler_transform_func(var):
    scaler = StandardScaler()
    scaled_var = scaler.fit_transform(var)
    return scaled_var

def log_transform_func(var):
    log_val = np.log1p(var)
    return log_val

def get_model_predict(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    
    print("###",model.__class__.__name__,"###")
    evaluate_regr(y_test,pred)
    
    def rmsle(y,pred):
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    squared_error = (log_y - log_pred) ** 2
    rmsle = np.sqrt(np.mean(squared_error))
    return rmsle

def rmse(y,pred):
    return np.sqrt(mean_squared_error(y,pred))

def evaluate_regr(y,pred):
    rmsle_val = rmsle(y,pred)
    rmse_val = rmse(y,pred)
    
    mae_val = mean_absolute_error(y,pred)
    print("RMSLE: {0:.3f}, RMSE: {1:.3f}, MAE: {2:.3f}".format(rmsle_val,rmse_val,mae_val))
    
def visualize_coefficient(models):
    # 3개 회귀 모델의 시각화를 위해 3개의 컬럼을 가지는 subplot 생성
    fig, axs = plt.subplots(figsize=(24,10),nrows=1, ncols=3)
    fig.tight_layout() 
    # 입력인자로 받은 list객체인 models에서 차례로 model을 추출하여 회귀 계수 시각화. 
    for i_num, model in enumerate(models):
        # 상위 10개, 하위 10개 회귀 계수를 구하고, 이를 판다스 concat으로 결합. 
        coef_high, coef_low = get_top_bottom_coef(model)
        coef_concat = pd.concat( [coef_high , coef_low] )
        # 순차적으로 ax subplot에 barchar로 표현. 한 화면에 표현하기 위해 tick label 위치와 font 크기 조정. 
        axs[i_num].set_title(model.__class__.__name__+' Coeffiecents', size=25)
        axs[i_num].tick_params(axis="y",direction="in", pad=-120)
        for label in (axs[i_num].get_xticklabels() + axs[i_num].get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=coef_concat.values, y=coef_concat.index , ax=axs[i_num])
        
def get_top_bottom_coef(model):
    # coef_ 속성을 기반으로 Series 객체를 생성. index는 컬럼명. 
    coef = pd.Series(model.coef_, index=X_features.columns)
    
    # + 상위 10개 , - 하위 10개 coefficient 추출하여 반환.
    coef_high = coef.sort_values(ascending=False).head(10)
    coef_low = coef.sort_values(ascending=False).tail(10)
    return coef_high, coef_low

def get_avg_rmse_cv(models):
    for model in models:
        # 분할하지 않고 전체 데이터로 cross_val_score( ) 수행. 모델별 CV RMSE값과 평균 RMSE 출력
        rmse_list = np.sqrt(-cross_val_score(model, X_features, y_target,
                                             scoring="neg_mean_squared_error", cv = 5))
        rmse_avg = np.mean(rmse_list)
        print('\n{0} CV RMSE 값 리스트: {1}'.format( model.__class__.__name__, np.round(rmse_list, 3)))
        print('{0} CV 평균 RMSE 값: {1}'.format( model.__class__.__name__, np.round(rmse_avg, 3)))
        
def print_best_params(model, params):
    grid_model = GridSearchCV(model, param_grid=params, 
                              scoring='neg_mean_squared_error', cv=5)
    grid_model.fit(X_features, y_target)
    rmse = np.sqrt(-1* grid_model.best_score_)
    print('{0} 5 CV 시 최적 평균 RMSE 값: {1}, 최적 alpha:{2}'.format(model.__class__.__name__,
                                        np.round(rmse, 4), grid_model.best_params_))
    return grid_model.best_estimator_

def preprocessing(df):
    house_df.drop(['Id','PoolQC','Fence','MiscFeature','Alley','MasVnrType'],axis=1,inplace=True)
    
    house_df['LotFrontage'].fillna(house_df['LotFrontage'].mean(),inplace=True)
    house_df['MasVnrArea'].fillna(house_df['MasVnrArea'].mean(),inplace=True)
    house_df['GarageYrBlt'].fillna(house_df['GarageYrBlt'].mean(),inplace=True)
    
    house_df['BsmtQual'].fillna('TA',inplace=True)
    house_df['BsmtCond'].fillna('TA',inplace=True)
    house_df['BsmtExposure'].fillna('No',inplace=True)
    house_df['BsmtFinType1'].fillna('Unf',inplace=True)
    house_df['BsmtFinType2'].fillna('Unf',inplace=True)
    house_df['Electrical'].fillna('SBrkr',inplace=True)
    house_df['FireplaceQu'].fillna('TA',inplace=True)
    house_df['GarageType'].fillna('Attchd',inplace=True)
    house_df['GarageFinish'].fillna('Unf',inplace=True)
    house_df['GarageQual'].fillna('TA',inplace=True)
    house_df['GarageCond'].fillna('TA',inplace=True)

def ohe_func(df):
    category_col = []

    for col_name in house_df.columns:
        if house_df[col_name].dtype == 'object':
            category_col.append(col_name)
    category_col.append('YearBuilt')
    category_col.append('YearRemodAdd')
    df_ohe = pd.get_dummies(df,columns=category_col)
    return df_ohe

IndentationError: expected an indented block (Temp/ipykernel_23548/4202384954.py, line 34)