In [267]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager, rc
from sklearn.preprocessing import StandardScaler

# Register a font for matplotlib by loading it from a font file.
# NOTE(review): this is an absolute Windows path, so the lookup presumably
# only works on a machine where this file exists — confirm before running elsewhere.
font_path = "C:/Windows/Fonts/NGULIM.TTF"
font = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font)

# Sets the same rcParam ('font.family') as the rc() call above, so
# 'Malgun Gothic' is the value left in effect after this cell.
plt.rcParams['font.family'] = 'Malgun Gothic'

import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [268]:
def data_preprocessing(train, test):
    """Clean the raw train/test frames and return cleaned copies.

    Steps applied:
      * drop the complexes listed in ``error_data`` from ``train``;
      * in '임대보증금' and '임대료', turn the '-' placeholder into NaN, cast the
        column to float, then fill the remaining NaN with 0;
      * cast '총세대수' to float;
      * fill missing subway/bus counts with 0;
      * fill missing '자격유형' for complexes C2411 ('A') and C2253 ('C') in ``test``;
      * drop fully duplicated rows.

    Args:
        train: raw training DataFrame.
        test: raw test DataFrame.

    Returns:
        (train, test) tuple of cleaned DataFrames. The inputs are not modified.
    """
    error_data = ['C2085', 'C1397', 'C2431', 'C1649', 'C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']

    # Work on independent copies: the caller's frames stay untouched, and the
    # assignments below never write into a filtered view of the original.
    train = train[~train['단지코드'].isin(error_data)].copy()
    test = test.copy()

    money_cols = ['임대보증금', '임대료']
    transit_cols = ['도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수']

    for frame in (train, test):
        for col in money_cols:
            # '-' is the placeholder for "no value" in these columns.
            frame.loc[frame[col] == '-', col] = np.nan
            frame[col] = frame[col].astype(float)
        frame['총세대수'] = frame['총세대수'].astype(float)
        frame[money_cols] = frame[money_cols].fillna(0)
        frame[transit_cols] = frame[transit_cols].fillna(0)

    test.loc[test['단지코드'].isin(['C2411']) & test['자격유형'].isnull(), '자격유형'] = 'A'
    test.loc[test['단지코드'].isin(['C2253']) & test['자격유형'].isnull(), '자격유형'] = 'C'

    return train.drop_duplicates(), test.drop_duplicates()

In [269]:
# Load the input tables. All paths are relative, so the CSV files are
# expected in the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Merged onto train/test by the '지역' column in a later cell.
age_gender = pd.read_csv('age_gender_info.csv')
# Source of the '임대료' / '임대보증금' values that overwrite train/test later on.
train_money = pd.read_csv('train_final_1004_ver3.csv')
test_money = pd.read_csv('test_final_1004_ver3.csv')

In [270]:
# Run the cleaning steps of data_preprocessing and rebind both names to the results.
train, test = data_preprocessing(train,test)

In [271]:
# Attach the age_gender columns to every row by region; the left join keeps
# all train/test rows even when a region has no match.
train = train.merge(age_gender, on="지역", how='left')
test = test.merge(age_gender, on="지역", how='left')
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
       '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수', '10대미만(여자)', '10대미만(남자)',
       '10대(여자)', '10대(남자)', '20대(여자)', '20대(남자)', '30대(여자)', '30대(남자)',
       '40대(여자)', '40대(남자)', '50대(여자)', '50대(남자)', '60대(여자)', '60대(남자)',
       '70대(여자)', '70대(남자)', '80대(여자)', '80대(남자)', '90대(여자)', '90대(남자)',
       '100대(여자)', '100대(남자)'],
      dtype='object')

In [272]:
# Preview the first rows of train after the age_gender merge.
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,60대(여자),60대(남자),70대(여자),70대(남자),80대(여자),80대(남자),90대(여자),90대(남자),100대(여자),100대(남자)
0,C2515,545.0,아파트,경상남도,국민임대,33.48,276,17.0,A,9216000.0,...,0.087201,0.069562,0.048357,0.033277,0.027361,0.011295,0.00491,0.001086,0.000179,1e-05
1,C2515,545.0,아파트,경상남도,국민임대,39.6,60,17.0,A,12672000.0,...,0.087201,0.069562,0.048357,0.033277,0.027361,0.011295,0.00491,0.001086,0.000179,1e-05
2,C2515,545.0,아파트,경상남도,국민임대,39.6,20,17.0,A,12672000.0,...,0.087201,0.069562,0.048357,0.033277,0.027361,0.011295,0.00491,0.001086,0.000179,1e-05
3,C2515,545.0,아파트,경상남도,국민임대,46.9,38,17.0,A,18433000.0,...,0.087201,0.069562,0.048357,0.033277,0.027361,0.011295,0.00491,0.001086,0.000179,1e-05
4,C2515,545.0,아파트,경상남도,국민임대,46.9,19,17.0,A,18433000.0,...,0.087201,0.069562,0.048357,0.033277,0.027361,0.011295,0.00491,0.001086,0.000179,1e-05


In [273]:
# Age-band features: for each band, sum the merged age/sex columns (both sexes)
# and multiply by the complex's '총세대수'.
# The 10s-and-under and 60s columns are not part of any band.
age_bands = {
    '20대': ['20대(여자)', '20대(남자)'],
    '30대': ['30대(여자)', '30대(남자)'],
    '40_50대': ['40대(여자)', '40대(남자)', '50대(여자)', '50대(남자)'],
    '고령층': ['70대(여자)', '70대(남자)',
            '80대(여자)', '80대(남자)', '90대(여자)', '90대(남자)',
            '100대(여자)', '100대(남자)'],
}

# Every band is scaled by the household count of the SAME frame. Scaling test
# rows by train['총세대수'] would align on the row index and pair each test row
# with an unrelated training complex.
for frame in (train, test):
    for band, share_cols in age_bands.items():
        frame[band] = frame[share_cols].sum(axis=1) * frame['총세대수']
train

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,80대(여자),80대(남자),90대(여자),90대(남자),100대(여자),100대(남자),20대,30대,40_50대,고령층
0,C2515,545.0,아파트,경상남도,국민임대,33.48,276,17.0,A,9216000.0,...,0.027361,0.011295,0.004910,0.001086,0.000179,0.000010,73.868127,62.110458,165.752739,68.928386
1,C2515,545.0,아파트,경상남도,국민임대,39.60,60,17.0,A,12672000.0,...,0.027361,0.011295,0.004910,0.001086,0.000179,0.000010,73.868127,62.110458,165.752739,68.928386
2,C2515,545.0,아파트,경상남도,국민임대,39.60,20,17.0,A,12672000.0,...,0.027361,0.011295,0.004910,0.001086,0.000179,0.000010,73.868127,62.110458,165.752739,68.928386
3,C2515,545.0,아파트,경상남도,국민임대,46.90,38,17.0,A,18433000.0,...,0.027361,0.011295,0.004910,0.001086,0.000179,0.000010,73.868127,62.110458,165.752739,68.928386
4,C2515,545.0,아파트,경상남도,국민임대,46.90,19,17.0,A,18433000.0,...,0.027361,0.011295,0.004910,0.001086,0.000179,0.000010,73.868127,62.110458,165.752739,68.928386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2551,C2532,239.0,아파트,강원도,국민임대,49.20,19,7.0,A,11346000.0,...,0.033515,0.013027,0.007628,0.001677,0.000319,0.000017,29.523883,27.279940,72.571532,32.665404
2552,C2532,239.0,아파트,강원도,국민임대,51.08,34,7.0,A,14005000.0,...,0.033515,0.013027,0.007628,0.001677,0.000319,0.000017,29.523883,27.279940,72.571532,32.665404
2553,C2532,239.0,아파트,강원도,국민임대,51.73,34,7.0,A,14005000.0,...,0.033515,0.013027,0.007628,0.001677,0.000319,0.000017,29.523883,27.279940,72.571532,32.665404
2554,C2532,239.0,아파트,강원도,국민임대,51.96,114,7.0,A,14005000.0,...,0.033515,0.013027,0.007628,0.001677,0.000319,0.000017,29.523883,27.279940,72.571532,32.665404


In [274]:
# Remove the raw per-decade, per-sex columns from train now that the band
# features have been derived from them.
age_gender_cols = [
    f'{decade}({sex})'
    for decade in ('10대미만', '10대', '20대', '30대', '40대', '50대',
                   '60대', '70대', '80대', '90대', '100대')
    for sex in ('여자', '남자')
]
train.drop(columns=age_gender_cols, inplace=True)

In [275]:
# Preview train after the raw age/sex columns were dropped.
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수,20대,30대,40_50대,고령층
0,C2515,545.0,아파트,경상남도,국민임대,33.48,276,17.0,A,9216000.0,82940.0,0.0,3.0,624.0,205.0,73.868127,62.110458,165.752739,68.928386
1,C2515,545.0,아파트,경상남도,국민임대,39.6,60,17.0,A,12672000.0,107130.0,0.0,3.0,624.0,205.0,73.868127,62.110458,165.752739,68.928386
2,C2515,545.0,아파트,경상남도,국민임대,39.6,20,17.0,A,12672000.0,107130.0,0.0,3.0,624.0,205.0,73.868127,62.110458,165.752739,68.928386
3,C2515,545.0,아파트,경상남도,국민임대,46.9,38,17.0,A,18433000.0,149760.0,0.0,3.0,624.0,205.0,73.868127,62.110458,165.752739,68.928386
4,C2515,545.0,아파트,경상남도,국민임대,46.9,19,17.0,A,18433000.0,149760.0,0.0,3.0,624.0,205.0,73.868127,62.110458,165.752739,68.928386


In [277]:
# train['성인남성'] = train['성인남성'] * 1.3

In [278]:
# Replace rent and deposit in train/test with the columns of the same name
# from train_money/test_money.
# NOTE(review): `.values` discards the index, so rows are matched purely by
# position. This assumes the *_money frames hold the same rows in the same
# order as train/test at this point — confirm.
train['임대료'] = train_money['임대료'].values
train['임대보증금'] = train_money['임대보증금'].values
test['임대료'] = test_money['임대료'].values
test['임대보증금'] = test_money['임대보증금'].values

In [279]:
# Mean of '전용면적' over the training rows.
area_mean = train['전용면적'].mean()

In [280]:
# Binary flag: 1 when '전용면적' is at least area_mean (computed from train), else 0.
# The same threshold is applied to both frames.
for frame in (train, test):
    frame['면적기준이상'] = np.where(frame['전용면적'] >= area_mean, 1, 0)

In [281]:
# Exclude four more complexes from the training rows.
excluded_complexes = ['C1804', 'C2405', 'C1740', 'C1206']
train = train[~train.단지코드.isin(excluded_complexes)]

In [283]:
unique_cols = ['총세대수', '지역', '공가수', '임대보증금','임대료',
           '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '면적기준이상',
           '도보 10분거리 내 버스정류장 수',
           '단지내주차면수', '등록차량수','단지코드','20대','30대','40_50대','고령층']

# Composite key "<단지코드>_<전용면적>". The two columns are addressed by name
# rather than by position in `.values`, so the key does not depend on column order.
train['단지_면적_코드'] = train['단지코드'] + "_" + train['전용면적'].map(str)
# NOTE(review): drop_duplicates() compares column values only and ignores the
# index, so rows with different keys but identical unique_cols values collapse
# into one — confirm this is intended.
train_agg = train.set_index('단지_면적_코드')[unique_cols].drop_duplicates()

test['단지_면적_코드'] = test['단지코드'] + "_" + test['전용면적'].map(str)
# The target column '등록차량수' is excluded from the test selection.
test_agg = test.set_index('단지_면적_코드')[[col for col in unique_cols if col!='등록차량수']].drop_duplicates()


In [284]:
# Collapse fine-grained category codes into broader groups, identically for
# train and test. Keys are the merged labels; values are the codes they replace.
supply_type_groups = {
    '공공임대(5년/10년/분납/분양)': ['공공임대(5년)', '공공분양', '공공임대(10년)', '공공임대(분납)'],
    '국민임대/장기전세': ['장기전세', '국민임대'],
}
eligibility_groups = {
    '행복주택_공급대상': ['J', 'L', 'K', 'N', 'M', 'O'],
    '국민임대/장기전세_공급대상': ['H', 'B', 'E', 'G'],
    '영구임대_공급대상': ['C', 'I', 'F'],
}

for frame in (train, test):
    for column, groups in (('공급유형', supply_type_groups), ('자격유형', eligibility_groups)):
        for merged_label, members in groups.items():
            frame.loc[frame[column].isin(members), column] = merged_label


In [285]:
def reshape_cat_features(data, cast_col, value_col):
    """Pivot one categorical column into wide indicator-style columns.

    Args:
        data: DataFrame containing '단지_면적_코드' and ``cast_col``.
        cast_col: name of the categorical column whose levels become columns.
        value_col: column used as the cell value; the helper column
            ``counter`` (constant 1) is added before pivoting and is what the
            notebook passes here.

    Returns:
        DataFrame indexed by '단지_면적_코드' with one column per level, named
        ``"<cast_col>_<level>"``; key/level combinations that do not occur are 0.
    """
    # One row per (key, level) pair so the pivot has no duplicate entries.
    deduped = data.drop_duplicates(["단지_면적_코드", cast_col])
    flagged = deduped.assign(counter=1)
    wide = flagged.pivot(index='단지_면적_코드', columns=cast_col, values=value_col)
    wide = wide.fillna(0)
    wide.columns.name = None
    return wide.rename(columns=lambda level: cast_col + '_' + str(level))

    
    

In [286]:
# Sum of '전용면적별세대수' per 단지_면적_코드 key.
# The column is selected before aggregating so only it is summed; summing the
# whole frame first would also aggregate every other column, strings included.
area_person = train.groupby('단지_면적_코드')['전용면적별세대수'].sum()
area_person_test = test.groupby('단지_면적_코드')['전용면적별세대수'].sum()

In [287]:
# Build the model matrices by joining, on the 단지_면적_코드 index, the wide
# category indicators and the per-key household sums onto the aggregated frames.
# train_agg/test_agg end up holding everything except the '자격유형' indicators;
# X_train/X_test add those on top.
train_agg = (
    train_agg
    .merge(reshape_cat_features(data=train, cast_col='임대건물구분', value_col='counter'),
           left_index=True, right_index=True)
    .merge(reshape_cat_features(data=train, cast_col='공급유형', value_col='counter'),
           left_index=True, right_index=True)
    .merge(area_person, left_index=True, right_index=True)
)
X_train = train_agg.merge(
    reshape_cat_features(data=train, cast_col='자격유형', value_col='counter'),
    left_index=True, right_index=True)

test_agg = (
    test_agg
    .merge(reshape_cat_features(data=test, cast_col='임대건물구분', value_col='counter'),
           left_index=True, right_index=True)
    .merge(reshape_cat_features(data=test, cast_col='공급유형', value_col='counter'),
           left_index=True, right_index=True)
    .merge(area_person_test, left_index=True, right_index=True)
)
X_test = test_agg.merge(
    reshape_cat_features(data=test, cast_col='자격유형', value_col='counter'),
    left_index=True, right_index=True)


# X_train = pd.concat([train_agg,
#                        reshape_cat_features(data=train, cast_col='임대건물구분', value_col='counter'),
#                        reshape_cat_features(data=train, cast_col='공급유형', value_col='counter'),
#                        reshape_cat_features(data=train, cast_col='자격유형', value_col='counter'),
#                        ], axis=1)

# X_test = pd.concat([test_agg,
#                        reshape_cat_features(data=test, cast_col='임대건물구분', value_col='counter'),
#                        reshape_cat_features(data=test, cast_col='공급유형', value_col='counter'),
#                        reshape_cat_features(data=test, cast_col='자격유형', value_col='counter'),
#                   ], axis=1)

In [288]:
# Per-region ratio of summed '등록차량수' to summed '총세대수', from train rows only.
# Columns are selected with a list: tuple-style selection on a GroupBy
# (groupby(...)['a', 'b']) is rejected by pandas >= 2.0.
region_regi_car = train.groupby(['지역'])[['등록차량수', '총세대수']].sum()
region_regi_car['지역_세대당차량수'] = region_regi_car['등록차량수'] / region_regi_car['총세대수']
region_mapping = region_regi_car.drop(['등록차량수','총세대수'],axis=1)

region_ratio = region_mapping['지역_세대당차량수']

# Fail fast if any row refers to a region without a ratio, instead of letting
# NaN flow silently into the feature matrix.
unseen_regions = (set(X_train['지역']) | set(X_test['지역'])) - set(region_ratio.index)
if unseen_regions:
    raise KeyError(f"no vehicle ratio for region(s): {sorted(map(str, unseen_regions))}")

# Replace the categorical '지역' column with its numeric ratio (appended last).
X_train = X_train.assign(지역_세대당차량수=X_train['지역'].map(region_ratio)).drop(columns=['지역'])
X_test = X_test.assign(지역_세대당차량수=X_test['지역'].map(region_ratio)).drop(columns=['지역'])

In [289]:
from sklearn.preprocessing import StandardScaler
scailing_features = ['총세대수',
                     '공가수',
                     '도보 10분거리 내 버스정류장 수',
                     '임대보증금',
                     '임대료',
                     '단지내주차면수',
                     '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
                     '지역_세대당차량수',
                     '전용면적별세대수',
                     '20대','30대','40_50대',
                     '고령층'

                    ]

# Apply log(1 + x) to the listed columns of both matrices.
# The columns are replaced wholesale (frame[cols] = ...) rather than written
# through .loc[:, cols]: .loc writes into the existing columns, and storing
# floats in an integer column that way is an incompatible-dtype assignment
# that newer pandas versions warn about or reject.
# Re-running this cell applies the transform a second time.
X_train[scailing_features] = np.log1p(X_train[scailing_features])
X_test[scailing_features] = np.log1p(X_test[scailing_features])

In [290]:
# Keep the test complex codes aside; '단지코드' is dropped from X_test in the next cell.
test_code = X_test['단지코드']

In [291]:
# Features: everything except the target and the complex identifier.
X = X_train.drop(columns=['등록차량수', '단지코드'])
# Target: log(1 + registered vehicles).
y = np.log1p(X_train['등록차량수'])
X_test = X_test.drop(columns=['단지코드'])

In [292]:
from sklearn.model_selection import KFold

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

def kfold_val(n, model, X_data, y_target,expm1):
    """Cross-validate ``model`` with K-fold splits and return the mean MAE.

    Args:
        n: number of folds, passed to ``KFold(n_splits=n)``.
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            refitted on every fold.
        X_data: feature DataFrame; rows are selected positionally via ``.iloc``.
        y_target: targets in the same row order as ``X_data`` (Series, list or
            array).
        expm1: when truthy, ``np.expm1`` is applied to targets and predictions
            before computing the MAE.

    Returns:
        Mean of the per-fold mean absolute errors. The model and that mean are
        also printed.
    """
    # KFold yields positional indices. Indexing a Series with them directly is
    # label-based, so the targets are taken as a plain array first.
    y_values = np.asarray(y_target)
    cv_mae = []

    for train_index, test_index in KFold(n_splits=n).split(X_data):
        X_fold_train, X_fold_valid = X_data.iloc[train_index], X_data.iloc[test_index]
        y_fold_train, y_fold_valid = y_values[train_index], y_values[test_index]

        # Fit on the training part of the fold, predict on the held-out part.
        model.fit(X_fold_train, y_fold_train)
        y_pred = model.predict(X_fold_valid)

        if expm1:
            y_fold_valid, y_pred = np.expm1(y_fold_valid), np.expm1(y_pred)

        cv_mae.append(mean_absolute_error(y_fold_valid, y_pred))

    # Average the per-fold errors into a single score.
    mean_mae = np.mean(cv_mae)
    print(model, '\n## 평균 검증 MAE:', mean_mae)
    return mean_mae

In [293]:
from sklearn.model_selection import GridSearchCV

def get_best_params(model, params,X_data, y_target, cv=10):
    """Grid-search ``params`` for ``model`` and return the best refitted estimator.

    Args:
        model: estimator to tune.
        params: parameter grid passed to ``GridSearchCV(param_grid=...)``.
        X_data: feature matrix.
        y_target: target values.
        cv: cross-validation setting passed to ``GridSearchCV``; defaults to 10.

    Returns:
        ``best_estimator_`` of the fitted grid search. The best score (sign
        flipped to a positive MAE) and best parameters are printed.
    """
    grid_model = GridSearchCV(model, param_grid=params,
                              scoring='neg_mean_absolute_error', cv=cv)
    grid_model.fit(X_data, y_target)
    # The scorer is negated MAE, so flip the sign back for reporting.
    mae = -1 * grid_model.best_score_
    # The message reports the cv value actually used instead of a fixed number.
    print('{0} {1} CV 시 최적 평균 로그 변환된 MAE 값: {2}, 최적 alpha:{3}'.format(model.__class__.__name__,
                                        cv, np.round(mae, 4), grid_model.best_params_))
    return grid_model.best_estimator_

In [294]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion (no bias column) of the training features.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
poly_X = pd.DataFrame(poly_features.fit_transform(X))

# Reuse the transformer fitted on X, and first align X_test to X's columns and
# order. A separately fitted transformer would map expanded column i to a
# different feature pair whenever X_test's columns differ from X's.
# Columns absent from X_test are filled with 0.
X_test_aligned = X_test.reindex(columns=X.columns, fill_value=0)
poly_test = pd.DataFrame(poly_features.transform(X_test_aligned))

In [295]:
from statsmodels.stats.outliers_influence import OLSInfluence
import statsmodels.api as sm
# Fit an OLS model on the polynomial features and take each row's Cook's distance.
y = list(y)
OLS = sm.OLS(y, poly_X)
result = OLS.fit()
influence = OLSInfluence(result)

weight = (influence.cooks_distance[0]).to_numpy()

# Turn the distances into sample weights by decade bucket: below the first
# bound -> 0.9, then one step lower per bound, down to 0.1 for values at or
# above the last bound (0.1).
cook_upper_bounds = [0.00000001, 0.0000001, 0.000001, 0.00001,
                     0.0001, 0.001, 0.01, 0.1]
bucket_weights = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1])
# np.digitize returns, for each value, how many bounds are <= it; that count
# indexes bucket_weights. The result is written into `weight` in place.
weight[:] = bucket_weights[np.digitize(weight, cook_upper_bounds)]

In [296]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import StratifiedKFold
y = pd.Series(y)

# Cut the continuous target into 10 equal-width bins so StratifiedKFold can
# stratify the folds on it.
y_cat = pd.cut(y, 10, labels=range(10))

skf = StratifiedKFold(5)

ridge_params = {'alpha' : [0.05,0.1,1,5,8,10,12,13,14,15,16,17,20,25]}
lasso_params = {'alpha' : [0.0005,0.001,0.005, 0.008,0.05, 0.03, 0.1, 0.5, 1.5,10]}
elastic_params = {'alpha' : [0.0005,0.001,0.005, 0.008,0.05, 0.03, 0.1, 0.5, 1.5,10]}

ridge_reg = Ridge()
lasso_reg = Lasso()
elastic_reg = ElasticNet()

# Pick alpha for each model by grid search.
best_ridge=get_best_params(ridge_reg,ridge_params,poly_X,y)
best_lasso = get_best_params(lasso_reg,lasso_params,poly_X,y)
best_elastic = get_best_params(elastic_reg,elastic_params,poly_X,y)

# Fresh estimators carrying only the selected alpha.
ridge_reg = Ridge(alpha=best_ridge.alpha)
lasso_reg = Lasso(alpha=best_lasso.alpha)
elastic_reg = ElasticNet(alpha=best_elastic.alpha)

models = [ridge_reg, lasso_reg,elastic_reg]
for model in models:
    cv_mae = []
    for train_index, test_index in skf.split(poly_X, y_cat):
        # Fold-local names: the notebook-level X_train feature frame must not
        # be rebound to a fold subset.
        X_tr, X_te = poly_X.iloc[train_index], poly_X.iloc[test_index]
        # skf yields positional indices, so select positionally.
        y_tr, y_te = y.iloc[train_index], y.iloc[test_index]
        weight_tr = weight[train_index]

        model.fit(X_tr, y_tr, sample_weight=weight_tr)
        y_pred = model.predict(X_te)

        # np.expm1 is applied to targets and predictions before scoring.
        cv_mae.append(mean_absolute_error(np.expm1(y_te), np.expm1(y_pred)))
    # Average the per-fold errors for this model.
    print(model, '\n## 평균 검증 MAE:', np.mean(cv_mae))

  



Ridge 5 CV 시 최적 평균 로그 변환된 MAE 값: 0.2613, 최적 alpha:{'alpha': 10}
Lasso 5 CV 시 최적 평균 로그 변환된 MAE 값: 0.2738, 최적 alpha:{'alpha': 0.001}
ElasticNet 5 CV 시 최적 평균 로그 변환된 MAE 값: 0.2744, 최적 alpha:{'alpha': 0.001}
Ridge(alpha=10) 
## 평균 검증 MAE: 111.76181781722157
Lasso(alpha=0.001) 
## 평균 검증 MAE: 113.43521023705826
ElasticNet(alpha=0.001) 
## 평균 검증 MAE: 113.59834306637276


In [300]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(1495, 27)

In [298]:
# Names of the features in X.
X.columns

Index(['총세대수', '공가수', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '면적기준이상',
       '도보 10분거리 내 버스정류장 수', '단지내주차면수', '20대', '30대', '40_50대', '고령층',
       '임대건물구분_상가', '임대건물구분_아파트', '공급유형_공공임대(50년)', '공급유형_공공임대(5년/10년/분납/분양)',
       '공급유형_국민임대/장기전세', '공급유형_영구임대', '공급유형_임대상가', '공급유형_행복주택', '전용면적별세대수',
       '자격유형_A', '자격유형_D', '자격유형_국민임대/장기전세_공급대상', '자격유형_영구임대_공급대상',
       '자격유형_행복주택_공급대상', '지역_세대당차량수'],
      dtype='object')

In [299]:
# Inspect the '고령층' feature (log1p was applied to this column in an earlier cell).
X['고령층']

단지_면적_코드
C1000_39.57    4.193956
C1000_46.7     4.193956
C1000_51.93    4.193956
C1004_14.1     4.098397
C1004_19.0     4.098397
                 ...   
C2680_36.63    4.573343
C2680_46.71    4.573343
C2692_36.64    4.155660
C2692_46.49    4.155660
C2692_51.59    4.155660
Name: 고령층, Length: 1495, dtype: float64