In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import timeit
import sklearn
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn diagnostics — confirm that is intended.
warnings.filterwarnings('ignore')
import sys
# Render matplotlib text with the "Malgun Gothic" font family.
# NOTE(review): presumably chosen so Korean labels display; verify the font is installed.
plt.rc("font", family="Malgun Gothic")

In [2]:
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [3]:
# Read the preprocessed apartment-transaction table (UTF-8 encoded CSV).
apt_price = pd.read_csv(
    '아파트_전처리.csv',
    encoding='utf8',
)

In [7]:
# Partition the rows on the contract-year column '계약년':
#   a -> every row whose year is not 2022
#   b -> rows whose year is exactly 2022
is_2022 = apt_price['계약년'] == 2022
a = apt_price[~is_2022]
b = apt_price[is_2022]

In [20]:
# 2022 rows without the price column '거래금액(만원)'; the bare name displays the frame.
b1 = b.drop(columns=['거래금액(만원)'])
b1

Unnamed: 0,전용면적(㎡),계약일,층,건축년도,구,동,평,계약년,계약월,한강,건물나이,재건축
807176,4.394079,12,2.197225,1987,24,293,3.310543,2022,4,0.0,35,0.693147
807177,4.394079,21,1.945910,1987,24,293,3.310543,2022,4,0.0,35,0.693147
807178,4.394079,27,1.945910,1987,24,293,3.310543,2022,5,0.0,35,0.693147
807188,4.637831,1,2.890372,2020,24,293,3.549617,2022,4,0.0,2,0.000000
807189,4.920419,2,3.091042,2020,24,293,3.826465,2022,5,0.0,2,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
824115,4.106932,27,3.218876,1997,3,67,3.030134,2022,3,0.0,25,0.000000
824116,4.106932,20,2.079442,1997,3,67,3.030134,2022,7,0.0,25,0.000000
824117,4.443004,27,2.833213,1997,3,67,3.356897,2022,7,0.0,25,0.000000
824119,4.450736,9,2.484907,2003,3,67,3.367296,2022,4,0.0,19,0.000000


In [8]:
# Feature matrix and target taken from `a` (the rows whose contract year is not 2022).
target_col = '거래금액(만원)'
X, y = a.drop(columns=target_col), a[target_col]

In [4]:
# Feature matrix and target built from the complete table (all contract years).
# NOTE(review): read top to bottom, this rebinds the X / y created from `a` in the
# previous cell; the execution counts suggest the cells were run in a different
# order — confirm which definition is meant to feed the train/test split.
X = apt_price.drop(columns=['거래금액(만원)'])
y = apt_price.loc[:, '거래금액(만원)']

In [5]:
# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so the split — and every score and prediction
# derived from it — is identical after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [9]:
# Base regressors used for blending.
# random_state is fixed on the RandomForest and XGBoost estimators so repeated
# runs give the same fitted models (bootstrap / feature sampling is otherwise
# re-drawn on every fit).
forest = RandomForestRegressor(n_estimators=30, n_jobs=-1, random_state=42)
xgboost = xgb.XGBRegressor(random_state=42)
# LightGBM keeps its original hyper-parameters; bagging is already seeded via bagging_seed.
# NOTE(review): min_data_in_leaf and min_child_samples are documented aliases of
# the same LightGBM parameter but are given different values (15 vs 30) — confirm
# which one is intended and drop the other.
# NOTE(review): max_depth=6 allows at most 2**6 = 64 leaves, so num_leaves=100
# cannot be reached — confirm the intended tree size.
lightgbm = lgb.LGBMRegressor(num_leaves = 100, min_data_in_leaf = 15, max_depth=6,
                            learning_rate = 0.1, min_child_samples = 30, feature_fraction=0.9, bagging_freq= 1,
                            bagging_fraction = 0.9, bagging_seed = 11, lambda_l1 = 0.1, verbosity = -1 )

In [10]:
# Registry of the estimators to blend. The order matters: later cells address
# the entries by position (0 = XGBoost, 1 = LightGBM, 2 = RandomForest).
models = [
    {'model': estimator, 'name': label}
    for label, estimator in (
        ('XGBoost', xgboost),
        ('LightGBM', lightgbm),
        ('RandomForest', forest),
    )
]

def AveragingBlending(models, x, y, sub_x, weights=None):
    """Fit every model on ``(x, y)`` and predict ``sub_x`` with each of them.

    Parameters
    ----------
    models : sequence of dict
        Each entry stores an estimator under the key ``'model'``. The
        estimator must provide ``fit(X, y)`` and ``predict(X)``. The
        estimators are fitted in place.
    x : object exposing ``.values`` (e.g. a pandas DataFrame)
        Training features; passed to ``fit`` as ``x.values``.
    y : array-like
        Training target, passed to ``fit`` unchanged.
    sub_x : object exposing ``.values``
        Features to predict; passed to ``predict`` as ``sub_x.values``.
    weights : sequence of float, optional
        One weight per model, in the same order as ``models``. When omitted
        (the default) the per-model predictions are returned unblended. The
        weights are applied as given and are not normalised.

    Returns
    -------
    numpy.ndarray
        ``weights is None``: the predictions stacked column-wise, one column
        per model in ``models`` order.
        Otherwise: the weighted sum of those columns.

    Raises
    ------
    ValueError
        If ``models`` is empty or ``weights`` does not hold exactly one value
        per model.
    """
    if not models:
        raise ValueError("AveragingBlending needs at least one model")

    # Convert once; every estimator sees the same plain array.
    train_values = x.values
    for m in models:
        m['model'].fit(train_values, y)

    sub_values = sub_x.values
    predictions = np.column_stack([m['model'].predict(sub_values) for m in models])

    if weights is None:
        return predictions

    weight_vector = np.asarray(weights, dtype=float)
    if weight_vector.shape != (len(models),):
        raise ValueError(
            "weights must contain exactly one value per model "
            f"(expected {len(models)}, got shape {weight_vector.shape})"
        )
    return predictions @ weight_vector

In [11]:
# Fit the RandomForest on the training split and predict the held-out rows in
# one chained call (fit() returns the fitted estimator).
predictions = forest.fit(X_train.values, y_train).predict(X_test.values)

In [12]:
# Display the current contents of `predictions`.
predictions

array([10.67494971, 11.45793139, 11.44880643, ..., 11.50008791,
       10.61643566, 11.42130991])

In [13]:
# Fit all registered models on the training split and collect their
# predictions for the test split, one column per model.
y_test_pred = AveragingBlending(
    models=models,
    x=X_train,
    y=y_train,
    sub_x=X_test,
)



In [14]:
# Weighted blend of the per-model prediction columns.
# Column order follows `models`: 0 = XGBoost, 1 = LightGBM, 2 = RandomForest.
w_xgb, w_lgbm, w_rf = 0.05, 0.1, 0.85
predictions = (
    w_xgb * y_test_pred[:, 0]
    + w_lgbm * y_test_pred[:, 1]
    + w_rf * y_test_pred[:, 2]
)
predictions

array([10.6971001 , 11.45483503, 11.42717919, ..., 11.49230557,
       10.62409102, 11.44826764])

In [15]:
# Test-split predictions of the first three registered models, by position.
c1, c2, c3 = (
    models[position]['model'].predict(X_test.values)
    for position in range(3)
)

In [30]:
# Print each model's score() on the held-out split, in `models` order.
# The estimators are fitted on plain arrays (`.values`) inside AveragingBlending,
# so they are scored on the same representation; passing the DataFrame instead
# mixes named and unnamed feature inputs.
X_test_values = X_test.values
for position in range(3):
    print(models[position]['model'].score(X_test_values, y_test))

0.9539238233375416
0.9354042972251623
0.9757063744754952


In [16]:
# Print each model's score() on the held-out split, in `models` order.
# The estimators are fitted on plain arrays (`.values`) inside AveragingBlending,
# so they are scored on the same representation; passing the DataFrame instead
# mixes named and unnamed feature inputs.
X_test_values = X_test.values
for position in range(3):
    print(models[position]['model'].score(X_test_values, y_test))

0.9550230191590715
0.9358591806010113
0.9753346614802751


In [18]:
# Blend the three prediction vectors with fixed weights (c1: 0.05, c2: 0.1, c3: 0.85).
weight_c1, weight_c2, weight_c3 = 0.05, 0.1, 0.85
p1 = weight_c1 * c1 + weight_c2 * c2 + weight_c3 * c3
# Show exp(p1) - 1.
# NOTE(review): presumably undoes a log1p transform applied to the price during
# preprocessing — confirm against the preprocessing notebook.
np.expm1(p1)

array([44226.41431412, 94355.46164601, 91781.707916  , ...,
       97958.12601577, 41112.46867309, 93737.81882719])

## Test 예측값

In [20]:
# One-column frame holding expm1 of the blended predictions, indexed like X_test
# so it can be aligned with the test rows.
d2 = pd.Series(
    np.expm1(p1),
    index=X_test.index,
    name='거래금액(만원)',
).to_frame()

In [21]:
# Show the test features side by side with the predicted price column.
pd.concat((X_test, d2), axis='columns')

Unnamed: 0,전용면적(㎡),계약일,층,건축년도,구,동,평,계약년,계약월,한강,건물나이,재건축,거래금액(만원)
672940,4.109233,24,1.945910,2000,5,82,3.034953,2020,2,0.000000,20,0.000000,44226.414314
366309,4.424248,16,2.639057,1979,21,306,3.339322,2016,7,0.000000,37,0.693147,94355.461646
676197,4.452252,19,2.639057,1996,5,202,3.367296,2020,6,0.000000,24,0.000000,91781.707916
115948,4.453998,4,1.945910,1988,2,83,3.367296,2014,2,0.000000,26,0.000000,36665.110700
388542,4.453998,6,2.708050,2008,17,184,3.367296,2016,8,0.000000,8,0.000000,55070.125929
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795942,4.243440,16,2.079442,2004,15,272,3.165475,2021,7,0.693147,17,0.000000,50905.358032
124904,4.453870,25,1.945910,2004,16,230,3.367296,2014,7,0.000000,10,0.000000,49010.565002
586228,4.850075,31,2.564949,2005,13,312,3.756538,2017,10,0.000000,12,0.000000,97958.126016
227762,4.371850,22,1.945910,2003,16,195,3.288402,2015,6,0.000000,12,0.000000,41112.468673


In [25]:
# Show expm1 of the '전용면적(㎡)' column for the test rows.
X_test['전용면적(㎡)'].pipe(np.expm1)

672940     59.900
366309     82.450
676197     84.820
115948     84.970
388542     84.970
           ...   
795942     68.647
124904     84.959
586228    126.750
227762     78.190
37689     131.960
Name: 전용면적(㎡), Length: 247237, dtype: float64

## Test 실제값

In [23]:
# Show expm1 of the held-out target values.
y_test.pipe(np.expm1)

672940    44800.0
366309    94700.0
676197    92000.0
115948    39900.0
388542    51000.0
           ...   
795942    68000.0
124904    46800.0
586228    90000.0
227762    40500.0
37689     81896.0
Name: 거래금액(만원), Length: 247237, dtype: float64

In [48]:
# Load the merged raw transaction file and list its column labels.
원본 = pd.read_csv(
    '../아파트 실거래 병합/apt_price.csv',
    encoding='utf8',
)
원본.columns

Index(['시군구', '번지', '본번', '부번', '단지명', '전용면적(㎡)', '계약년월', '계약일', '거래금액(만원)',
       '층', '건축년도', '도로명', '거래유형', '중개사소재지', '구', '동', '평', '거래금액_억원', '계약년',
       '계약월', '평형'],
      dtype='object')

In [49]:
# Keep only the 2022 transactions. The explicit .copy() gives an independent
# frame, so the column assignment below does not write into a slice of `원본`.
원본2022 = 원본[원본['계약년']==2022].copy()
# Replace the price with the prediction rounded to the nearest 1,000 (만원).
# The assignment aligns on the index.
# NOTE(review): any 2022 row whose index is missing from d2 receives NaN (and the
# column becomes float) — confirm d2 covers every 2022 row.
원본2022['거래금액(만원)'] = np.round(d2['거래금액(만원)'],-3).astype('int64')
# Drop the columns not needed in the export; reassign instead of mutating in place.
원본2022 = 원본2022.drop(columns=['시군구','본번','부번','거래유형','중개사소재지','거래금액_억원'])

Unnamed: 0,번지,단지명,전용면적(㎡),계약년월,계약일,거래금액(만원),층,건축년도,도로명,구,동,평,계약년,계약월,평형
807176,658-1,개포6차우성아파트1동~8동,79.97,202204,12,200000,4,1987,언주로 3,강남구,개포동,26.4,2022,4,20평대
807177,658-1,개포6차우성아파트1동~8동,79.97,202204,21,201000,2,1987,언주로 3,강남구,개포동,26.4,2022,4,20평대
807178,658-1,개포6차우성아파트1동~8동,79.97,202205,27,210000,2,1987,언주로 3,강남구,개포동,26.4,2022,5,20평대
807188,1282,개포래미안포레스트,102.32,202204,1,317000,13,2020,개포로 264,강남구,개포동,33.8,2022,4,30평대
807189,1282,개포래미안포레스트,136.06,202205,2,381000,17,2020,개포로 264,강남구,개포동,44.9,2022,5,40평대
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824115,450,한신아파트(103~109),59.76,202203,27,66000,20,1997,동일로 752,중랑구,중화동,19.7,2022,3,10평대
824116,450,한신아파트(103~109),59.76,202207,20,65000,3,1997,동일로 752,중랑구,중화동,19.7,2022,7,10평대
824117,450,한신아파트(103~109),84.03,202207,27,77000,12,1997,동일로 752,중랑구,중화동,27.7,2022,7,20평대
824119,274-51,한영(101),84.69,202204,9,68000,7,2003,동일로144길 74,중랑구,중화동,28.0,2022,4,20평대


In [51]:
# Move the current index into a regular column and renumber the rows from 0.
# Reassigning (rather than inplace=True) keeps the step an explicit rebinding.
원본2022 = 원본2022.reset_index()

In [55]:
# Count the distinct non-missing values in the '단지명' column.
원본2022['단지명'].nunique(dropna=True)

2860

In [56]:
# Write the 2022 rows with predicted prices to CSV, without the index column.
원본2022.to_csv(
    '아파트2022_예측값.csv',
    index=False,
)

In [26]:
import joblib

In [27]:
# Persist the three fitted estimators, by position in `models`.
# joblib.dump returns the list of files it wrote; the last result is left as
# the cell's final expression so it is displayed.
dump_results = [
    joblib.dump(models[position]['model'], file_name)
    for position, file_name in enumerate(('XGB1.pkl', 'LGBM1.pkl', 'RandomForest1.pkl'))
]
dump_results[-1]

['RandomForest1.pkl']