# import

In [73]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from IPython.display import Image
from sklearn.model_selection import train_test_split

np.set_printoptions(suppress=True, precision=3)

my_predictions = {}
r2_predictions = {}
my_pred = None
my_actual = None
my_name = None

colors = ['r', 'c', 'm', 'y', 'k', 'khaki', 'teal', 'orchid', 'sandybrown',
          'greenyellow', 'dodgerblue', 'deepskyblue', 'rosybrown', 'firebrick',
          'deeppink', 'crimson', 'salmon', 'darkred', 'olivedrab', 'olive', 
          'forestgreen', 'royalblue', 'indigo', 'navy', 'mediumpurple', 'chocolate',
          'gold', 'darkorange', 'seagreen', 'turquoise', 'steelblue', 'slategray', 
          'peru', 'midnightblue', 'slateblue', 'dimgray', 'cadetblue', 'tomato'
         ]

def plot_predictions(name_, pred, actual):
    df = pd.DataFrame({'prediction': pred, 'actual': y_test})
    df = df.sort_values(by='actual').reset_index(drop=True)

    plt.figure(figsize=(11, 8))
    plt.scatter(df.index, df['prediction'], marker='x', color='r')
    plt.scatter(df.index, df['actual'], alpha=0.7, marker='o', color='black')
    plt.title(name_, fontsize=15)
    plt.legend(['prediction', 'actual'], fontsize=12)
    plt.show()

def mse_eval(name_, pred, actual):
    global my_predictions, colors, my_pred, my_actual, my_name
    
    my_name = name_
    my_pred = pred
    my_actual = actual

    plot_predictions(name_, pred, actual)

    mse = mean_squared_error(pred, actual)
    my_predictions[name_] = mse

    y_value = sorted(my_predictions.items(), key=lambda x: x[1], reverse=True)
    
    df = pd.DataFrame(y_value, columns=['model', 'mse'])
    print(df)
    min_ = df['mse'].min() - 10
    max_ = df['mse'].max() + 10
    
    length = len(df) / 2
    
    plt.figure(figsize=(9, length))
    ax = plt.subplot()
    ax.set_yticks(np.arange(len(df)))
    ax.set_yticklabels(df['model'], fontsize=12)
    bars = ax.barh(np.arange(len(df)), df['mse'], height=0.3)
    
    for i, v in enumerate(df['mse']):
        idx = np.random.choice(len(colors))
        bars[i].set_color(colors[idx])
        ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=12, fontweight='bold', verticalalignment='center')
        
    plt.title('MSE Error', fontsize=16)
    plt.xlim(min_, max_)
    
    plt.show()
    
def r2_eval(name_, pred, actual):
    global r2_predictions, colors, my_pred, my_actual, my_name
    
    my_name = name_
    my_pred = pred
    my_actual = actual

    plot_predictions(name_, pred, actual)

    r2 = r2_score(pred, actual)
    r2_predictions[name_] = r2

    y_value = sorted(r2_predictions.items(), key=lambda x: x[1], reverse=True)
    
    df = pd.DataFrame(y_value, columns=['model', 'r2'])
    print(df)
    min_ = df['r2'].min() - 10
    max_ = df['r2'].max() + 10
    
    length = len(df) / 2
    
    plt.figure(figsize=(9, length))
    ax = plt.subplot()
    ax.set_yticks(np.arange(len(df)))
    ax.set_yticklabels(df['model'], fontsize=12)
    bars = ax.barh(np.arange(len(df)), df['r2'], height=0.3)
    
    for i, v in enumerate(df['r2']):
        idx = np.random.choice(len(colors))
        bars[i].set_color(colors[idx])
        ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=12, fontweight='bold', verticalalignment='center')
        
    plt.title('r2 score', fontsize=16)
    plt.xlim(min_, max_)
    
    plt.show()

def add_r2_model(name_, pred, actual):
    global r2_predictions, my_pred, my_actual, my_name
    my_name = name_
    my_pred = pred
    my_actual = actual
    
    r2 = r2_score(pred, actual)
    r2_predictions[name_] = r2
    
def add_model(name_, pred, actual):
    global my_predictions, my_pred, my_actual, my_name
    my_name = name_
    my_pred = pred
    my_actual = actual
    
    mse = mean_squared_error(pred, actual)
    my_predictions[name_] = mse

def remove_model(name_):
    global my_predictions
    try:
        del my_predictions[name_]
    except KeyError:
        return False
    return True

def plot_all():
    global my_predictions, my_pred, my_actual, my_name
    
    plot_predictions(my_name, my_pred, my_actual)
    
    y_value = sorted(my_predictions.items(), key=lambda x: x[1], reverse=True)
    
    df = pd.DataFrame(y_value, columns=['model', 'mse'])
    print(df)
    min_ = df['mse'].min() - 10
    max_ = df['mse'].max() + 10
    
    length = len(df) / 2
    
    plt.figure(figsize=(9, length))
    ax = plt.subplot()
    ax.set_yticks(np.arange(len(df)))
    ax.set_yticklabels(df['model'], fontsize=12)
    bars = ax.barh(np.arange(len(df)), df['mse'], height=0.3)
    
    for i, v in enumerate(df['mse']):
        idx = np.random.choice(len(colors))
        bars[i].set_color(colors[idx])
        ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=12, fontweight='bold', verticalalignment='center')
        
    plt.title('MSE Error', fontsize=16)
    plt.xlim(min_, max_)
    
    plt.show()
    
def r2_plot_all():
    global r2_predictions, my_pred, my_actual, my_name
    
    plot_predictions(my_name, my_pred, my_actual)
    
    y_value = sorted(r2_predictions.items(), key=lambda x: x[1], reverse=True)
    
    df = pd.DataFrame(y_value, columns=['model', 'r2'])
    print(df)
    min_ = df['r2'].min() - 10
    max_ = df['r2'].max() + 10
    
    length = len(df) / 2
    
    plt.figure(figsize=(9, length))
    ax = plt.subplot()
    ax.set_yticks(np.arange(len(df)))
    ax.set_yticklabels(df['model'], fontsize=12)
    bars = ax.barh(np.arange(len(df)), df['r2'], height=0.3)
    
    for i, v in enumerate(df['r2']):
        idx = np.random.choice(len(colors))
        bars[i].set_color(colors[idx])
        ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=12, fontweight='bold', verticalalignment='center')
        
    plt.title('r2 score', fontsize=16)
    plt.xlim(min_, max_)
    
    plt.show()

# #22-10-09#  

22-10-07에 서울시 특정구에 따른 전월세 예측을 했으나   
특정구의 동과 건물용도 컬럼을 라벨 인코딩 진행을 하여 모델링을 했다.  
컬럼들의 가중치를 계산 했을 때 건물용도와 동 컬럼에 상당히 큰 가중치가 부여되었다.  
coef_값이 큰 컬럼을 라벨 인코딩을 하여 학습을 하면 모델의 성능이 낮지 않을까?  
그것을 확인 하기 위해 라벨 인코딩 없는 동에 따른 전월세 예측을 한번 해보자.  

## 데이터 프레임 만들기

데이터가 가장 많은 송파구의 월세 = songpa_month  
데이터가 가장 많은 송파구의 전세 = songpa_rent  
데이터가 가장 적은 중구의 월세 = junggu_month  
데이터가 가장 적은 중구의 전세 = junggu_rent

In [74]:
df_2022 = pd.read_csv("data/서울시 부동산 전월세가 정보_2022.csv", encoding='cp949')
df_2021 = pd.read_csv("data/서울시 부동산 전월세가 정보_2021.csv", encoding='cp949')
df = pd.concat([df_2021,df_2022])
df = df.reset_index(drop=True)
df['층'] = df['층'].fillna(1)
columns = ['접수연도','자치구명','법정동명','층','전월세 구분','임대면적(㎡)','보증금(만원)','임대료(만원)','건물용도','신규갱신여부', '종전 보증금', '종전 임대료']
df = df[columns]
df = df[df['신규갱신여부']=='갱신']
df_rent = df[df['전월세 구분'] == '전세']
df_month = df[df['전월세 구분'] == '월세']
rent_columns=['접수연도','법정동명','층','임대면적(㎡)','보증금(만원)','건물용도','종전 보증금']
month_columns=['접수연도','법정동명','층','임대면적(㎡)','보증금(만원)','건물용도','종전 보증금','임대료(만원)','종전 임대료']
songpa_rent = df_rent[df_rent['자치구명'] == '송파구'][rent_columns]
songpa_month = df_month[df_month['자치구명'] == '송파구'][month_columns]
junggu_rent = df_rent[df_rent['자치구명'] == '중구'][rent_columns]
junggu_month = df_month[df_month['자치구명'] == '중구'][month_columns]

songpa_rent = pd.get_dummies(data=songpa_rent, columns=['건물용도'])
songpa_month = pd.get_dummies(data=songpa_month, columns=['건물용도'])
junggu_rent = pd.get_dummies(data=junggu_rent, columns=['건물용도'])
junggu_month = pd.get_dummies(data=junggu_month, columns=['건물용도'])

  df_2022 = pd.read_csv("data/서울시 부동산 전월세가 정보_2022.csv", encoding='cp949')
  df_2021 = pd.read_csv("data/서울시 부동산 전월세가 정보_2021.csv", encoding='cp949')


In [75]:
songpa_rent.head(2)

Unnamed: 0,접수연도,법정동명,층,임대면적(㎡),보증금(만원),종전 보증금,건물용도_단독다가구,건물용도_아파트,건물용도_연립다세대,건물용도_오피스텔
20,2021,잠실동,1.0,33.0,13000,7500.0,1,0,0,0
60,2021,석촌동,5.0,26.71,38850,37000.0,0,0,1,0


# 송파구의 월세,전세의 거래 수 1등(잠실)과 꼴지 동(마천)의 모델 만들고 성능보기

In [76]:
jamsil_rent = songpa_rent[songpa_rent['법정동명'] =='잠실동'].drop('법정동명',1)
jamsil_month = songpa_month[songpa_month['법정동명'] =='잠실동'].drop('법정동명',1)
machun_rent = songpa_rent[songpa_rent['법정동명'] =='마천동'].drop('법정동명',1)
machun_month = songpa_month[songpa_month['법정동명'] =='마천동'].drop('법정동명',1)

  jamsil_rent = songpa_rent[songpa_rent['법정동명'] =='잠실동'].drop('법정동명',1)
  jamsil_month = songpa_month[songpa_month['법정동명'] =='잠실동'].drop('법정동명',1)
  machun_rent = songpa_rent[songpa_rent['법정동명'] =='마천동'].drop('법정동명',1)
  machun_month = songpa_month[songpa_month['법정동명'] =='마천동'].drop('법정동명',1)


In [77]:
jamsil_month

Unnamed: 0,접수연도,층,임대면적(㎡),보증금(만원),종전 보증금,임대료(만원),종전 임대료,건물용도_단독다가구,건물용도_아파트,건물용도_연립다세대,건물용도_오피스텔
106,2021,3.0,53.82,26000,26000.0,3,0.0,0,0,1,0
240,2021,19.0,84.97,20000,20000.0,202,190.0,0,1,0,0
379,2021,8.0,27.68,6765,5000.0,127,120.0,0,1,0,0
724,2021,4.0,56.16,3100,2000.0,50,50.0,0,0,1,0
736,2021,5.0,43.37,13500,12500.0,20,20.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
886224,2022,11.0,43.44,1000,1000.0,130,120.0,0,0,0,1
886677,2022,1.0,48.00,10000,10000.0,30,30.0,1,0,0,0
886689,2022,1.0,48.00,10000,10000.0,30,20.0,1,0,0,0
887329,2022,5.0,31.48,3000,3000.0,83,80.0,0,0,1,0


In [78]:
jr_x_train, jr_x_test, jr_y_train, jr_y_test = train_test_split(jamsil_rent.drop('보증금(만원)', 1), jamsil_rent['보증금(만원)'], random_state=0)
jm_x_train, jm_x_test, jm_y_train, jm_y_test = train_test_split(jamsil_month.drop('임대료(만원)', 1), jamsil_month['임대료(만원)'], random_state=0)
mr_x_train, mr_x_test, mr_y_train, mr_y_test = train_test_split(machun_rent.drop('보증금(만원)', 1), machun_rent['보증금(만원)'], random_state=0)
mm_x_train, mm_x_test, mm_y_train, mm_y_test = train_test_split(machun_month.drop('보증금(만원)', 1), machun_month['보증금(만원)'], random_state=0)

  jr_x_train, jr_x_test, jr_y_train, jr_y_test = train_test_split(jamsil_rent.drop('보증금(만원)', 1), jamsil_rent['보증금(만원)'], random_state=0)
  jm_x_train, jm_x_test, jm_y_train, jm_y_test = train_test_split(jamsil_month.drop('임대료(만원)', 1), jamsil_month['임대료(만원)'], random_state=0)
  mr_x_train, mr_x_test, mr_y_train, mr_y_test = train_test_split(machun_rent.drop('보증금(만원)', 1), machun_rent['보증금(만원)'], random_state=0)
  mm_x_train, mm_x_test, mm_y_train, mm_y_test = train_test_split(machun_month.drop('보증금(만원)', 1), machun_month['보증금(만원)'], random_state=0)


In [79]:
from sklearn.linear_model import LinearRegression

model_jr = LinearRegression().fit(jr_x_train, jr_y_train)
model_jm = LinearRegression().fit(jm_x_train, jm_y_train)
model_mr = LinearRegression().fit(mr_x_train, mr_y_train)
model_mm = LinearRegression().fit(mm_x_train, mm_y_train)
pred_jr = model_jr.predict(jr_x_test)
pred_jm = model_jm.predict(jm_x_test)
pred_mr = model_mr.predict(mr_x_test)
pred_mm = model_mm.predict(mm_x_test)

In [80]:
print(f'잠실동 전세 mse: {mean_squared_error(pred_jr,jr_y_test)}, r2_score: {r2_score(pred_jr,jr_y_test)}')
print(f'잠실동 월세 mse: {mean_squared_error(pred_jm,jm_y_test)}, r2_score: {r2_score(pred_jm,jm_y_test)}')
print(f'마천동 전세 mse: {mean_squared_error(pred_mr,mr_y_test)}, r2_score: {r2_score(pred_mr,mr_y_test)}')
print(f'마천동 월세 mse: {mean_squared_error(pred_mm,mm_y_test)}, r2_score: {r2_score(pred_mm,mm_y_test)}')

잠실동 전세 mse: 60839762.00321649, r2_score: 0.9528881430327306
잠실동 월세 mse: 647.9822637644685, r2_score: 0.9107866584582449
마천동 전세 mse: 13417443.166785793, r2_score: 0.9023437266771444
마천동 월세 mse: 10341554.091166344, r2_score: 0.8404299417771969


# 예측 잘 하는지 확인

In [81]:
index = 4     #  이 값을 변경 해보세요
jr_x_test.iloc[[index]]
print(f'실제 값: {jr_y_test.iloc[index]}, 예측 값: {model_jr.predict(jr_x_test.iloc[[index]])}')

실제 값: 16800, 예측 값: [17029.105]


# 비교하기
송파구-잠실동-전세 데이터로만 만든 모델(동별 라벨인코딩 안한 것)과  
송파구-전세 데이터로 만든 모델(동별 라벨 인코딩 한 것)을  
같은 데이터로 비교 해 보자.  

In [87]:
# 라벨인코딩 데이터 만들기.
df_sub = df_rent[df_rent['자치구명'] == '송파구']
df_sub = pd.get_dummies(data=df_sub, columns=['건물용도'])
df_sub = pd.get_dummies(data=df_sub, columns=['법정동명'])
df_sub = df_sub.drop(['자치구명','전월세 구분','임대료(만원)','종전 임대료','신규갱신여부'],1)
df_sub

  df_sub = df_sub.drop(['자치구명','전월세 구분','임대료(만원)','종전 임대료','신규갱신여부'],1)


Unnamed: 0,접수연도,층,임대면적(㎡),보증금(만원),종전 보증금,건물용도_단독다가구,건물용도_아파트,건물용도_연립다세대,건물용도_오피스텔,법정동명_가락동,...,법정동명_문정동,법정동명_방이동,법정동명_삼전동,법정동명_석촌동,법정동명_송파동,법정동명_신천동,법정동명_오금동,법정동명_잠실동,법정동명_장지동,법정동명_풍납동
20,2021,1.0,33.00,13000,7500.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
60,2021,5.0,26.71,38850,37000.0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
76,2021,10.0,58.46,39900,38000.0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
130,2021,5.0,84.99,100000,90000.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
182,2021,28.0,84.98,84000,80000.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888631,2022,14.0,99.38,46200,44000.0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
889063,2022,2.0,40.44,23000,22000.0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
889257,2022,5.0,54.59,24000,23000.0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
889284,2022,4.0,42.75,17000,17000.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [88]:
x_train, x_test, y_train, y_test = train_test_split(df_sub.drop('보증금(만원)',1),df_sub['보증금(만원)'],random_state=0)

  x_train, x_test, y_train, y_test = train_test_split(df_sub.drop('보증금(만원)',1),df_sub['보증금(만원)'],random_state=0)


비교를 위한 새로운 데이터 프레임 만들기!!!!

In [97]:
df_rent = df_rent[df_rent['자치구명'] == '송파구']
df_vs = pd.get_dummies(data=df_rent,columns=['건물용도'])
df_vs = pd.get_dummies(data=df_vs,columns=['법정동명'])
df_vs = df_vs.loc[jr_x_test.index]
df_vs = df_vs.drop(['자치구명','전월세 구분','임대료(만원)','신규갱신여부','종전 임대료',],1)
df_vs

  df_vs = df_vs.drop(['자치구명','전월세 구분','임대료(만원)','신규갱신여부','종전 임대료',],1)


Unnamed: 0,접수연도,층,임대면적(㎡),보증금(만원),종전 보증금,건물용도_단독다가구,건물용도_아파트,건물용도_연립다세대,건물용도_오피스텔,법정동명_가락동,...,법정동명_문정동,법정동명_방이동,법정동명_삼전동,법정동명_석촌동,법정동명_송파동,법정동명_신천동,법정동명_오금동,법정동명_잠실동,법정동명_장지동,법정동명_풍납동
745763,2022,6.0,17.45,21000,20000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
744981,2022,1.0,100.30,48000,38000.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
777874,2022,5.0,18.10,26200,25000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
45442,2021,9.0,84.80,95000,85000.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
819437,2022,1.0,40.00,16800,16000.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2621,2021,9.0,82.61,40000,39000.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
803210,2022,9.0,80.39,50400,48000.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
874036,2022,3.0,84.82,115000,95000.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
205635,2021,5.0,25.74,18000,18000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [103]:
# 이게 원래 송파구의 모든동을 라벨 인코딩 했을때 모델
model = LinearRegression().fit(x_train, y_train)
pred = model.predict(df_vs.drop('보증금(만원)',1))
y_test = df_vs['보증금(만원)']
print(f'잠실동 전세 mse: {mean_squared_error(pred, y_test)}, r2_score: {r2_score(pred,y_test)}')

잠실동 전세 mse: 60820327.077423096, r2_score: 0.9527871134958638


  pred = model.predict(df_vs.drop('보증금(만원)',1))


In [104]:
# 잠실동 모델
print(f'잠실동 전세 mse: {mean_squared_error(pred_jr,jr_y_test)}, r2_score: {r2_score(pred_jr,jr_y_test)}')

잠실동 전세 mse: 60839762.00321649, r2_score: 0.9528881430327306


전세는 차이가 진짜 거의 없다  

그럼 월세를 해보자....ㅠ 지금 새벽 3시ㅠㅠㅠㅠㅠㅠㅠ

In [131]:
# 라벨인코딩 데이터 만들기.
df_sub1 = df_month[df_month['자치구명'] == '송파구']
df_sub1 = pd.get_dummies(data=df_sub1, columns=['건물용도'])
df_sub1 = pd.get_dummies(data=df_sub1, columns=['법정동명'])
df_sub1 = df_sub1.drop(['자치구명','전월세 구분','신규갱신여부'],1)
df_sub1 = df_sub1.dropna()
df_sub1

  df_sub1 = df_sub1.drop(['자치구명','전월세 구분','신규갱신여부'],1)


Unnamed: 0,접수연도,층,임대면적(㎡),보증금(만원),임대료(만원),종전 보증금,종전 임대료,건물용도_단독다가구,건물용도_아파트,건물용도_연립다세대,...,법정동명_문정동,법정동명_방이동,법정동명_삼전동,법정동명_석촌동,법정동명_송파동,법정동명_신천동,법정동명_오금동,법정동명_잠실동,법정동명_장지동,법정동명_풍납동
106,2021,3.0,53.82,26000,3,26000.0,0.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
240,2021,19.0,84.97,20000,202,20000.0,190.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
379,2021,8.0,27.68,6765,127,5000.0,120.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
724,2021,4.0,56.16,3100,50,2000.0,50.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
736,2021,5.0,43.37,13500,20,12500.0,20.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889357,2022,3.0,24.36,0,52,0.0,50.0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
889391,2022,5.0,30.00,1000,82,1000.0,78.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
889407,2022,20.0,35.24,38000,14,38000.0,10.0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
889490,2022,4.0,84.90,60000,75,60000.0,65.0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [132]:
x_train, x_test, y_train, y_test = train_test_split(df_sub1.drop('임대료(만원)',1),df_sub1['임대료(만원)'],random_state=0)

  x_train, x_test, y_train, y_test = train_test_split(df_sub1.drop('임대료(만원)',1),df_sub1['임대료(만원)'],random_state=0)


In [133]:
df_month = df_month[df_month['자치구명'] == '송파구']
df_vs = pd.get_dummies(data=df_month,columns=['건물용도'])
df_vs = pd.get_dummies(data=df_vs,columns=['법정동명'])
df_vs = df_vs.loc[jm_x_test.index]
df_vs = df_vs.drop(['자치구명','전월세 구분','신규갱신여부'],1)
df_vs

  df_vs = df_vs.drop(['자치구명','전월세 구분','신규갱신여부'],1)


Unnamed: 0,접수연도,층,임대면적(㎡),보증금(만원),임대료(만원),종전 보증금,종전 임대료,건물용도_단독다가구,건물용도_아파트,건물용도_연립다세대,...,법정동명_문정동,법정동명_방이동,법정동명_삼전동,법정동명_석촌동,법정동명_송파동,법정동명_신천동,법정동명_오금동,법정동명_잠실동,법정동명_장지동,법정동명_풍납동
824104,2022,21.0,84.99,40000,215,40000.0,200.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
833069,2022,3.0,55.20,5000,75,5000.0,71.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
35017,2021,8.0,116.19,23000,260,28000.0,210.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
145798,2021,14.0,59.88,40000,109,40000.0,100.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
167106,2021,28.0,27.68,47000,5,47000.0,0.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852596,2022,9.0,76.50,5000,136,5000.0,130.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
659310,2022,23.0,164.30,115000,210,90000.0,210.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
704175,2022,12.0,27.68,5000,127,5000.0,120.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
757750,2022,10.0,84.80,49800,140,45000.0,140.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [134]:
x_train.shape, y_train.shape

((2691, 23), (2691,))

In [137]:
# 이게 원래 송파구의 모든동을 라벨 인코딩 했을때 모델
model = LinearRegression().fit(x_train, y_train)
pred = model.predict(df_vs.drop('임대료(만원)',1))
y_test = df_vs['임대료(만원)']
print(f'잠실동 월세 mse: {mean_squared_error(pred, y_test)}, r2_score: {r2_score(pred,y_test)}')

잠실동 월세 mse: 607.328970812282, r2_score: 0.9082556105605546


  pred = model.predict(df_vs.drop('임대료(만원)',1))


In [140]:
# 잠실동 모델
print(f'잠실동 월세 mse: {mean_squared_error(pred_jm,jm_y_test)}, r2_score: {r2_score(pred_jm,jm_y_test)}')

잠실동 월세 mse: 647.9822637644685, r2_score: 0.9107866584582449


# 결론

전세,월세 둘다 mse는 조금 증가 하였고, r2_score도 조금 증가 하였다.  
error값은 증가했고, rw_score성능은 좋아졌다..? 이게 의미가 있다고 할수 있을까...?