In [98]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [99]:
from sklearn.preprocessing import MinMaxScaler , PolynomialFeatures
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [100]:
# Korean text rendering:
# matplotlib's default font does not support Hangul, so the font family
# used by matplotlib has to be switched.
import platform

from matplotlib import font_manager, rc

# Draw negative numbers with the ASCII hyphen instead of the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False

current_os = platform.system()
if current_os == 'Darwin':  # macOS
    rc('font', family='AppleGothic')
elif current_os == 'Windows':  # Windows
    # Look up the family name from the font file and register it.
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # Any other platform keeps the default font; only a notice is printed.
    print('Unknown system...  sorry~~~')

In [101]:
# Paths of the data files matching ./data/final*.
# NOTE(review): load() runs the same glob itself and `data` is not referenced
# in the cells visible here -- confirm whether this cell is still needed.
data=glob('./data/final*')

In [102]:
# Combine the per-year data files into one frame
def load(pattern='./data/final*', encoding='cp949'):
    """Read every CSV file matching ``pattern`` and stack them row-wise.

    Parameters
    ----------
    pattern : str
        Glob pattern of the files to read. Defaults to './data/final*'.
    encoding : str
        Text encoding passed to ``pd.read_csv``. Defaults to 'cp949'.

    Returns
    -------
    pandas.DataFrame
        The frames concatenated along axis 0. Each file keeps its own row
        index, so index labels can repeat across files.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    """
    # glob returns paths in file-system order; sort so the row order of the
    # combined frame is the same on every machine and every run.
    paths = sorted(glob(pattern))
    if not paths:
        raise FileNotFoundError('No data files match pattern: {0}'.format(pattern))
    frames = [pd.read_csv(p, encoding=encoding) for p in paths]
    return pd.concat(frames, axis=0)

In [103]:
# Build the combined frame from all matched data files.
tot=load()

In [104]:
# Preview the first rows of the combined frame.
tot.head()

Unnamed: 0,DAY,TIME,STATION_NAME,GU,UP_POP,DOWN_POP,RAIN,HUMN,SNOW,SKY,ONDO,WINDD,WINDS,PM10,PM25
0,19/01/01,10:00,가락시장,송파구,300,307,0.0,44,0.0,3,-4.9,13,1.6,47,35
1,19/01/01,11:00,가락시장,송파구,377,450,0.0,41,0.0,1,-3.7,13,2.5,40,25
2,19/01/01,12:00,가락시장,송파구,527,498,0.0,47,0.0,3,-2.8,13,2.8,42,28
3,19/01/01,13:00,가락시장,송파구,485,453,0.0,49,0.0,3,-1.5,12,2.7,45,30
4,19/01/01,14:00,가락시장,송파구,562,496,0.0,51,0.0,3,-1.7,13,3.5,42,26


# 전처리 함수

In [105]:
# # 시간대별 지하철 이용객수 그래프
# def visual(df):
#     df2=df.copy()
#     df2=feat(df2)
#     df2['TIME']=[int(i.split(':')[0]) for i in df2['TIME']]
#     pivot=pd.pivot_table(df2,index='TIME',values='TOTAL',aggfunc='sum')

#     sns.barplot(pivot.index,pivot['TOTAL'])

In [106]:
# Create the total-ridership column and the time-band flag column
def feat(dt):
    """Return a copy of ``dt`` with two extra columns.

    TOTAL is UP_POP + DOWN_POP. T is 1 when the hour part of TIME
    (the text before the first ':') is between 17 and 19 inclusive,
    otherwise 0. The input frame is not modified.
    """
    out = dt.copy()
    out['TOTAL'] = out['UP_POP'] + out['DOWN_POP']
    # Parse the hour once per row, then flag the 17-19 band.
    hours = (int(stamp.split(':')[0]) for stamp in out['TIME'])
    out['T'] = [1 if 17 <= hour <= 19 else 0 for hour in hours]
    return out

In [107]:
# Create the station weight column, then drop the columns no longer needed
def weight(dt):
    """Attach a per-station weight W and drop UP_POP / DOWN_POP.

    W is the mean TOTAL of each STATION_NAME, min-max scaled across
    stations. The weight table is merged back onto the rows by
    STATION_NAME. The input frame is not modified.

    NOTE(review): W is computed from TOTAL over every row passed in; if
    TOTAL is later used as a prediction target, confirm that deriving a
    feature from it before any train/test split is intended.
    """
    rows = dt.copy()
    # pivot_table aggregates with the mean by default.
    station_mean = pd.pivot_table(rows, index='STATION_NAME', values='TOTAL')
    scaler = MinMaxScaler()
    station_weight = pd.DataFrame(
        scaler.fit_transform(station_mean),
        index=station_mean.index,
        columns=['W'],
    )
    trimmed = rows.drop(['UP_POP', 'DOWN_POP'], axis=1)
    return pd.merge(trimmed, station_weight, on='STATION_NAME')
    

In [108]:
# Log transform
def log_trans(dt):
    """Return a copy of ``dt`` with log1p applied to every column except ONDO.

    ONDO is left untouched. A KeyError is raised if the frame has no
    ONDO column. The input frame is not modified.
    """
    log_cols = dt.columns.drop('ONDO')
    logged = np.log1p(dt[log_cols])
    result = dt.copy()
    result[log_cols] = logged
    return result
    

In [109]:
# Apply all preprocessing functions
def pre(dt):
    """Run feat -> weight -> log_trans and split the result into X and Y.

    After weight(), the DAY, GU and TIME columns are dropped and
    STATION_NAME becomes the index. Returns (X, Y): X holds every column
    except TOTAL, and Y is a one-column frame holding TOTAL.
    """
    with_totals = feat(dt.copy())
    weighted = weight(with_totals)
    by_station = weighted.drop(['DAY', 'GU', 'TIME'], axis=1).set_index('STATION_NAME')
    logged = log_trans(by_station)
    features = logged.drop('TOTAL', axis=1)
    target = logged[['TOTAL']]
    return features, target

In [110]:
# Run the full preprocessing chain to get the feature table X and target table Y.
X,Y=pre(tot)

In [111]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0_level_0,RAIN,HUMN,SNOW,SKY,ONDO,WINDD,WINDS,PM10,PM25,T,W
STATION_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
가락시장,0.0,3.806662,0.0,1.386294,-4.9,2.639057,0.955511,3.871201,3.583519,0.0,0.245604
가락시장,0.0,3.73767,0.0,0.693147,-3.7,2.639057,1.252763,3.713572,3.258097,0.0,0.245604
가락시장,0.0,3.871201,0.0,1.386294,-2.8,2.639057,1.335001,3.7612,3.367296,0.0,0.245604
가락시장,0.0,3.912023,0.0,1.386294,-1.5,2.564949,1.308333,3.828641,3.433987,0.0,0.245604
가락시장,0.0,3.951244,0.0,1.386294,-1.7,2.639057,1.504077,3.7612,3.295837,0.0,0.245604


In [112]:
# Preview the first rows of the target table.
Y.head()

Unnamed: 0_level_0,TOTAL
STATION_NAME,Unnamed: 1_level_1
가락시장,6.410175
가락시장,6.719013
가락시장,6.933423
가락시장,6.844815
가락시장,6.96508


# 회귀 모델링

In [113]:
def lr_eval(X,Y,method=None,random_state=None):
    """Fit a regression model on an 80/20 split and print train/test errors.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    Y : target table passed to ``train_test_split``.
    method : None or 'Poly'
        None fits a plain ``LinearRegression``. 'Poly' fits a pipeline of
        ``PolynomialFeatures(degree=2)`` followed by ``LinearRegression``.
    random_state : optional
        Forwarded to ``train_test_split``. The default (None) keeps the
        unseeded shuffle; pass an int to get a reproducible split.

    Returns
    -------
    The fitted estimator (the ``LinearRegression`` or the ``Pipeline``).

    Raises
    ------
    ValueError
        If ``method`` is neither None nor 'Poly'.

    Side effects
    ------------
    Rebinds the module-level names X_train, X_test, y_train and y_test to
    the split produced by this call, and prints MSE / MAE for both parts.
    """
    # Kept for compatibility: the split is published as module-level names.
    global X_train, X_test, y_train, y_test

    # Choose the estimator first so an unsupported method fails immediately
    # instead of surfacing later as an unbound-variable error.
    if method is None:
        model = LinearRegression()
    elif method == 'Poly':
        model = Pipeline([('poly',PolynomialFeatures(degree=2)),
                          ('linear',LinearRegression())])
    else:
        raise ValueError("Unsupported method: {0!r} (expected None or 'Poly')".format(method))

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, shuffle=True, random_state=random_state)

    lr = model.fit(X_train, y_train)
    pred = lr.predict(X_train)
    pred_t = lr.predict(X_test)

    print('Train Data MSE : {0}, MAE : {1}'.format(mean_squared_error(y_train,pred),mean_absolute_error(y_train,pred)))
    print('Test Data MSE : {0}, MAE : {1}'.format(mean_squared_error(y_test,pred_t),mean_absolute_error(y_test,pred_t)))
    return lr

In [114]:
lr=lr_eval(X,Y)  # performance of plain linear regression

Train Data MSE : 0.31122011736087657, MAE : 0.42121254206040715
Test Data MSE : 0.31119423685994974, MAE : 0.42122929647393276


In [115]:
lr2=lr_eval(X,Y,method='Poly') # performance of degree-2 polynomial regression

Train Data MSE : 0.2705724566063856, MAE : 0.38956244913749205
Test Data MSE : 0.2708257926395359, MAE : 0.38953001265036363
