In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
from sklearn.preprocessing import MinMaxScaler , PolynomialFeatures
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [3]:
# Korean text rendering:
# matplotlib's default font does not support Hangul, so the matplotlib font
# has to be switched to one that does, chosen here by operating system.
import platform

from matplotlib import font_manager, rc
# Set together with the font change; presumably so minus signs still render
# with the replacement font -- TODO confirm it is needed for these fonts.
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':  # macOS
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':  # Windows
    path = "c:/Windows/Fonts/malgun.ttf"
    # Read the family name out of the font file, then select it by name.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # Any other platform: no font is configured, only this message is printed.
    print('Unknown system...  sorry~~~')

In [4]:
# List the per-year input files under ./data whose names start with "final".
# NOTE(review): no later cell shown here reads `data`; load() runs the same
# glob itself, so this cell looks like leftover exploration.
data=glob('./data/final*')

In [5]:
# Merge the per-year data files into one frame
def load():
    """Read every CSV matching ``./data/final*`` and stack them row-wise.

    Files are read in sorted path order so the row order of the result does
    not depend on the order in which the OS lists the directory. Each file is
    decoded as cp949.

    Returns:
        pandas.DataFrame: the files concatenated along axis 0. Each file keeps
        its own row labels, so the combined index can contain duplicates.

    Raises:
        ValueError: if no file matches the pattern.
    """
    paths = sorted(glob('./data/final*'))
    if not paths:
        # pd.concat would also raise ValueError here, but with a message that
        # says nothing about the missing input files.
        raise ValueError("load(): no files match './data/final*'")
    frames = [pd.read_csv(path, encoding='cp949') for path in paths]
    return pd.concat(frames, axis=0)

In [6]:
tot=load()

In [7]:
tot.head()

Unnamed: 0,DAY,TIME,STATION_NAME,GU,UP_POP,DOWN_POP,RAIN,HUMN,SNOW,SKY,ONDO,WINDD,WINDS,PM10,PM25
0,19/01/01,10:00,가락시장,송파구,300,307,0.0,44,0.0,3,-4.9,13,1.6,47,35
1,19/01/01,11:00,가락시장,송파구,377,450,0.0,41,0.0,1,-3.7,13,2.5,40,25
2,19/01/01,12:00,가락시장,송파구,527,498,0.0,47,0.0,3,-2.8,13,2.8,42,28
3,19/01/01,13:00,가락시장,송파구,485,453,0.0,49,0.0,3,-1.5,12,2.7,45,30
4,19/01/01,14:00,가락시장,송파구,562,496,0.0,51,0.0,3,-1.7,13,3.5,42,26


In [53]:
len(tot['STATION_NAME'].unique())

223

In [66]:
table=pd.pivot_table(tot,index=['STATION_NAME','GU'],values=['UP_POP']).reset_index()

In [70]:
len(table['GU'].unique())

25

# 전처리 함수

In [8]:
# # 시간대별 지하철 이용객수 그래프
# def visual(df):
#     df2=df.copy()
#     df2=feat(df2)
#     df2['TIME']=[int(i.split(':')[0]) for i in df2['TIME']]
#     pivot=pd.pivot_table(df2,index='TIME',values='TOTAL',aggfunc='sum')

#     sns.barplot(pivot.index,pivot['TOTAL'])

In [9]:
# Add the total-ridership column and the time-band flag column
def feat(dt):
    """Return a copy of ``dt`` with ``TOTAL`` and ``T`` columns added.

    ``TOTAL`` is ``UP_POP + DOWN_POP``. ``T`` is 1 when the hour part of
    ``TIME`` (the text before the first ':') is 17, 18 or 19, and 0 otherwise.
    The input frame is not modified.
    """
    out = dt.copy()
    out['TOTAL'] = out['UP_POP'] + out['DOWN_POP']
    # Parse the hour once per row, then flag the 17-19 band.
    hours = [int(stamp.split(':')[0]) for stamp in out['TIME']]
    out['T'] = [1 if 17 <= hour <= 19 else 0 for hour in hours]
    return out

In [10]:
# Create the weight column, then drop the columns that are no longer needed
def weight(dt):
    """Return ``dt`` without ``UP_POP``/``DOWN_POP`` and with a ``W`` column.

    ``W`` is the mean of ``TOTAL`` over all rows that share a
    ``STATION_NAME``, attached to every row of that station by merging on
    ``STATION_NAME``. The value is the raw mean (no scaling is applied).
    The input frame is not modified.
    """
    # Per-station mean of TOTAL (pivot_table's default aggregation), with the
    # value column renamed so it arrives in the merge already called W.
    station_mean = pd.pivot_table(
        dt, index='STATION_NAME', values='TOTAL'
    ).rename(columns={'TOTAL': 'W'})
    trimmed = dt.drop(['UP_POP', 'DOWN_POP'], axis=1)
    return pd.merge(trimmed, station_mean, on='STATION_NAME')
    

In [11]:
# Log transform
def log_trans(dt):
    """Return a copy of ``dt`` with ``log1p`` applied to all columns but ``ONDO``.

    ``ONDO`` is passed through unchanged; the displayed data contains negative
    ``ONDO`` values, for which ``log1p`` is not defined. A ``KeyError`` is
    raised if the frame has no ``ONDO`` column. The input frame is not modified.
    """
    out = dt.copy()
    # Transform everything except ONDO, then write the results back by column name.
    logged = np.log1p(out.drop('ONDO', axis=1))
    out[logged.columns] = logged
    return out
    

In [12]:
# Apply all preprocessing functions
def pre(dt):
    """Run the whole preprocessing chain and split features from the target.

    Steps, in order: ``feat`` (adds TOTAL and T), ``weight`` (adds W, drops
    UP_POP/DOWN_POP), removal of DAY/GU/TIME, re-indexing by STATION_NAME,
    and ``log_trans``. Because ``log_trans`` only skips ONDO, the target
    TOTAL is log1p-transformed as well.

    Returns:
        tuple: ``(X, Y)`` where ``X`` holds every remaining column except
        TOTAL and ``Y`` is a one-column frame containing TOTAL. Both are
        indexed by STATION_NAME.
    """
    prepared = weight(feat(dt.copy()))
    prepared = prepared.drop(['DAY', 'GU', 'TIME'], axis=1).set_index('STATION_NAME')
    prepared = log_trans(prepared)
    features = prepared.drop('TOTAL', axis=1)
    target = prepared[['TOTAL']]
    return features, target

In [13]:
X,Y=pre(tot)

In [14]:
X.head()

Unnamed: 0_level_0,RAIN,HUMN,SNOW,SKY,ONDO,WINDD,WINDS,PM10,PM25,T,W
STATION_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
가락시장,0.0,3.806662,0.0,1.386294,-4.9,2.639057,0.955511,3.871201,3.583519,0.0,7.848104
가락시장,0.0,3.73767,0.0,0.693147,-3.7,2.639057,1.252763,3.713572,3.258097,0.0,7.848104
가락시장,0.0,3.871201,0.0,1.386294,-2.8,2.639057,1.335001,3.7612,3.367296,0.0,7.848104
가락시장,0.0,3.912023,0.0,1.386294,-1.5,2.564949,1.308333,3.828641,3.433987,0.0,7.848104
가락시장,0.0,3.951244,0.0,1.386294,-1.7,2.639057,1.504077,3.7612,3.295837,0.0,7.848104


In [15]:
Y.head()

Unnamed: 0_level_0,TOTAL
STATION_NAME,Unnamed: 1_level_1
가락시장,6.410175
가락시장,6.719013
가락시장,6.933423
가락시장,6.844815
가락시장,6.96508


In [23]:
fin=pd.concat([X,Y],axis=1)

In [24]:
fin.to_csv('fin.csv')

# 회귀 모델링

In [16]:
def lr_eval(X,Y,method=None,random_state=None):
    """Fit a regression model on an 80/20 split and print train/test errors.

    Args:
        X: feature table passed to ``train_test_split``.
        Y: target table passed to ``train_test_split``.
        method: ``None`` for plain ``LinearRegression``; ``'Poly'`` for a
            pipeline of degree-2 ``PolynomialFeatures`` followed by
            ``LinearRegression``.
        random_state: forwarded to ``train_test_split``. The default ``None``
            keeps the original unseeded behaviour; pass an int to make the
            split (and therefore the printed metrics) reproducible.

    Returns:
        The fitted estimator (a ``LinearRegression`` or a ``Pipeline``).

    Raises:
        ValueError: if ``method`` is neither ``None`` nor ``'Poly'``.

    Side effects:
        Rebinds the module-level names ``X_train``, ``X_test``, ``y_train``
        and ``y_test`` to the split made by this call, and prints the
        coefficients (plain linear model only) plus MSE/MAE for both splits.
        The metrics are in whatever scale ``Y`` is in; with the output of
        ``pre`` that is the log1p scale.
    """
    global X_train, X_test, y_train, y_test

    # Reject unsupported methods before any work is done; previously an
    # unknown value fell through both branches and failed later on an
    # unbound `pred`.
    if method is not None and method != 'Poly':
        raise ValueError(
            "lr_eval: unsupported method {0!r}; expected None or 'Poly'".format(method))

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, shuffle=True, random_state=random_state)

    if method is None:
        model = LinearRegression()
    else:
        model = Pipeline([('poly',PolynomialFeatures(degree=2)),
                          ('linear',LinearRegression())])

    lr = model.fit(X_train,y_train)
    if method is None:
        # Only the plain model exposes coef_ directly on the estimator.
        print(lr.coef_)

    pred = lr.predict(X_train)
    pred_t = lr.predict(X_test)

    print('Train Data MSE : {0}, MAE : {1}'.format(mean_squared_error(y_train,pred),mean_absolute_error(y_train,pred)))
    print('Test Data MSE : {0}, MAE : {1}'.format(mean_squared_error(y_test,pred_t),mean_absolute_error(y_test,pred_t)))

    return lr

In [17]:
lr=lr_eval(X,Y)  # 일반 선형 성능

[[ 0.01662202 -0.24922362  0.28697457  0.09116823  0.00438368  0.0286766
  -0.01897228  0.03766752  0.01271825  0.7246946   1.0341881 ]]
Train Data MSE : 0.26201158464301877, MAE : 0.3816141805680984
Test Data MSE : 0.26139047362832624, MAE : 0.3813494109249746


In [18]:
lr2=lr_eval(X,Y,method='Poly') # 2차 다항회귀 성능

Train Data MSE : 0.25651130030614455, MAE : 0.3782123513426095
Test Data MSE : 0.2562667878876376, MAE : 0.37775360988905693


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)

In [20]:
# Manual version of the degree-2 polynomial regression: expand the training
# features, then fit a plain LinearRegression on the expanded matrix.
# `poly` is the transformed training matrix, not the transformer, so the
# fitted PolynomialFeatures object is not kept after this line.
poly = PolynomialFeatures(degree=2).fit_transform(X_train,y_train)
# This rebinds `lr2`, replacing the pipeline returned earlier by
# lr_eval(X, Y, method='Poly').
lr2 = LinearRegression()
lr2.fit(poly,y_train)

LinearRegression()

In [21]:
 print(lr2.coef_)

[[-7.28652902e-14 -1.85344113e+00  9.77311223e-01  1.29939455e+01
  -1.42199497e+00  4.24636297e-02  2.42577481e-02 -7.84185557e-01
  -5.87210650e-01 -1.49315460e-02  6.52519366e+05  1.51004628e+00
  -4.70799584e-02  3.40262729e-01 -9.82526900e-01  2.17883716e-01
  -5.58031313e-03  3.51501378e-02 -4.59348943e-02 -8.14768745e-02
   5.97966272e-02 -4.26867124e-02  4.00957048e-02 -2.06362266e-01
  -2.09780937e+00  1.92295223e-01 -4.46350065e-03 -4.40780395e-02
   1.06092111e-01  1.26370072e-01 -3.22114795e-02  1.61568992e-01
  -3.29794998e-02  9.84464949e-01 -1.51687661e+00  4.80405581e-02
   1.09349736e+00 -2.03538569e+00 -2.32718470e-01 -1.51131079e-01
   5.22717338e-01  5.12602829e-02  2.95463697e-01 -1.19819235e-03
   2.37911371e-02  4.39738010e-03 -7.04287018e-02  2.13294004e-02
  -9.00265423e-03  3.61471475e-02 -1.22908440e-04  8.24016793e-04
  -3.96360426e-03 -1.70339831e-03  9.08182278e-04 -3.23114131e-03
  -1.17156166e-03  6.19942049e-02  7.37560681e-03 -1.14356527e-02
   9.61674

In [25]:
import pickle

In [26]:
save_lr = pickle.dumps(lr2)

In [27]:
load_lr = pickle.loads(save_lr)

In [28]:
load_lr

LinearRegression()

In [31]:
import joblib
# Save the model to a file.
# Note: this persists `lr` (the model returned by lr_eval(X, Y)), not `lr2`.
# NOTE(review): 'weater' looks like a typo for 'weather'; the load cell uses
# the same file name, so rename both together if it is corrected.
joblib.dump(lr, 'weater_lr.pkl')

['weater_lr.pkl']

In [32]:
# Load the model back from the file written by joblib.dump
lr_load_joblib = joblib.load('weater_lr.pkl')

In [33]:
np.expm1(lr_load_joblib.predict(X_test))

array([[ 962.62696996],
       [3642.07035984],
       [2303.9896469 ],
       ...,
       [1121.97361226],
       [ 562.41808794],
       [4196.70805569]])

In [34]:
X_test

Unnamed: 0_level_0,RAIN,HUMN,SNOW,SKY,ONDO,WINDD,WINDS,PM10,PM25,T,W
STATION_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
옥수,0.0,3.295837,0.0,0.693147,18.9,2.564949,1.481605,3.850148,3.433987,0.000000,6.994415
을지로3가,0.0,3.555348,0.0,0.693147,11.2,2.639057,1.481605,3.637586,3.135494,0.693147,7.899150
명동,0.0,3.258097,0.0,1.609438,8.7,2.564949,1.029619,3.583519,3.295837,0.000000,7.794196
망원,0.0,3.891820,0.0,0.693147,-0.5,2.564949,0.788457,4.060443,3.663562,0.000000,7.317906
도림천,0.0,4.262680,0.0,0.693147,20.8,2.639057,1.223775,3.367296,3.295837,0.000000,6.250647
...,...,...,...,...,...,...,...,...,...,...,...
양재,0.0,4.262680,0.0,1.609438,30.0,2.639057,0.916291,2.890372,2.397895,0.693147,8.075208
청담,0.0,3.931826,0.0,1.386294,11.5,1.098612,1.740466,2.302585,2.197225,0.000000,7.421998
장승배기,0.0,3.988984,0.0,1.609438,5.3,0.000000,0.182322,4.343805,4.094345,0.000000,7.307483
명일,0.0,4.127134,0.0,0.693147,9.1,2.397895,0.741937,3.526361,2.708050,0.000000,6.729128


In [35]:
np.expm1(y_test)

Unnamed: 0_level_0,TOTAL
STATION_NAME,Unnamed: 1_level_1
옥수,977.0
을지로3가,652.0
명동,4264.0
망원,1410.0
도림천,30.0
...,...
양재,4893.0
청담,1485.0
장승배기,3218.0
명일,403.0


In [36]:
tot.reset_index(inplace=True, drop=True)

In [37]:
tot

Unnamed: 0,DAY,TIME,STATION_NAME,GU,UP_POP,DOWN_POP,RAIN,HUMN,SNOW,SKY,ONDO,WINDD,WINDS,PM10,PM25
0,19/01/01,10:00,가락시장,송파구,300,307,0.0,44,0.0,3,-4.9,13,1.6,47,35
1,19/01/01,11:00,가락시장,송파구,377,450,0.0,41,0.0,1,-3.7,13,2.5,40,25
2,19/01/01,12:00,가락시장,송파구,527,498,0.0,47,0.0,3,-2.8,13,2.8,42,28
3,19/01/01,13:00,가락시장,송파구,485,453,0.0,49,0.0,3,-1.5,12,2.7,45,30
4,19/01/01,14:00,가락시장,송파구,562,496,0.0,51,0.0,3,-1.7,13,3.5,42,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3177253,21/12/31,18:00,효창공원앞,용산구,553,541,0.0,32,0.0,1,-5.9,10,2.2,23,9
3177254,21/12/31,19:00,효창공원앞,용산구,235,337,0.0,34,0.0,1,-6.4,11,2.2,27,8
3177255,21/12/31,20:00,효창공원앞,용산구,165,194,0.0,35,0.0,1,-6.3,14,4.1,26,11
3177256,21/12/31,21:00,효창공원앞,용산구,179,295,0.0,36,0.0,1,-6.7,14,4.8,26,11


In [38]:
test = weight(feat(tot))

In [39]:
station = test['STATION_NAME'].unique()

In [40]:
# Unique weights and station names, kept as separate arrays for any later
# cell that reads them. They are NOT paired by position below: if two
# stations happened to share the same W, the two arrays would differ in
# length and the values would no longer line up.
W = test['W'].unique()
station = test['STATION_NAME'].unique()

# One row per station, indexed by STATION_NAME, holding that station's W.
# W is constant within a station (weight() merges one mean per station), so
# the first row of each station is enough.
station_W = (
    test.drop_duplicates(subset='STATION_NAME')
        .set_index('STATION_NAME')[['W']]
)

In [41]:
# Build a STATION_NAME -> GU lookup.
# Index the rows by GU so that GU survives the column selection below.
goo = test.set_index('GU')

# Keep one row per distinct STATION_NAME (only the column value is compared).
goo_list = goo[['STATION_NAME']].drop_duplicates()

# Move GU back to a column and index the lookup by STATION_NAME.
goo_list = goo_list.reset_index(drop=False).set_index('STATION_NAME')

# Attach GU to station_W; both are indexed by STATION_NAME at this point.
station_W['GU'] = goo_list
# Restore STATION_NAME as an ordinary column and name the columns positionally.
station_W.reset_index(inplace=True)
station_W.columns = ['STATION_NAME', 'W', 'GU']

In [42]:
station_W.to_csv('./data/station_W.csv')

In [43]:
station_W[station_W['GU'] == '강남구'].set_index('STATION_NAME',drop=False)[['W']]

Unnamed: 0_level_0,W
STATION_NAME,Unnamed: 1_level_1
강남,7786.252106
강남구청,1422.483296
논현,1605.713925
대청,1010.055376
대치,1237.350856
도곡,538.156794
매봉,971.201221
삼성,4250.175042
선릉,4223.16276
수서,1713.579239


In [44]:
tot

Unnamed: 0,DAY,TIME,STATION_NAME,GU,UP_POP,DOWN_POP,RAIN,HUMN,SNOW,SKY,ONDO,WINDD,WINDS,PM10,PM25
0,19/01/01,10:00,가락시장,송파구,300,307,0.0,44,0.0,3,-4.9,13,1.6,47,35
1,19/01/01,11:00,가락시장,송파구,377,450,0.0,41,0.0,1,-3.7,13,2.5,40,25
2,19/01/01,12:00,가락시장,송파구,527,498,0.0,47,0.0,3,-2.8,13,2.8,42,28
3,19/01/01,13:00,가락시장,송파구,485,453,0.0,49,0.0,3,-1.5,12,2.7,45,30
4,19/01/01,14:00,가락시장,송파구,562,496,0.0,51,0.0,3,-1.7,13,3.5,42,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3177253,21/12/31,18:00,효창공원앞,용산구,553,541,0.0,32,0.0,1,-5.9,10,2.2,23,9
3177254,21/12/31,19:00,효창공원앞,용산구,235,337,0.0,34,0.0,1,-6.4,11,2.2,27,8
3177255,21/12/31,20:00,효창공원앞,용산구,165,194,0.0,35,0.0,1,-6.3,14,4.1,26,11
3177256,21/12/31,21:00,효창공원앞,용산구,179,295,0.0,36,0.0,1,-6.7,14,4.8,26,11


In [45]:
# NOTE(review): `test_garock` is not defined in any cell shown here, so this
# cell raises NameError on a fresh kernel (see the recorded output), and the
# later cells that read `result` fail as well. Presumably it was a feature
# frame for a single station -- TODO define it or remove these cells.
result = lr_load_joblib.predict(test_garock)

NameError: name 'test_garock' is not defined

In [47]:
test_tot = test['TOTAL']

In [48]:
import itertools

In [49]:
result = list(itertools.chain(*result))

NameError: name 'result' is not defined

In [None]:
(test_tot-result).min()

In [None]:
# Look up the rows for one station at one time on one day.
# The three conditions are combined into a single boolean mask: chaining
# tot[...][...][...] applies masks built from the full frame to an already
# filtered frame, which makes pandas re-align the mask on every step.
tot.loc[
    (tot['STATION_NAME'] == '학여울')
    & (tot['TIME'] == '18:00')
    & (tot['DAY'] == '21/11/06')
]