In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib

# Use the NanumSquare font for plots.
# NOTE(review): this is a machine-specific absolute path; on a machine without
# the file the lookup fails, so fall back to matplotlib's default font instead
# of aborting the whole notebook.
font_path = r'/Users/jihyunlee/Library/Fonts/NanumSquareR.ttf'
try:
    font_name = fm.FontProperties(fname=font_path).get_name()
except (OSError, RuntimeError):
    font_name = None
    print("Could not load font file, keeping matplotlib's default font: " + font_path)
else:
    matplotlib.rc('font', family=font_name)

In [None]:
# Load the station feature table (later joined to ridership on its "station" column).
# The path is relative to the notebook's working directory.
subway_data = pd.read_csv("subway_data1.csv")

In [None]:
# "snack" is the element-wise sum of the "toast" and "sandwich" columns; the
# two source columns plus "pop" and "school_hu" are then removed.
# NOTE: this cell is not re-runnable on its own because it drops its own inputs.
subway_data["snack"] = subway_data["toast"].add(subway_data["sandwich"])
subway_data = subway_data.drop(columns=["pop", "sandwich", "toast", "school_hu"])

In [None]:
# Daily ridership tables, one CSV per year (file names carry the two-digit
# year, 17 through 21). The files are decoded as cp949 and the paths are
# relative to the notebook's working directory.
y2017 = pd.read_csv("17년_일별.csv", encoding="cp949")
y2018 = pd.read_csv("18년_일별.csv", encoding="cp949")
y2019 = pd.read_csv("19년_일별.csv", encoding="cp949")
y2020 = pd.read_csv("20년_일별.csv", encoding="cp949")
y2021 = pd.read_csv("21년_일별.csv", encoding="cp949")

In [None]:
# train: stack the 2017 and 2018 ridership tables, then left-join the station
# features (ridership key "역명" against feature key "station").
bf_y_train = pd.concat((y2017, y2018), axis=0)
bf_train = bf_y_train.merge(subway_data, how="left", left_on="역명", right_on="station")

In [None]:
# Summary statistics of log(1 + bank), to inspect the transformed distribution.
np.log1p(bf_train["bank"]).describe()

In [None]:
# Build the pre-COVID training frame: numeric target, log-scaled features,
# one dummy column per station and a day-of-year feature.
# Whole-column (vectorized) operations replace the original per-row lambdas.
bf_train = bf_train.drop("역명", axis=1)  # join key from the ridership side; "station" is kept for now
bf_train["이용자 수"] = bf_train["이용자 수"].astype(float)
bf_train["날짜"] = pd.to_datetime(bf_train["날짜"])

bf_train["lagingindex"] = np.log(bf_train["agingindex"])
bf_train["l이용자수"] = np.log1p(bf_train["이용자 수"])  # regression target: log(1 + ridership)
bf_train["lbank"] = np.log1p(bf_train["bank"])
bf_train = bf_train.drop(["agingindex", "이용자 수", "bank"], axis=1)
# An explicit float dtype keeps the design matrix purely numeric regardless of
# the dummy dtype pandas would pick by default.
bf_train = pd.concat([bf_train, pd.get_dummies(bf_train["station"], dtype=float)], axis=1)
bf_train["date"] = bf_train["날짜"].dt.dayofyear  # day of the year, 1..366
bf_train = bf_train.drop(["station","날짜"], axis=1)

In [None]:
#test
# Build the pre-COVID hold-out frame from the 2019 ridership table with the
# same steps as the training frame. Whole-column (vectorized) operations
# replace the original per-row lambdas.
bf_test = pd.merge(y2019,subway_data, left_on="역명", right_on="station", how="left")

bf_test = bf_test.drop("역명", axis=1)
bf_test["이용자 수"] = bf_test["이용자 수"].astype(float)
bf_test["날짜"] = pd.to_datetime(bf_test["날짜"])

bf_test["lagingindex"] = np.log(bf_test["agingindex"])
# log1p rather than log: this is the transform used for the training target,
# it is the inverse of the exp(pred) - 1 applied to predictions, and it stays
# finite when a ridership count is zero.
bf_test["l이용자수"] = np.log1p(bf_test["이용자 수"])
bf_test["lbank"] = np.log1p(bf_test["bank"])
bf_test = bf_test.drop(["agingindex", "이용자 수", "bank"], axis=1)
# An explicit float dtype keeps the design matrix purely numeric regardless of
# the dummy dtype pandas would pick by default.
bf_test = pd.concat([bf_test, pd.get_dummies(bf_test["station"], dtype=float)], axis=1)
bf_test["date"] = bf_test["날짜"].dt.dayofyear  # day of the year, 1..366
bf_test = bf_test.drop(["station","날짜"], axis=1)

In [None]:
# Evaluation metric
def rmsle(y, y_,convertExp=True):
    """Root mean squared logarithmic error between two collections of values.

    Parameters
    ----------
    y, y_ : array-like of numbers
        The two sets of values to compare (for example actual and predicted).
        Both are flattened to 1-D before comparison.
    convertExp : bool, default True
        When True, both inputs are passed through ``np.exp`` before the
        error is computed.

    Returns
    -------
    float
        ``sqrt(mean((log(y + 1) - log(y_ + 1)) ** 2))``. Non-finite log
        values are replaced via ``np.nan_to_num`` before differencing.
    """
    # Plain 1-D float arrays: drops any pandas index and avoids accidental
    # broadcasting between differently shaped inputs.
    y = np.asarray(y, dtype=float).ravel()
    y_ = np.asarray(y_, dtype=float).ravel()
    if convertExp:
        # The original line ended with a stray comma, which wrapped `y` in a
        # 1-tuple instead of leaving it as an array.
        y = np.exp(y)
        y_ = np.exp(y_)
    log1 = np.nan_to_num(np.log(y + 1))
    log2 = np.nan_to_num(np.log(y_ + 1))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))

from sklearn import metrics
# rmsle is an error (smaller is better), so the scorer must be flagged with
# greater_is_better=False; otherwise a model search would favour the largest
# error. This matches how the scorer is built in the model cells.
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False)

# 1. 코로나 이전 - 단일 모델

In [None]:
#변수 변환전 plot

In [None]:
# import seaborn as sns
# col1 = ['날짜', 'station', 'agingindex', 'bank', 'bar', 'bus_stop','이용자 수']
# col2 = ['bus_terminal', 'dep_store', 'exit', 'hospital', 'library','이용자 수']
# col3 = ['oliveyoung', 'school_emhu', 'ssm', 'starbucks', 'theatre', 'snack','이용자 수']
# sns.pairplot(bf_train[col1])
# plt.tight_layout()
# plt.show() 

In [None]:
# sns.pairplot(bf_train[col2])

In [None]:
# sns.pairplot(bf_train[col3])

## Ridge

In [None]:
# Split the training frame into the target ("l이용자수") and the feature
# matrix made of every other column.
bf_train_y = bf_train["l이용자수"]
bf_train_X = bf_train.loc[:, bf_train.columns.difference(["l이용자수"])]

In [None]:
from sklearn.linear_model import Ridge
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Tune the Ridge penalty with 5-fold cross-validation on the pre-COVID
# training data.
ridge = Ridge()
# Candidate penalties are the reciprocals of this grid (100 down to 1e-4).
alpha  = 1/np.array([0.01,0.1, 1, 2, 3, 4, 10, 30,100,200,300,400,800,900,1000,10000])
ridge_params = {'alpha':alpha}
# rmsle is an error, so the scorer is flagged as lower-is-better.
rmsle_scorer = metrics.make_scorer(rmsle,greater_is_better=False)
grid_ridge = GridSearchCV(ridge,ridge_params,scoring=rmsle_scorer,cv=5)
grid_ridge.fit(bf_train_X, bf_train_y)

# The model is fitted on log1p(ridership). Undo the transform on the
# predictions AND on the stored target so both sides of the metric are on the
# count scale; the original compared count-scale predictions with the
# log-scale target.
pred = np.expm1(grid_ridge.predict(bf_train_X))
print('RMSLE Value For Ridge Regression : %.4f' % (rmsle(pred, np.expm1(bf_train_y), False)))
print("\n")

In [None]:
# Hold-out features/target, split the same way as the training frame.
bf_test_X = bf_test[bf_test.columns.difference(["l이용자수"])]
bf_test_y = bf_test["l이용자수"]

# Score on the count scale: back-transform both the predictions and the stored
# log-scale target (the original compared count-scale predictions with the
# log-scale target).
# NOTE(review): expm1 inverts log1p, the transform used for the training
# target; confirm the test target is built with the same transform.
pred = np.expm1(grid_ridge.predict(bf_test_X))
print('RMSLE Value For Ridge Regression : %.4f' % (rmsle(pred, np.expm1(bf_test_y), False)))
print("\n")

## Lasso

In [None]:
from sklearn.linear_model import Lasso

# Tune the Lasso penalty with 5-fold cross-validation on the pre-COVID
# training data. The scorer is the lower-is-better rmsle_scorer built in the
# Ridge cell.
lasso = Lasso()
# Candidate penalties are the reciprocals of this grid (100 down to 0.0025).
alpha  = 1/np.array([0.01,0.1, 1, 2, 3, 4, 10, 30,100,200,300,400])
lasso_params_ = {'alpha':alpha, 'tol': [0.001]}

grid_lasso = GridSearchCV( lasso,lasso_params_,scoring = rmsle_scorer,cv=5)
grid_lasso.fit(bf_train_X, bf_train_y)
pred1 = grid_lasso.predict(bf_train_X)  # predictions on the log1p scale
print("<Lasso hyperparameter>")
print (grid_lasso.best_params_)

# Back-transform predictions and the log1p target to counts before scoring;
# the original compared count-scale predictions with the log-scale target.
pred = np.expm1(pred1)

print('RMSLE Value For Lasso Regression: %.4f ' % rmsle(pred, np.expm1(bf_train_y), False))

In [None]:
# Hold-out score for the Lasso model (the original label said "Ridge").
# Both the predictions and the stored log-scale target are back-transformed so
# the metric compares counts with counts.
# NOTE(review): expm1 inverts log1p, the transform used for the training
# target; confirm the test target is built with the same transform.
pred = np.expm1(grid_lasso.predict(bf_test_X))
print('RMSLE Value For Lasso Regression : %.4f' % (rmsle(pred, np.expm1(bf_test_y), False)))
print("\n")

## Poisson GLM

In [None]:
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Poisson

# Fit a Poisson-family GLM on the pre-COVID training data and show its summary.
# NOTE(review): the response passed here is the log1p-transformed ridership,
# not raw counts - confirm that a Poisson family is intended.
glm = GLM(endog=bf_train_y, exog=bf_train_X, family=Poisson())
pois_glm = glm.fit()
pois_glm.summary()

In [None]:
# Hold-out score for the Poisson GLM (the original label said "Ridge").
# predict() returns the fitted mean on the scale of the response the model was
# trained on, i.e. the log1p scale; back-transform it and the stored target so
# the metric compares counts with counts.
# NOTE(review): expm1 inverts log1p, the transform used for the training
# target; confirm the test target is built with the same transform.
pred = np.expm1(pois_glm.predict(bf_test_X))
print('RMSLE Value For Poisson GLM : %.4f' % (rmsle(pred, np.expm1(bf_test_y), False)))
print("\n")

# 2. 코로나 이후 - 단일모델

In [None]:
#train
import datetime
# Split 2021 at 1 Feb 2021: rows before the cut-off go to y2021_1, rows on or
# after it to y2021_2.
y2021["날짜"] = pd.to_datetime(y2021["날짜"])
criteria1 = datetime.datetime(year=2021, month=2, day=1)
y2021_1 = y2021.loc[y2021["날짜"] < criteria1]
y2021_2 = y2021.loc[y2021["날짜"] >= criteria1]

# Keep only the 2020 rows dated 1 Feb 2020 or later.
y2020["날짜"] = pd.to_datetime(y2020["날짜"])
criteria2 = datetime.datetime(year=2020, month=2, day=1)
y2020 = y2020.loc[y2020["날짜"] >= criteria2]



In [None]:
# Post-COVID training rows: filtered 2020 plus the early-2021 slice, left-joined
# with the station features (ridership key "역명" against feature key "station").
af_y_train = pd.concat((y2020, y2021_1), axis=0)
af_train = af_y_train.merge(subway_data, how="left", left_on="역명", right_on="station")

In [None]:
#train
# Build the post-COVID training frame: numeric target, log-scaled features,
# one dummy column per station and a day-of-year feature.
# Whole-column (vectorized) operations replace the original per-row lambdas.
af_train = af_train.drop("역명", axis=1)
af_train["이용자 수"] = af_train["이용자 수"].astype(float)
af_train["날짜"] = pd.to_datetime(af_train["날짜"])

af_train["lagingindex"] = np.log(af_train["agingindex"])
af_train["l이용자수"] = np.log1p(af_train["이용자 수"])  # regression target: log(1 + ridership)
af_train["lbank"] = np.log1p(af_train["bank"])
af_train = af_train.drop(["agingindex", "이용자 수", "bank"], axis=1)
# An explicit float dtype keeps the design matrix purely numeric regardless of
# the dummy dtype pandas would pick by default.
af_train = pd.concat([af_train, pd.get_dummies(af_train["station"], dtype=float)], axis=1)
af_train["date"] = af_train["날짜"].dt.dayofyear  # day of the year, 1..366
af_train = af_train.drop(["station","날짜"], axis=1)

In [None]:
#test
# Build the hold-out frame for the post-COVID models. Whole-column
# (vectorized) operations replace the original per-row lambdas.
# NOTE(review): this uses the 2019 table as the test set, while y2021_2 (2021
# rows from 1 Feb onward) is created in the split cell and not used in this
# section - confirm which one is intended.
af_test = pd.merge(y2019,subway_data, left_on="역명", right_on="station", how="left")

af_test = af_test.drop("역명", axis=1)
af_test["이용자 수"] = af_test["이용자 수"].astype(float)
af_test["날짜"] = pd.to_datetime(af_test["날짜"])

af_test["lagingindex"] = np.log(af_test["agingindex"])
# log1p rather than log: this is the transform used for the training target,
# it is the inverse of the exp(pred) - 1 applied to predictions, and it stays
# finite when a ridership count is zero.
af_test["l이용자수"] = np.log1p(af_test["이용자 수"])
af_test["lbank"] = np.log1p(af_test["bank"])
af_test = af_test.drop(["agingindex", "이용자 수", "bank"], axis=1)
# An explicit float dtype keeps the design matrix purely numeric regardless of
# the dummy dtype pandas would pick by default.
af_test = pd.concat([af_test, pd.get_dummies(af_test["station"], dtype=float)], axis=1)
af_test["date"] = af_test["날짜"].dt.dayofyear  # day of the year, 1..366
af_test = af_test.drop(["station","날짜"], axis=1)

In [None]:
# Split the post-COVID training frame into the target ("l이용자수") and the
# feature matrix made of every other column.
af_train_y = af_train["l이용자수"]
af_train_X = af_train.loc[:, af_train.columns.difference(["l이용자수"])]

In [None]:
# Display the feature column labels.
# NOTE(review): this shows the pre-COVID matrix (bf_train_X) inside the
# post-COVID section - confirm af_train_X was not the intended frame.
bf_train_X.columns

## Ridge

In [None]:

# Tune the Ridge penalty with 5-fold cross-validation on the post-COVID
# training data.
ridge = Ridge()
# Candidate penalties are the reciprocals of this grid (100 down to 1e-4).
alpha  = 1/np.array([0.01,0.1, 1, 2, 3, 4, 10, 30,100,200,300,400,800,900,1000,10000])
ridge_params = {'alpha':alpha}
# rmsle is an error, so the scorer is flagged as lower-is-better.
rmsle_scorer = metrics.make_scorer(rmsle,greater_is_better=False)
grid_ridge = GridSearchCV(ridge,ridge_params,scoring=rmsle_scorer,cv=5)
grid_ridge.fit(af_train_X, af_train_y)

pred = grid_ridge.predict(af_train_X)  # predictions on the log1p scale
print("<Ridge hyperparameter for after Covid-19>")
print(grid_ridge.best_params_)

# Back-transform predictions and the log1p target to counts before scoring;
# the original compared count-scale predictions with the log-scale target.
pred = np.expm1(pred)
print('RMSLE Value For Ridge Regression : %.4f' % (rmsle(pred, np.expm1(af_train_y), False)))
print("\n")

In [None]:
# Hold-out features/target, split the same way as the training frame.
af_test_X = af_test[af_test.columns.difference(["l이용자수"])]
af_test_y = af_test["l이용자수"]

# Score on the count scale: back-transform both the predictions and the stored
# log-scale target (the original compared count-scale predictions with the
# log-scale target).
# NOTE(review): expm1 inverts log1p, the transform used for the training
# target; confirm the test target is built with the same transform.
pred = np.expm1(grid_ridge.predict(af_test_X))
print('RMSLE Value For Ridge Regression : %.4f' % (rmsle(pred, np.expm1(af_test_y), False)))
print("\n")

## Lasso

In [None]:

# Tune the Lasso penalty with 5-fold cross-validation on the post-COVID
# training data. The scorer is the lower-is-better rmsle_scorer built in the
# Ridge cell.
lasso = Lasso()
# Candidate penalties are the reciprocals of this grid (100 down to 0.0025).
alpha  = 1/np.array([0.01,0.1, 1, 2, 3, 4, 10, 30,100,200,300,400])
lasso_params_ = {'alpha':alpha, 'tol': [0.001]}

grid_lasso = GridSearchCV( lasso,lasso_params_,scoring = rmsle_scorer,cv=5)
grid_lasso.fit(af_train_X, af_train_y)
pred1 = grid_lasso.predict(af_train_X)  # predictions on the log1p scale
print("<Lasso hyperparameter for after covid-19>")
print (grid_lasso.best_params_)

# Back-transform predictions and the log1p target to counts before scoring;
# the original compared count-scale predictions with the log-scale target.
pred = np.expm1(pred1)

print('RMSLE Value For Lasso Regression: %.4f ' % rmsle(pred, np.expm1(af_train_y), False))

In [None]:
# Hold-out score for the post-COVID Lasso model (the original label said
# "Ridge"). Both the predictions and the stored log-scale target are
# back-transformed so the metric compares counts with counts.
# NOTE(review): expm1 inverts log1p, the transform used for the training
# target; confirm the test target is built with the same transform.
pred = np.expm1(grid_lasso.predict(af_test_X))
print('RMSLE Value For Lasso Regression : %.4f' % (rmsle(pred, np.expm1(af_test_y), False)))
print("\n")

## Poisson GLM

In [None]:
# Fit a Poisson-family GLM on the post-COVID training data and show its summary.
# NOTE(review): the response passed here is the log1p-transformed ridership,
# not raw counts - confirm that a Poisson family is intended.
glm = GLM(endog=af_train_y, exog=af_train_X, family=Poisson())
pois_glm = glm.fit()
pois_glm.summary()

In [None]:
# Hold-out score for the post-COVID Poisson GLM (the original label said
# "Ridge"). predict() returns the fitted mean on the scale of the response the
# model was trained on, i.e. the log1p scale; back-transform it and the stored
# target so the metric compares counts with counts.
# NOTE(review): expm1 inverts log1p, the transform used for the training
# target; confirm the test target is built with the same transform.
pred = np.expm1(pois_glm.predict(af_test_X))
print('RMSLE Value For Poisson GLM : %.4f' % (rmsle(pred, np.expm1(af_test_y), False)))
print("\n")

# 3. 군집분석

In [None]:
import hdbscan
# Cluster the stations with HDBSCAN (minimum cluster size 2) on every column
# except the first, and append the labels as a "clustering" column.
# NOTE(review): column 0 is presumably the station name - verify. Re-running
# this cell would feed the previous "clustering" column back into the fit.
hdb = hdbscan.HDBSCAN(min_cluster_size=2)

y_hdb = pd.Series(
    hdb.fit_predict(subway_data.iloc[:, 1:]),
    index=subway_data.index,
    name="clustering",
)
subway_data = pd.concat([subway_data, y_hdb], axis=1)

## (1) 코로나 이전

In [None]:
# Rebuild the pre-COVID training rows (2017 + 2018) and left-join the station
# features, which now also carry the "clustering" label.
bf_y_train = pd.concat((y2017, y2018), axis=0)
bf_train = bf_y_train.merge(subway_data, how="left", left_on="역명", right_on="station")

In [None]:
# Build the pre-COVID training frame for the cluster-level models: numeric
# target, log-scaled features and a day-of-year feature (no station dummies).
# Whole-column (vectorized) operations replace the original per-row lambdas.
bf_train = bf_train.drop("역명", axis=1)
bf_train["이용자 수"] = bf_train["이용자 수"].astype(float)
bf_train["날짜"] = pd.to_datetime(bf_train["날짜"])

bf_train["lagingindex"] = np.log(bf_train["agingindex"])
bf_train["l이용자수"] = np.log1p(bf_train["이용자 수"])  # regression target: log(1 + ridership)
bf_train["lbank"] = np.log1p(bf_train["bank"])
bf_train = bf_train.drop(["agingindex", "이용자 수", "bank"], axis=1)
bf_train["date"] = bf_train["날짜"].dt.dayofyear  # day of the year, 1..366
bf_train = bf_train.drop(["station","날짜"], axis=1)

In [None]:
# Collapse the training rows to one row per cluster (column means), then swap
# the "clustering" label column for one dummy column per cluster.
# NOTE(review): the dummy columns are named by the raw cluster labels, so the
# frame mixes non-string and string column names - check that the downstream
# estimators accept that.
bf_train_clustering = bf_train.groupby("clustering").mean().reset_index()
bf_train_clustering = pd.concat(
    [
        bf_train_clustering.drop(columns="clustering"),
        pd.get_dummies(bf_train_clustering["clustering"]),
    ],
    axis=1,
)



In [None]:
#test
# Build the 2019 hold-out frame for the cluster-level models with the same
# steps as the training frame. Whole-column (vectorized) operations replace
# the original per-row lambdas.
bf_test = pd.merge(y2019,subway_data, left_on="역명", right_on="station", how="left")
bf_test = bf_test.drop("역명", axis=1)
bf_test["이용자 수"] = bf_test["이용자 수"].astype(float)
bf_test["날짜"] = pd.to_datetime(bf_test["날짜"])

bf_test["lagingindex"] = np.log(bf_test["agingindex"])
# log1p rather than log: this is the transform used for the training target,
# it is the inverse of the exp(pred) - 1 applied to predictions, and it stays
# finite when a ridership count is zero.
bf_test["l이용자수"] = np.log1p(bf_test["이용자 수"])
bf_test["lbank"] = np.log1p(bf_test["bank"])
bf_test = bf_test.drop(["agingindex", "이용자 수", "bank"], axis=1)
bf_test["date"] = bf_test["날짜"].dt.dayofyear  # day of the year, 1..366
bf_test = bf_test.drop(["station","날짜"], axis=1)

In [None]:
# Collapse the hold-out rows to one row per cluster (column means), then swap
# the "clustering" label column for one dummy column per cluster.
bf_test_clustering = bf_test.groupby("clustering").mean().reset_index()
bf_test_clustering = pd.concat(
    [
        bf_test_clustering.drop(columns="clustering"),
        pd.get_dummies(bf_test_clustering["clustering"]),
    ],
    axis=1,
)

In [None]:
# Cluster-level training target ("l이용자수") and feature matrix (all other columns).
bf_train_c_y = bf_train_clustering["l이용자수"]
bf_train_c_X = bf_train_clustering.loc[:, bf_train_clustering.columns.difference(["l이용자수"])]

In [None]:
# Cluster-level hold-out target and features. The feature columns are taken
# from the TRAINING frame's column list, so the hold-out matrix has the same
# columns as the one the models were fitted on.
bf_test_c_y = bf_test_clustering["l이용자수"]
bf_test_c_X = bf_test_clustering.loc[:, bf_train_clustering.columns.difference(["l이용자수"])]

## Ridge

In [None]:

# Tune the Ridge penalty on the cluster-level training rows (2-fold CV).
ridge = Ridge()
# Candidate penalties are the reciprocals of this grid (100 down to 1e-4).
alpha  = 1/np.array([0.01,0.1, 1, 2, 3, 4, 10, 30,100,200,300,400,800,900,1000,10000])
ridge_params = {'alpha':alpha}
# rmsle is an error, so the scorer is flagged as lower-is-better.
rmsle_scorer = metrics.make_scorer(rmsle,greater_is_better=False)
grid_ridge = GridSearchCV(ridge,ridge_params,scoring=rmsle_scorer,cv=2)
grid_ridge.fit(bf_train_c_X, bf_train_c_y)

pred = grid_ridge.predict(bf_train_c_X)  # predictions on the log1p scale
print("<Ridge hyperparameter>")
print(grid_ridge.best_params_)

# The target holds per-cluster means of log1p(ridership). Back-transform the
# predictions and the target alike so the metric compares like with like; the
# original compared back-transformed predictions with the log-scale target.
pred = np.expm1(pred)
print('RMSLE Value For Ridge Regression : %.4f' % (rmsle(pred, np.expm1(bf_train_c_y), False)))
print("\n")

In [None]:
# Hold-out score for the cluster-level Ridge model. Both the predictions and
# the stored log-scale target are back-transformed so the metric compares like
# with like (the original compared back-transformed predictions with the
# log-scale target).
# NOTE(review): expm1 inverts log1p, the transform used for the training
# target; confirm the test target is built with the same transform.
pred = np.expm1(grid_ridge.predict(bf_test_c_X))
print('RMSLE Value For Ridge Regression : %.4f' % (rmsle(pred, np.expm1(bf_test_c_y), False)))
print("\n")

## Lasso

In [None]:
from sklearn.linear_model import Lasso

# Tune the Lasso penalty on the cluster-level training rows (2-fold CV). The
# scorer is the lower-is-better rmsle_scorer built in the Ridge cell.
lasso = Lasso()
# Candidate penalties are the reciprocals of this grid (100 down to 0.0025).
alpha  = 1/np.array([0.01,0.1, 1, 2, 3, 4, 10, 30,100,200,300,400])
lasso_params_ = {'alpha':alpha, 'tol': [0.001]}

grid_lasso = GridSearchCV( lasso,lasso_params_,scoring = rmsle_scorer,cv=2)
grid_lasso.fit(bf_train_c_X, bf_train_c_y)
pred1 = grid_lasso.predict(bf_train_c_X)  # predictions on the log1p scale
print("<Lasso hyperparameter>")
print (grid_lasso.best_params_)

# The target holds per-cluster means of log1p(ridership). Back-transform the
# predictions and the target alike so the metric compares like with like; the
# original compared back-transformed predictions with the log-scale target.
pred = np.expm1(pred1)

print('RMSLE Value For Lasso Regression: %.4f ' % rmsle(pred, np.expm1(bf_train_c_y), False))

In [None]:
# Hold-out score for the cluster-level Lasso model (the original label said
# "Ridge"). Both the predictions and the stored log-scale target are
# back-transformed so the metric compares like with like.
# NOTE(review): expm1 inverts log1p, the transform used for the training
# target; confirm the test target is built with the same transform.
pred = np.expm1(grid_lasso.predict(bf_test_c_X))
print('RMSLE Value For Lasso Regression : %.4f' % (rmsle(pred, np.expm1(bf_test_c_y), False)))
print("\n")