In [187]:
import pandas as pd
import os
from functools import reduce
from sklearn.model_selection import train_test_split

## 데이터 합치기

In [188]:
## 온도 데이터 위치
root_path = '../dataset'
save_path = 'rain_total.csv'
assert os.path.isdir(root_path), "온도 데이터가 있는 폴더 설정 하세요."

In [189]:
## 데이터 확인
filelist = [item for item in os.listdir(root_path) if '.csv' in item or '.xlsx' in item]
print(filelist)

['in_total.csv', 'dust_total.csv', 'temp_total.csv', 'rain_total.csv', 'holiday_total.csv', 'user_total.csv']


In [190]:
demand_df = pd.read_csv(os.path.join(root_path,'in_total.csv'), thousands=',')
rain_df = pd.read_csv(os.path.join(root_path,'rain_total.csv'))
dust_df = pd.read_csv(os.path.join(root_path,'dust_total.csv'))
temp_df = pd.read_csv(os.path.join(root_path,'temp_total.csv'))
holiday_df = pd.read_csv(os.path.join(root_path,'holiday_total.csv'))
user_df = pd.read_csv(os.path.join(root_path,'user_total.csv'))

In [192]:
## 전처리
print(demand_df.isna().sum())
print(rain_df.isna().sum())
print(dust_df.isna().sum())
print(temp_df.isna().sum())
print(holiday_df.isna().sum())
print(user_df.isna().sum())

date     0
count    0
dtype: int64
date       0
rain    1370
dtype: int64
date    0
dust    0
dtype: int64
date     0
temp    34
dtype: int64
date       0
holiday    0
dtype: int64
date    0
user    0
dtype: int64


In [193]:
## rain 전처리 모두 0으로 처리
rain_df = rain_df.fillna(0)

In [194]:
## 온도는 없는것 없애기
temp_df=temp_df.dropna()

In [195]:
## 데이터 합치기
data_frames = [demand_df, rain_df, dust_df, temp_df, holiday_df, user_df]
#data_frames = [demand_df, rain_df, dust_df, temp_df, holiday_df]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['date']), data_frames)

In [196]:
##중복 제거
df_merged = df_merged.drop_duplicates()
## 정렬하기
df_merged = df_merged.sort_values('date')

In [197]:
## 더미변수 생성하기
df_merged['date'] = pd.to_datetime(df_merged['date'], format='%Y-%m-%d')
df_merged['week']=df_merged.date.dt.weekday

In [198]:
df_merged['shift1'] = df_merged['count'].shift(1)
df_merged = df_merged.dropna()

In [199]:
## 데이터 저장하기
df_merged.to_csv('svr_dataset.csv', index=None)

In [200]:
## 전체 데이터
print(len(df_merged))

1207


In [201]:
df_merged

Unnamed: 0,date,count,rain,dust,temp,holiday,user,week,shift1
1,2017-01-02,3796,0.3,113.974359,5.0,0.0,182,0,2307.0
2,2017-01-03,4049,0.0,80.948718,2.0,0.0,198,1,3796.0
3,2017-01-04,4646,0.0,58.358974,3.9,0.0,189,2,4049.0
4,2017-01-05,4500,0.0,38.769231,3.8,0.0,190,3,4646.0
5,2017-01-06,4944,0.0,23.743590,5.4,0.0,241,4,4500.0
...,...,...,...,...,...,...,...,...,...
1350,2020-06-26,96150,2.1,22.653061,21.8,0.0,3602,4,46415.0
1351,2020-06-27,107001,0.0,26.489796,24.1,0.0,7049,5,96150.0
1352,2020-06-28,98568,0.0,29.653061,25.2,0.0,6982,6,107001.0
1353,2020-06-29,70053,11.9,36.469388,23.8,0.0,2609,0,98568.0


## noramlize

In [202]:
df_norm = pd.DataFrame()

In [203]:
df_norm['mean'] = df_merged.mean()
df_norm['std'] = df_merged.std()

  df_norm['mean'] = df_merged.mean()


In [204]:
df_norm = df_norm.reset_index().rename(columns={'index':'var'})

In [205]:
df_norm.to_csv('norm.csv', index=None)

In [206]:
df_norm

Unnamed: 0,var,mean,std
0,count,35299.103563,28021.377837
1,rain,3.023695,11.828668
2,dust,43.122649,23.050127
3,temp,12.695443,10.737454
4,holiday,0.04971,0.217435
5,user,1801.076222,1698.879689
6,week,2.97763,2.004637
7,shift1,35269.460646,28037.347402


In [207]:
df_norm[df_norm['var']=='week']['mean']

6    2.97763
Name: mean, dtype: float64

## 데이터 나누기

In [208]:
## 저장폴더
data_path = 'resource'
test_portion = 0.1

In [209]:
## 저장 폴더 만들기
os.makedirs(data_path, exist_ok = True)
df_train, df_test = train_test_split(df_merged, test_size=test_portion, shuffle=False)

In [210]:
len(df_train), len(df_test)

(1086, 121)

In [211]:
df_train.to_csv(os.path.join(data_path, 'train.csv'), index=None)
df_test.to_csv(os.path.join(data_path, 'test.csv'), index=None)

In [212]:
selected_column=[]
for var_name in df_train.columns.tolist():
    if 'date' not in var_name and 'count' not in var_name:
        df_train[var_name] = pd.to_numeric(df_train[var_name], errors='coerce')
        selected_column.append(var_name)

        if df_norm is not None:
            df_train[var_name] = (df_train[var_name]-df_norm[df_norm['var']==var_name]['mean'])/df_norm[df_norm['var']==var_name]['std']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[var_name] = pd.to_numeric(df_train[var_name], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[var_name] = (df_train[var_name]-df_norm[df_norm['var']==var_name]['mean'])/df_norm[df_norm['var']==var_name]['std']


In [213]:
df_norm[df_norm['var']=='rain']['mean'].values

array([3.02369511])

## Scaler로 만들기

In [214]:
from sklearn.preprocessing import MinMaxScaler

In [215]:
df_merged

Unnamed: 0,date,count,rain,dust,temp,holiday,user,week,shift1
1,2017-01-02,3796,0.3,113.974359,5.0,0.0,182,0,2307.0
2,2017-01-03,4049,0.0,80.948718,2.0,0.0,198,1,3796.0
3,2017-01-04,4646,0.0,58.358974,3.9,0.0,189,2,4049.0
4,2017-01-05,4500,0.0,38.769231,3.8,0.0,190,3,4646.0
5,2017-01-06,4944,0.0,23.743590,5.4,0.0,241,4,4500.0
...,...,...,...,...,...,...,...,...,...
1350,2020-06-26,96150,2.1,22.653061,21.8,0.0,3602,4,46415.0
1351,2020-06-27,107001,0.0,26.489796,24.1,0.0,7049,5,96150.0
1352,2020-06-28,98568,0.0,29.653061,25.2,0.0,6982,6,107001.0
1353,2020-06-29,70053,11.9,36.469388,23.8,0.0,2609,0,98568.0
