In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder, RobustScaler
import random

In [25]:
# Directory the monthly input files are read from, and the directory the
# encoded train/test files are written to.
src_path = './Kyoto/kyoto_processed/monthly/subset'
target_path = './Kyoto/kyoto_processed/monthly/onehot'

# Calendar keys as strings: "2006".."2015" and zero-padded "01".."12".
years = [str(year) for year in range(2006, 2016)]
months = ['{:02d}'.format(month) for month in range(1, 13)]

# Raw column names (as strings) grouped by the role they play downstream.
categorical_cols = "0 1 2 3 13 19".split()
numerical_cols = [str(col) for col in range(4, 13)]
additional_cols = "14 15 16 18".split()
label_col = ["17"]

In [26]:
# Every entry of the source directory, in lexicographic order.
all_files = sorted(os.listdir(src_path))
# The first seven characters of each file name serve as the month key.
year_months = [file_name[:7] for file_name in all_files]
year_months

['2006_11',
 '2006_12',
 '2007_01',
 '2007_02',
 '2007_03',
 '2007_04',
 '2007_05',
 '2007_06',
 '2007_07',
 '2007_08',
 '2007_09',
 '2007_10',
 '2007_11',
 '2007_12',
 '2008_01',
 '2008_02',
 '2008_03',
 '2008_04',
 '2008_05',
 '2008_06',
 '2008_07',
 '2008_08',
 '2008_09',
 '2008_10',
 '2008_11',
 '2008_12',
 '2009_01',
 '2009_02',
 '2009_03',
 '2009_04',
 '2009_05',
 '2009_06',
 '2009_07',
 '2009_08',
 '2009_09',
 '2009_10',
 '2009_11',
 '2009_12',
 '2010_01',
 '2010_02',
 '2010_03',
 '2010_04',
 '2010_05',
 '2010_06',
 '2010_07',
 '2010_08',
 '2010_09',
 '2010_10',
 '2010_11',
 '2010_12',
 '2011_01',
 '2011_02',
 '2011_03',
 '2011_04',
 '2011_05',
 '2011_06',
 '2011_07',
 '2011_08',
 '2011_09',
 '2011_10',
 '2011_11',
 '2011_12',
 '2012_01',
 '2012_02',
 '2012_03',
 '2012_04',
 '2012_05',
 '2012_06',
 '2012_07',
 '2012_08',
 '2012_09',
 '2012_10',
 '2012_11',
 '2012_12',
 '2013_01',
 '2013_02',
 '2013_03',
 '2013_04',
 '2013_05',
 '2013_06',
 '2013_07',
 '2013_08',
 '2013_09',
 '20

In [27]:
def rename_columns(df):
    """Tag each column name with its role and coerce numeric columns.

    Column names are compared (as strings) against the module-level lists
    ``numerical_cols``, ``categorical_cols``, ``additional_cols`` and
    ``label_col``:

    * numerical   -> ``"num_<name>"``, values passed through ``pd.to_numeric``
    * categorical -> ``"cat_<name>"``
    * additional  -> ``"bonus_<name>"``
    * label       -> ``"label"``, values passed through ``pd.to_numeric``
    * anything else keeps its name

    The frame is modified in place and the same object is returned.
    """
    mapping = {}
    for raw_name in df.columns.astype(str).values:
        if raw_name in numerical_cols:
            df[raw_name] = pd.to_numeric(df[raw_name])
            mapping[raw_name] = "num_" + raw_name
        elif raw_name in categorical_cols:
            mapping[raw_name] = "cat_" + raw_name
        elif raw_name in additional_cols:
            mapping[raw_name] = "bonus_" + raw_name
        elif raw_name in label_col:
            df[raw_name] = pd.to_numeric(df[raw_name])
            mapping[raw_name] = "label"
        else:
            # Unlisted columns are mapped onto themselves (i.e. left alone).
            mapping[raw_name] = raw_name
    df.rename(columns=mapping, inplace=True)
    return df

def sample_abnormal_concate(df, anomaly_ratio):
    """Keep every normal row plus a random subset of the abnormal rows.

    Rows with ``label == 1`` are treated as normal; all other rows are
    treated as abnormal.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column.
    anomaly_ratio : float
        The number of abnormal rows requested is
        ``int(n_normal * anomaly_ratio)``.

    Returns
    -------
    pandas.DataFrame
        All normal rows followed by the sampled abnormal rows (in their
        original relative order), with a fresh ``0..n-1`` index.

    Notes
    -----
    The global ``random`` generator is re-seeded with ``df.shape[0]`` on every
    call, so the selection is deterministic for a given input size.
    """
    normal_rows = df[df['label'] == 1].reset_index(drop=True)
    abnormal_rows = df[df['label'] != 1].reset_index(drop=True)

    n_available = abnormal_rows.shape[0]
    n_requested = int(normal_rows.shape[0] * anomaly_ratio)
    # random.sample raises ValueError when asked for more items than exist;
    # when the frame holds fewer abnormal rows than requested, keep them all.
    n_keep = min(n_requested, n_available)

    random.seed(df.shape[0])
    chosen = random.sample(list(range(n_available)), n_keep)
    chosen.sort()
    abnormal_rows = abnormal_rows.iloc[chosen]

    return pd.concat([normal_rows, abnormal_rows], axis=0).reset_index(drop=True)

def train_test_split(df, test_ratio=0.2, train_labeled_anomaly_ratio=0.01, assign_normal_num=None):
    """Randomly split a labelled frame into a train part and a test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column where ``1`` marks a normal row.
    test_ratio : float
        Fraction of rows (after optional down-sampling) placed in the test set.
    train_labeled_anomaly_ratio : float
        Forwarded to ``sample_abnormal_concate`` as ``anomaly_ratio`` for the
        training rows.
    assign_normal_num : int or None
        When truthy and the frame holds more normal rows than this value, the
        whole frame is first down-sampled by the factor
        ``assign_normal_num / normal_num``.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        Both with a fresh ``0..n-1`` index.

    Notes
    -----
    Row selection draws from the module-level ``random`` generator, which is
    not seeded inside this function.
    """
    if assign_normal_num:
        normal_num = (df["label"] == 1).sum()
        if normal_num > assign_normal_num:
            # Uniformly thin the whole frame; row order is kept.
            keep = sorted(random.sample(range(df.shape[0]),
                                        int(df.shape[0] * assign_normal_num / normal_num)))
            df = df.iloc[keep].reset_index(drop=True)

    n_rows = df.shape[0]
    test_indexes = sorted(random.sample(range(n_rows), int(n_rows * test_ratio)))
    held_out = set(test_indexes)
    # Everything not drawn for the test set, in ascending positional order.
    train_indexes = [pos for pos in range(n_rows) if pos not in held_out]

    train_data = sample_abnormal_concate(df.iloc[train_indexes],
                                         anomaly_ratio=train_labeled_anomaly_ratio)
    test_data = df.iloc[test_indexes]
    return train_data.reset_index(drop=True), test_data.reset_index(drop=True)

def preprocess(df, enc):
    """Append one-hot encoded categorical columns and binarise the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose categorical columns contain ``'cat_'`` in their name and
        which has a ``label`` column. The caller's frame is not modified.
    enc : fitted encoder
        Object exposing ``transform``; its result may be a sparse matrix
        (anything with ``toarray``) or an already dense array.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by ``catnum_<k>`` columns holding the
        encoded values. Labels equal to ``1`` become ``0`` and negative labels
        become ``1``; other label values are left as they are.
    """
    # reset_index returns a new frame. The original call discarded the result,
    # so a frame with a non-default index was misaligned by the column-wise
    # concat below (extra rows filled with NaN).
    df = df.reset_index(drop=True)

    cat_mask = ['cat_' in col for col in df.columns]
    encoded = enc.transform(df.loc[:, cat_mask])
    if hasattr(encoded, "toarray"):
        # Sparse result -> dense array.
        encoded = encoded.toarray()

    df_catnum = pd.DataFrame(encoded).add_prefix('catnum_')
    df_new = pd.concat([df, df_catnum], axis=1)

    # Both masks are computed before either assignment so the two rewrites
    # cannot interfere with each other.
    filter_clear = df_new["label"] == 1
    filter_infected = df_new["label"] < 0
    df_new.loc[filter_clear, "label"] = 0
    df_new.loc[filter_infected, "label"] = 1

    return df_new

class MyDataset():
    """Feature matrix / label vector pair built from one monthly frame.

    Attributes
    ----------
    columns : numpy.ndarray
        Names of the selected feature columns.
    x : numpy.ndarray
        Values of the selected feature columns.
    y : numpy.ndarray
        Values of the ``label`` column after ``preprocess``.
    """

    def __init__(self, df_ym, one_enc):
        encoded_frame = preprocess(df_ym, one_enc)
        # Substring match: this picks up every column whose name contains
        # 'num_', which includes the 'catnum_*' columns added by preprocess
        # as well as the 'num_*' columns.
        num_mask = ['num_' in name for name in encoded_frame.columns]
        feature_names = encoded_frame.columns.to_numpy()[num_mask]

        self.columns = feature_names
        self.x = encoded_frame[feature_names].values
        self.y = encoded_frame["label"].values

In [28]:
# Per-month train/test frames, plus the categorical columns of every training
# frame (pooled afterwards so one encoder can be fitted on all of them).
cats = []
df_train_yms = []
df_test_yms = []
for ym, file_name in zip(year_months, all_files):
    print(ym)
    df_ym = pd.read_parquet(os.path.join(src_path, file_name)).reset_index(drop=True)
    df_ym = rename_columns(df_ym)

    # Thin out the abnormal rows first, then split into train and test.
    df_ym = sample_abnormal_concate(df_ym, anomaly_ratio=0.1)
    df_train_ym, df_test_ym = train_test_split(
        df_ym,
        test_ratio=0.2,
        train_labeled_anomaly_ratio=0.01,
        assign_normal_num=10000,
    )

    df_train_yms.append(df_train_ym)
    df_test_yms.append(df_test_ym)
    # Only training rows contribute to the pooled categorical values.
    cats.append(df_train_ym.loc[:, ['cat_' in col for col in df_train_ym.columns]])
cats = pd.concat(cats, axis=0)

2006_11
2006_12
2007_01
2007_02
2007_03
2007_04
2007_05
2007_06
2007_07
2007_08
2007_09
2007_10
2007_11
2007_12
2008_01
2008_02
2008_03
2008_04
2008_05
2008_06
2008_07
2008_08
2008_09
2008_10
2008_11
2008_12
2009_01
2009_02
2009_03
2009_04
2009_05
2009_06
2009_07
2009_08
2009_09
2009_10
2009_11
2009_12
2010_01
2010_02
2010_03
2010_04
2010_05
2010_06
2010_07
2010_08
2010_09
2010_10
2010_11
2010_12
2011_01
2011_02
2011_03
2011_04
2011_05
2011_06
2011_07
2011_08
2011_09
2011_10
2011_11
2011_12
2012_01
2012_02
2012_03
2012_04
2012_05
2012_06
2012_07
2012_08
2012_09
2012_10
2012_11
2012_12
2013_01
2013_02
2013_03
2013_04
2013_05
2013_06
2013_07
2013_08
2013_09
2013_10
2013_11
2013_12
2014_01
2014_02
2014_03
2014_04
2014_05
2014_06
2014_07
2014_08
2014_09
2014_10
2014_11
2014_12
2015_01
2015_02
2015_03
2015_04
2015_05
2015_06
2015_07
2015_08
2015_09
2015_10
2015_11
2015_12


In [29]:
# Number of distinct values per pooled categorical column, and their total.
cat_values = cats.values
length = [len(set(cat_values[:, col])) for col in range(cat_values.shape[1])]
print(length)
print(sum(length))

# Fit a single encoder on the pooled training categories; with
# handle_unknown='ignore' unseen categories do not raise at transform time.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(cats.values)

[101, 14, 162, 145, 13, 3]
438


OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [30]:
from copy import deepcopy
scaler = RobustScaler()
train_datasets = []
test_datasets = []
for_scaler = []

# Build the per-month datasets; only training rows feed the scaler.
for ym, df_train_ym, df_test_ym in zip(year_months, df_train_yms, df_test_yms):
    print(ym)
    ds_train_ym = MyDataset(df_train_ym, enc)
    train_datasets.append(ds_train_ym)
    # Independent copy of the first 9 feature columns, so fitting never
    # aliases the dataset's own array.
    # NOTE(review): presumably these 9 are the 'num_*' columns (numerical_cols
    # has 9 entries); this relies on their position in the frame, so confirm.
    for_scaler.append(deepcopy(ds_train_ym.x[:, 0:9]))

    ds_test_ym = MyDataset(df_test_ym, enc)
    test_datasets.append(ds_test_ym)


scaler.fit(np.concatenate(for_scaler, axis=0))
print('Fit a Robust Scaler')

# Scale the same leading columns of every train and test matrix in place.
for ds_train_ym, ds_test_ym in zip(train_datasets, test_datasets):
    ds_train_ym.x[:, 0:9] = scaler.transform(ds_train_ym.x[:, 0:9])
    ds_test_ym.x[:, 0:9] = scaler.transform(ds_test_ym.x[:, 0:9])

2006_11
2006_12
2007_01
2007_02
2007_03
2007_04
2007_05
2007_06
2007_07
2007_08
2007_09
2007_10
2007_11
2007_12
2008_01
2008_02
2008_03
2008_04
2008_05
2008_06
2008_07
2008_08
2008_09
2008_10
2008_11
2008_12
2009_01
2009_02
2009_03
2009_04
2009_05
2009_06
2009_07
2009_08
2009_09
2009_10
2009_11
2009_12
2010_01
2010_02
2010_03
2010_04
2010_05
2010_06
2010_07
2010_08
2010_09
2010_10
2010_11
2010_12
2011_01
2011_02
2011_03
2011_04
2011_05
2011_06
2011_07
2011_08
2011_09
2011_10
2011_11
2011_12
2012_01
2012_02
2012_03
2012_04
2012_05
2012_06
2012_07
2012_08
2012_09
2012_10
2012_11
2012_12
2013_01
2013_02
2013_03
2013_04
2013_05
2013_06
2013_07
2013_08
2013_09
2013_10
2013_11
2013_12
2014_01
2014_02
2014_03
2014_04
2014_05
2014_06
2014_07
2014_08
2014_09
2014_10
2014_11
2014_12
2015_01
2015_02
2015_03
2015_04
2015_05
2015_06
2015_07
2015_08
2015_09
2015_10
2015_11
2015_12
Fit a Robust Scaler


In [31]:
# Column layout shared by every exported frame: the feature names of the
# first training dataset followed by the label.
columns = list(train_datasets[0].columns)
columns.append('label')

# to_parquet does not create missing directories; make sure the output
# location exists so the export also works on a fresh checkout.
os.makedirs(target_path, exist_ok=True)

for i, ym in enumerate(year_months):
    # Features and label side by side, one row per sample.
    train_ym = np.concatenate([train_datasets[i].x, train_datasets[i].y.reshape(-1, 1)], axis=1)
    train_ym = pd.DataFrame(train_ym, columns=columns)
    test_ym = np.concatenate([test_datasets[i].x, test_datasets[i].y.reshape(-1, 1)], axis=1)
    test_ym = pd.DataFrame(test_ym, columns=columns)

    train_ym.to_parquet(os.path.join(target_path, '{}_train_subset_onehot.parquet'.format(ym)))
    test_ym.to_parquet(os.path.join(target_path, '{}_test_subset_onehot.parquet'.format(ym)))

In [32]:
# Shapes of the train/test frames left over from the final iteration of the
# export loop (i.e. the last entry of year_months).
train_ym.shape, test_ym.shape

((8074, 448), (2199, 448))