#Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv("./dataset/train_data.csv",encoding = 'CP949')
test_data = pd.read_csv("./dataset/test_data.csv",encoding = 'CP949')

In [3]:
# testdata에 있는 columns 만 쓰기 위함
columns = test_data.columns

In [4]:
train_data = train_data[columns]

In [5]:
train_data.to_csv("./dataset/raw_train.csv", encoding="CP949", index = False)

In [6]:
# labelencoder를 위해 합침
merge = pd.concat([train_data, test_data])

merge.to_csv("./dataset/raw_merge.csv", encoding="CP949", index=False)

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# 카테고리컬데이터은 레이블인코더로 바뀌고 테스트에 없는건 0으로 바뀜,
# 이를 위해 float데이터도 모두 1을 더해주고 없는 값만 0으로 해줌
# merge후 저장

objcol = [] # 레이블 갯수 확인용

for col in columns:
    if merge[col][0].dtype == object:
        le=LabelEncoder()
        y=merge[col].tolist()
        le.fit(y)
        merge[col]=le.transform(y)
        objcol.append(col)
    else:
        # float 데이터 처리
        y = merge[col].tolist()
        for i in range(len(y)):
            
            # nan이면 0으로
            if np.isnan(y[i]):
                y[i] = 0
            # 데이터 있으면 +1
            else:
                y[i] += 1
        merge[col] = y
        
merge.to_csv("./dataset/labeled_merge.csv", encoding="CP949", index = False)

In [9]:
## 오브젝트 레이블 갯수확인 (null값 빼고)- 모델 설계할 때 필요.
for i, obj in enumerate(objcol) :
    print(obj, ":", max(merge[obj].tolist()))

주야 : 2
요일 : 7
발생지시도 : 17
발생지시군구 : 208
사고유형_대분류 : 4
사고유형_중분류 : 19
법규위반 : 20
도로형태_대분류 : 9
도로형태 : 16
당사자종별_1당_대분류 : 12
당사자종별_2당_대분류 : 14


In [10]:
# 다시 분리 후 저장

train_data = merge[:train_data.shape[0]]
test_data = merge[train_data.shape[0]:]

train_data.to_csv("./dataset/labeled_train.csv", encoding="CP949", index = False)
test_data.to_csv("./dataset/labeled_test.csv", encoding="CP949", index = False)

#Build Model

In [11]:
from keras.layers import Input, Dense, concatenate
from keras.models import Model
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [12]:
## feature 갯수 지정
inputDim = test_data.shape[1]

## 학습을 위해 test셋의 빈칸 종류들을 넣기위함
## 전부 다 차있을 때 를 학습 시키기 위해 모두 1인 값을 넣어줌
valid_list = [[1]*inputDim]

## predict를 위해 만들어줌
test_masks = []

## 비어있는 곳은 0 차있는 곳은 1로 valid mask를 만들어서 넣어줌.

for i in range(test_data.shape[0]) :
    valid_mask = test_data.ix[i].tolist()
    
    for i, mask in enumerate(valid_mask):
        if not mask == 0 :
            valid_mask[i] = 1
    
    test_masks.append(valid_mask)
    
    if not valid_mask in valid_list :
        valid_list.append(valid_mask)
        
print("모두 1인 valid mask를 포함한 test data의 빈칸 뚤린 종류의 갯수:", len(valid_list))

모두 1인 valid mask를 포함한 test data의 빈칸 뚤린 종류의 갯수: 16


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [13]:
data_input = Input(shape=(inputDim,), name="data_input")
valid_input = Input(shape=(inputDim,), name="valid_input")

## data_input이랑 valid_input 합치기
x = concatenate([data_input,valid_input])

## AutoEncoder 랑 비슷하게 만듬
x = Dense(64,activation='tanh')(x)
x = Dense(128,activation='tanh')(x)
x = Dense(64,activation='tanh')(x)
x = Dense(32,activation='tanh')(x)

## 카테고리컬 아웃풋은 레이블 갯수만큼 맞춰줌 활성화함수도 맞춰서 계산
output_1 = Dense(2, activation = 'softmax' , name = 'output_1')(x)
output_2 = Dense(7, activation = 'softmax', name = 'output_2')(x)
output_3 = Dense(1, activation = 'linear' , name = 'output_3')(x)
output_4 = Dense(1, activation = 'linear' , name = 'output_4')(x)
output_5 = Dense(1, activation = 'linear' , name = 'output_5')(x)
output_6 = Dense(1, activation = 'linear' , name = 'output_6')(x)
output_7 = Dense(1, activation = 'linear' , name = 'output_7')(x)
output_8 = Dense(17, activation = 'softmax', name = 'output_8')(x)
output_9 = Dense(208, activation = 'softmax', name = 'output_9')(x)
output_10 = Dense(4, activation = 'softmax', name = 'output_10')(x)
output_11 = Dense(19, activation = 'softmax', name = 'output_11')(x)
output_12 = Dense(20, activation = 'softmax', name = 'output_12')(x)
output_13 = Dense(9, activation = 'softmax', name = 'output_13')(x)
output_14 = Dense(16, activation = 'softmax', name = 'output_14')(x)
output_15 = Dense(12, activation = 'softmax', name = 'output_15')(x)
output_16 = Dense(14, activation = 'softmax', name = 'output_16')(x)

In [14]:
## model 설정

model = Model(inputs=[data_input, valid_input],
              outputs=[output_1, output_2, output_3, output_4, output_5, output_6, output_7, output_8, 
                       output_9, output_10, output_11, output_12, output_13, output_14, output_15, output_16])

model.compile(optimizer='Adam',
              loss = {'output_1': 'binary_crossentropy',
                     'output_2' : 'categorical_crossentropy',
                     'output_3' : 'mean_squared_error', 
                     'output_4' : 'mean_squared_error', 
                     'output_5' : 'mean_squared_error', 
                     'output_6' : 'mean_squared_error', 
                     'output_7' : 'mean_squared_error', 
                     'output_8' : 'categorical_crossentropy', 
                     'output_9' : 'categorical_crossentropy', 
                     'output_10' : 'categorical_crossentropy', 
                     'output_11' : 'categorical_crossentropy', 
                     'output_12' : 'categorical_crossentropy', 
                     'output_13' : 'categorical_crossentropy', 
                     'output_14' : 'categorical_crossentropy', 
                     'output_15' : 'categorical_crossentropy',
                     'output_16' : 'categorical_crossentropy'})

#Make Validation Set

In [15]:
def make_val(train_data, val_size):

    val_idx = np.random.choice(range(len(train_data)), val_size, replace=False)
    val_data = train_data[val_idx]
    sub_train_data = np.delete(train_data, val_idx, 0)

    return sub_train_data, val_data

#Train

In [16]:
# training 위해서 array로 바꿔줌

train_data = train_data[train_data.columns[:]]
train_data = np.array(train_data)
test_data = test_data[test_data.columns[:]]
test_data = np.array(test_data)

valid_list = np.array(valid_list)

In [17]:
## train_data 다시 보정, 0값은 null값이므로 기존의 0 레이블을 없애줌

train_data -= [1]

In [None]:
## 원하는 epochs 입력
epochs = 10
sub_epochs = 3

for i in range(epochs) :
    print("epochs {}/{}  \n".format(i+1,epochs))
        
    for j, valid in enumerate(valid_list) :
        print("{}/{}. valid_mask:".format(j+1, len(valid_list)),valid,"Start!\n")

        for k in range(sub_epochs) :

            ## validation set 만듦

            sub_train_data, val_data = make_val(train_data, (len(train_data)*3)//10)

            ##mask 씌우는 작업
            masked_sub_train_data = sub_train_data * valid
            masked_val_data = val_data * valid


            ## 카테고리컬 함수만 one-hot encoding으로 변환

            train_cate_1 = to_categorical(sub_train_data[:,0], num_classes=max(train_data[:,0]).astype(int)+1)
            train_cate_2 = to_categorical(sub_train_data[:,1], num_classes=max(train_data[:,1]).astype(int)+1)
            train_cate_3 = to_categorical(sub_train_data[:,7], num_classes=max(train_data[:,7]).astype(int)+1)
            train_cate_4 = to_categorical(sub_train_data[:,8], num_classes=max(train_data[:,8]).astype(int)+1)
            train_cate_5 = to_categorical(sub_train_data[:,9], num_classes=max(train_data[:,9]).astype(int)+1)
            train_cate_6 = to_categorical(sub_train_data[:,10], num_classes=max(train_data[:,10]).astype(int)+1)
            train_cate_7 = to_categorical(sub_train_data[:,11], num_classes=max(train_data[:,11]).astype(int)+1)
            train_cate_8 = to_categorical(sub_train_data[:,12], num_classes=max(train_data[:,12]).astype(int)+1)
            train_cate_9 = to_categorical(sub_train_data[:,13], num_classes=max(train_data[:,13]).astype(int)+1)
            train_cate_10 = to_categorical(sub_train_data[:,14], num_classes=max(train_data[:,14]).astype(int)+1)
            train_cate_11 = to_categorical(sub_train_data[:,15], num_classes=max(train_data[:,15]).astype(int)+1)

            val_cate_1 = to_categorical(val_data[:,0], num_classes=max(train_data[:,0]).astype(int)+1)
            val_cate_2 = to_categorical(val_data[:,1], num_classes=max(train_data[:,1]).astype(int)+1)
            val_cate_3 = to_categorical(val_data[:,7], num_classes=max(train_data[:,7]).astype(int)+1)
            val_cate_4 = to_categorical(val_data[:,8], num_classes=max(train_data[:,8]).astype(int)+1)
            val_cate_5 = to_categorical(val_data[:,9], num_classes=max(train_data[:,9]).astype(int)+1)
            val_cate_6 = to_categorical(val_data[:,10], num_classes=max(train_data[:,10]).astype(int)+1)
            val_cate_7 = to_categorical(val_data[:,11], num_classes=max(train_data[:,11]).astype(int)+1)
            val_cate_8 = to_categorical(val_data[:,12], num_classes=max(train_data[:,12]).astype(int)+1)
            val_cate_9 = to_categorical(val_data[:,13], num_classes=max(train_data[:,13]).astype(int)+1)
            val_cate_10 = to_categorical(val_data[:,14], num_classes=max(train_data[:,14]).astype(int)+1)
            val_cate_11 = to_categorical(val_data[:,15], num_classes=max(train_data[:,15]).astype(int)+1)


            print("sub_epochs {}/{} Start!\n".format(k+1,sub_epochs))

            model.fit({'data_input' : masked_sub_train_data, 'valid_input' : np.array([valid]*len(masked_sub_train_data))},
                     {'output_1': train_cate_1, 
                     'output_2': train_cate_2, 
                     'output_3': sub_train_data[:,2], 
                     'output_4': sub_train_data[:,3], 
                     'output_5': sub_train_data[:,4], 
                     'output_6': sub_train_data[:,5], 
                     'output_7': sub_train_data[:,6], 
                     'output_8': train_cate_3, 
                     'output_9': train_cate_4, 
                     'output_10': train_cate_5, 
                     'output_11': train_cate_6, 
                     'output_12': train_cate_7, 
                     'output_13': train_cate_8, 
                     'output_14': train_cate_9, 
                     'output_15': train_cate_10,
                     'output_16': train_cate_11 },
                      batch_size=32)


            ## validation 평가

            val_result = model.predict({'data_input' : masked_val_data, 'valid_input' : np.array([valid]*len(masked_val_data))})

            numerical_score = 0
            categorical_score = 0

            ## 평가하기위해 결과 쪼갬 및 one hot decoding

            result1 = np.array([[np.argmax(res)] for res in val_result[0]])
            result2 = np.array([[np.argmax(res)] for res in val_result[1]])
            result3 = val_result[2]
            result4 = val_result[3]
            result5 = val_result[4]
            result6 = val_result[5]
            result7 = val_result[6]
            result8 = np.array([[np.argmax(res)] for res in val_result[7]])
            result9 = np.array([[np.argmax(res)] for res in val_result[8]])
            result10 = np.array([[np.argmax(res)] for res in val_result[9]])
            result11 = np.array([[np.argmax(res)] for res in val_result[10]])
            result12 = np.array([[np.argmax(res)] for res in val_result[11]])
            result13 = np.array([[np.argmax(res)] for res in val_result[12]])
            result14 = np.array([[np.argmax(res)] for res in val_result[13]])
            result15 = np.array([[np.argmax(res)] for res in val_result[14]])
            result16 = np.array([[np.argmax(res)] for res in val_result[15]])

            ## numerical scoring

            numerical_score += np.exp(0-np.mean(((result3 - val_data[:,2])/ 1)**2))
            numerical_score += np.exp(0-np.mean(((result4 - val_data[:,3])/ 1)**2))
            numerical_score += np.exp(0-np.mean(((result5 - val_data[:,4])/ 1)**2))
            numerical_score += np.exp(0-np.mean(((result6 - val_data[:,5])/ 1)**2))
            numerical_score += np.exp(0-np.mean(((result7 - val_data[:,6])/ 1)**2))


            ## categorical scoring

            categorical_score += np.mean((result1 == val_cate_1).astype(float))
            categorical_score += np.mean((result2 == val_cate_2).astype(float))
            categorical_score += np.mean((result8 == val_cate_3).astype(float))
            categorical_score += np.mean((result9 == val_cate_4).astype(float))
            categorical_score += np.mean((result10 == val_cate_5).astype(float))
            categorical_score += np.mean((result11 == val_cate_6).astype(float))
            categorical_score += np.mean((result12 == val_cate_7).astype(float))
            categorical_score += np.mean((result13 == val_cate_8).astype(float))
            categorical_score += np.mean((result14 == val_cate_9).astype(float))
            categorical_score += np.mean((result15 == val_cate_10).astype(float))
            categorical_score += np.mean((result16 == val_cate_11).astype(float))


            print('\n Validation Accuracy - Numerical: {} Categorical: {}, Total: {}\n'
                  .format(numerical_score/5, categorical_score/11, (numerical_score + categorical_score)/16))
        
    
print('Finished!')

epochs 1/10  

1/16. valid_mask: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] Start!

sub_epochs 1/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.4256414978951398 Categorical: 0.0740666172425083, Total: 0.18393376744645562
sub_epochs 2/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.3700583137151255 Categorical: 0.0755871550027325, Total: 0.1676093921003553
sub_epochs 3/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.42220630255014313 Categorical: 0.0702823413441236, Total: 0.18025857922100472
2/16. valid_mask: [1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] Start!

sub_epochs 1/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.3193303480532103 Categorical: 0.0728565195904945, Total: 0.14987959098509318
sub_epochs 2/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.4478552498418966 Categorical: 0.0768209052573615, Total: 0.19276913794002873
sub_epochs 3/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.42776283429633716 Categorica


 Validation Accuracy - Numerical: 0.4517911281845056 Categorical: 0.07198659886553556, Total: 0.1906755142777137
sub_epochs 2/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.47086422529581534 Categorical: 0.07161687892406676, Total: 0.1963816746652382
sub_epochs 3/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.43873054466284317 Categorical: 0.07254270375657769, Total: 0.18697640403978566
6/16. valid_mask: [1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1.] Start!

sub_epochs 1/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.4278550017234835 Categorical: 0.07241884361712331, Total: 0.18349264302536086
sub_epochs 2/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.34492131084678324 Categorical: 0.07291411283999245, Total: 0.15791636221711458
sub_epochs 3/3 Start!

Epoch 1/1

 Validation Accuracy - Numerical: 0.35088444566077726 Categorical: 0.0739628597025004, Total: 0.1605008553144619
7/16. valid_mask: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1.] S

#Predict

In [None]:
result = model.predict({'data_input' : test_data, 'valid_input' : np.array(test_masks)})

#Make result file

In [None]:
## +[1] 은 label decoder를 위해 해줌

result1 = np.array([[np.argmax(res)] for res in result[0]]) + [1]
result2 = np.array([[np.argmax(res)] for res in result[1]]) + [1]
result3 = result[2]
result4 = result[3]
result5 = result[4]
result6 = result[5]
result7 = result[6]
result8 = np.array([[np.argmax(res)] for res in result[7]]) + [1]
result9 = np.array([[np.argmax(res)] for res in result[8]]) + [1]
result10 = np.array([[np.argmax(res)] for res in result[9]]) + [1]
result11 = np.array([[np.argmax(res)] for res in result[10]]) + [1]
result12 = np.array([[np.argmax(res)] for res in result[11]]) + [1]
result13 = np.array([[np.argmax(res)] for res in result[12]]) + [1]
result14 = np.array([[np.argmax(res)] for res in result[13]]) + [1]
result15 = np.array([[np.argmax(res)] for res in result[14]]) + [1]
result16 = np.array([[np.argmax(res)] for res in result[15]]) + [1]

In [21]:
## column들 다시 합쳐주기

result = np.hstack((result1, result2, result3, result4, result5, result6, result7, result8,
         result9, result10, result11, result12, result13, result14, result15, result16))

In [22]:
## 역으로 되돌려 주기 위함
merge = pd.read_csv("./dataset/raw_merge.csv", encoding='CP949')

decoded_result = []
for i, col in enumerate(columns):
    if type(merge[col][0]) == str:
        le=LabelEncoder()
        y=merge[col].tolist()
        le.fit(y)
        z = le.inverse_transform(result[:,i].astype(int))
        z = z.reshape(-1,1)
        ## decoded_result 에 아무것도 없으면 hstack 하는데 에러남
        if not len(decoded_result):
            decoded_result = z
        else:
            decoded_result = np.hstack((decoded_result,z))
        
    else:
        z = result[:,i]
        z = z.reshape(-1,1)
        decoded_result = np.hstack((decoded_result,z))

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [23]:
## decoded_result file을 data frame으로 바꿔서 csv파일로 저장

decoded_result_df = {columns[0] : decoded_result[:,0], 
                     columns[1] : decoded_result[:,1], 
                     columns[2] : decoded_result[:,2], 
                     columns[3] : decoded_result[:,3], 
                     columns[4] : decoded_result[:,4], 
                     columns[5] : decoded_result[:,5], 
                     columns[6] : decoded_result[:,6], 
                     columns[7] : decoded_result[:,7], 
                     columns[8] : decoded_result[:,8], 
                     columns[9] : decoded_result[:,9], 
                     columns[10] : decoded_result[:,10], 
                     columns[11] : decoded_result[:,11], 
                     columns[12] : decoded_result[:,12], 
                     columns[13] : decoded_result[:,13], 
                     columns[14] : decoded_result[:,14], 
                     columns[15] : decoded_result[:,15], }

decoded_result_df = pd.DataFrame(decoded_result_df, columns = columns)

decoded_result_df.to_csv("./dataset/decoded_result.csv", encoding="CP949", index = False)

In [24]:
result_kor = pd.read_csv("./result_kor.csv",encoding = 'CP949')
result_kor_HH = pd.read_csv("./result_kor.csv",encoding = 'CP949')

In [25]:
for col in ["행","열"]:
    if type(result_kor[col][0]) == str:
        # 열 데이터 처리
        le=LabelEncoder()
        y=result_kor[col].tolist()
        le.fit(y)
        result_kor[col]=le.transform(y)
    else:
        # 행 데이터 처리
        y = result_kor[col].tolist()
        for i in range(len(y)):
            #list 위치에 대한 보정값
            y[i] -= 2
            
        result_kor[col] = y

In [26]:
for i in range(result_kor.shape[0]):
    result_kor_HH["값"][i] = decoded_result[result_kor["행"][i],result_kor["열"][i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [27]:
result_kor_HH.to_csv("./result_kor_HH.csv", encoding="CP949", index = False)