# 필요 라이브러리 import

In [17]:
# Environment: imports plus process-wide logging / printing configuration.
import os
# Set ahead of the TensorFlow imports below so the variable is already in the
# environment when TensorFlow loads (presumably to quiet TensorFlow's native
# log output -- confirm the meaning of level '2' against the TF docs).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import tensorflow_addons as tfa
import tensorflow as tf
import pandas as pd
from tensorflow import feature_column
from sklearn.model_selection import train_test_split
#from keras import backend as K
#from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import recall_score, precision_score, f1_score
import logging
# Restrict TensorFlow's Python logger to ERROR-level messages and above.
tf.get_logger().setLevel(logging.ERROR)
import sys
# Raise NumPy's print threshold to sys.maxsize so printed arrays are not summarised.
np.set_printoptions(threshold=sys.maxsize)
from tensorflow import keras

In [18]:
# Metric reported during training/validation: micro-averaged F1 over 6 classes
# (one per entry of LABEL_COLUMN), configured with a 0.5 threshold.
METRICS = tfa.metrics.F1Score(num_classes=6, threshold=0.5, average='micro', name='f1_score')

In [19]:
# Load the modelling dataset (df) and the separate sample dataset (df_sample).
# The directory can be overridden with the KISTI_DATA_DIR environment variable so
# the notebook runs on other machines; when unset it falls back to the original
# local path, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    'KISTI_DATA_DIR',
    '/Users/keumhyeonjun/Projects/kisti_renewal/dataset_final',
)
df = pd.read_csv(os.path.join(DATA_DIR, 'Dataset_4_2016.csv'))
df_sample = pd.read_csv(os.path.join(DATA_DIR, 'testdata_4_2016.csv'))

In [20]:
# Names of the dataframe columns that are fed to the model as numeric feature
# columns. Entries are grouped by name prefix; the order is significant only in
# that it is the order in which feature columns are created.
NUMERIC_COLUMN = (
    ["Year", "Log_RnD_Fund", "Log_Duration"]
    + ["N_of_SCI", "N_of_Paper"]
    + ["N_Patent_App", "N_Patent_Reg", "N_of_Korean_Patent"]
    + ["STP_Code_1_Weight", "STP_Code_2_Weight"]
    + ["Application_Area_1_Weight", "Application_Area_2_Weight"]
)

In [21]:
# Names of the dataframe columns that are encoded through a vocabulary-list
# categorical column wrapped in an indicator column. Grouped by name prefix.
CATEGORICAL_COLUMN = (
    ["Multi_Year", "RnD_Org"]
    + ["STP_Code_11", "STP_Code_21"]
    + ["Application_Area_1", "Application_Area_2"]
    + ["Green_Tech", "SixT_2", "Econ_Social", "National_Strategy_2", "RnD_Stage"]
    + ["Cowork_Cor", "Cowork_Uni", "Cowork_Inst", "Cowork_Abroad", "Cowork_etc"]
)
# Target columns: the model emits one sigmoid output per entry (six in total).
LABEL_COLUMN = [
    "Comm_Success", "Comm_Success_1", "Comm_Success_2",
    "Comm_Success_Code1_4", "Comm_Success_Code2_5", "Comm_Success_Code3_6",
]

In [22]:
# Numeric inputs are declared as float64 numeric columns.
# NOTE(review): values are passed through unscaled (the scaler imports at the top
# of the notebook are commented out) -- confirm this is intended.
numeric_features = [
    feature_column.numeric_column(name, dtype=tf.dtypes.float64)
    for name in NUMERIC_COLUMN
]

# Each categorical input gets a vocabulary made of the distinct values present in
# df, wrapped in an indicator column so it can feed a dense layer.
categorical_features = [
    feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(name, df[name].unique())
    )
    for name in CATEGORICAL_COLUMN
]

# Numeric columns first, then categorical ones.
feature_columns = numeric_features + categorical_features

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

## Data Split & shuffle
# Fixed seed so the shuffled train/validation/test partition is identical on every
# run; without it each execution trains and evaluates on different rows.
SPLIT_SEED = 42

# 20% of df is held out for testing; 25% of the remaining 80% becomes the
# validation set, i.e. roughly 60/20/20 train/val/test overall.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=SPLIT_SEED)

# Model inputs are dicts mapping column name -> pandas Series, keyed by the same
# names used when the feature columns were declared.
x_train = dict(train_df[NUMERIC_COLUMN + CATEGORICAL_COLUMN])
x_test = dict(test_df[NUMERIC_COLUMN + CATEGORICAL_COLUMN])
x_val = dict(val_df[NUMERIC_COLUMN + CATEGORICAL_COLUMN])
x_sample = dict(df_sample[NUMERIC_COLUMN + CATEGORICAL_COLUMN])

# Targets stay as DataFrames with one column per entry of LABEL_COLUMN.
y_train = train_df[LABEL_COLUMN]
y_test = test_df[LABEL_COLUMN]
y_val = val_df[LABEL_COLUMN]
y_sample = df_sample[LABEL_COLUMN]

In [23]:
## EarlyStopping Condition
# Monitor the validation loss (validation data is supplied to model.fit) rather
# than the training loss, so that stopping and restore_best_weights are driven by
# held-out performance. Training stops after 50 epochs without any improvement
# (min_delta=0) and the weights from the best monitored epoch are restored.
cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=50,
    verbose=1,
    mode='min',
    baseline=None,
    restore_best_weights=True,
)

In [24]:
## Hyper-Parameter
# Training schedule: mini-batch size and maximum number of epochs.
Batch_Size, Epochs = 256, 200
# Network shape: width of every hidden Dense layer and the dropout rate after it.
Dim, Dropout = 256, 0.2
# Focal-loss alpha values passed to the loss at compile time (six entries,
# the same count as LABEL_COLUMN).
Alpha = [0.825, 0.9, 0.95, 0.99, 0.90, 0.925]

In [25]:
# Feed-forward classifier: feature layer -> 3 x (Dense relu + Dropout) -> sigmoid
# head with one unit per label column.
model = tf.keras.models.Sequential([feature_layer])

# Three identical hidden stages, added in the same order as a literal layer list.
for stage in range(3):
    model.add(tf.keras.layers.Dense(units=Dim, activation='relu'))
    model.add(tf.keras.layers.Dropout(rate=Dropout))

# Independent sigmoid per output; width follows the number of columns in y_train.
model.add(tf.keras.layers.Dense(units=y_train.shape[1], activation='sigmoid'))

In [26]:
# Sigmoid focal cross-entropy with gamma=2 and the alpha values from Alpha.
focal_loss = tfa.losses.SigmoidFocalCrossEntropy(alpha=Alpha, gamma=2)
model.compile(optimizer='adam', loss=focal_loss, metrics=METRICS)

# Train on the training split, validating on the validation split after each
# epoch; verbose=2 prints one summary line per epoch.
fit_options = dict(
    batch_size=Batch_Size,
    epochs=Epochs,
    verbose=2,
    validation_data=(x_val, y_val),
    callbacks=cb,
)
model.fit(x_train, y_train, **fit_options)

## Save model
#model.save('Project_Main')

Epoch 1/200
453/453 - 16s - loss: 5.3915 - f1_score: 0.0240 - val_loss: 5.4417 - val_f1_score: 0.0189 - 16s/epoch - 36ms/step
Epoch 2/200
453/453 - 13s - loss: 5.3890 - f1_score: 0.0190 - val_loss: 5.4417 - val_f1_score: 0.0189 - 13s/epoch - 29ms/step
Epoch 3/200
453/453 - 14s - loss: 5.3901 - f1_score: 0.0190 - val_loss: 5.4417 - val_f1_score: 0.0189 - 14s/epoch - 30ms/step
Epoch 4/200
453/453 - 14s - loss: 5.3896 - f1_score: 0.0190 - val_loss: 5.4417 - val_f1_score: 0.0189 - 14s/epoch - 30ms/step
Epoch 5/200
453/453 - 14s - loss: 5.3895 - f1_score: 0.0190 - val_loss: 5.4417 - val_f1_score: 0.0189 - 14s/epoch - 30ms/step
Epoch 6/200
453/453 - 14s - loss: 5.3895 - f1_score: 0.0190 - val_loss: 5.4417 - val_f1_score: 0.0189 - 14s/epoch - 31ms/step
Epoch 7/200
453/453 - 14s - loss: 5.3889 - f1_score: 0.0190 - val_loss: 5.4417 - val_f1_score: 0.0189 - 14s/epoch - 30ms/step
Epoch 8/200
453/453 - 14s - loss: 5.3885 - f1_score: 0.0190 - val_loss: 5.4417 - val_f1_score: 0.0189 - 14s/epoch - 30

KeyboardInterrupt: 

In [None]:
# Evaluate the trained multi-label classifier on the held-out test split.
# This cell uses only names the notebook defines or imports (model, x_test, y_test,
# Batch_Size, LABEL_COLUMN and the sklearn metrics) and does not overwrite the
# source dataframe `df`.
# Assumes every label column holds binary 0/1 values -- TODO confirm.
y_true = y_test.to_numpy()
y_prob = model.predict(x_test, batch_size=Batch_Size, verbose=0)
# Binarise with the same 0.5 threshold that the training-time F1 metric uses.
y_pred = (y_prob > 0.5).astype(int)

# Micro-averaged scores, matching the averaging mode of the training-time metric.
# zero_division=0 avoids warnings when a label is never predicted.
precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
print(f'micro precision={precision:.4f}  recall={recall:.4f}  f1={f1:.4f}')

# Per-label breakdown, one row per label column.
eval_df = pd.DataFrame(
    {
        'precision': precision_score(y_true, y_pred, average=None, zero_division=0),
        'recall': recall_score(y_true, y_pred, average=None, zero_division=0),
        'f1': f1_score(y_true, y_pred, average=None, zero_division=0),
    },
    index=LABEL_COLUMN,
)
eval_df