In [67]:
import os
import sys
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
sys.path.append(os.pardir)
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from models.word_embedding import get_embedding_matrix
import pandas as pd

In [2]:
# Saved gensim Word2Vec model; `word_vectorizer` is the instance passed to
# model_build() in the later model-building cells.
w2v_model_name = '../model_save/embedding_model/Word2vec1.model'
word_vectorizer = Word2Vec.load(w2v_model_name)

# A second saved Word2Vec model.
# NOTE(review): `pre_trained_w2v` is not referenced by any cell visible in this
# notebook — confirm whether it is still needed or should be passed to a model.
pre_trained_name = '../model_save/embedding_model/trained_word2vec1.model'
pre_trained_w2v = Word2Vec.load(pre_trained_name)

In [3]:
# Directory containing the training inputs (.npy) and the data configuration JSON.
DATA_IN_PATH = '../assets/data/npy_data/2020-05-31/'
# Directory containing the label JSON files.
LABEL_IN_PATH = '../assets/label_data/'

# Configuration JSON; the loading cell reads its 'vocab_size' and 'vocab' entries.
DATA_CONFIGS = 'data_configs.json'
# File names that are appended to the directories above when loading.
TRAIN_INPUT_DATA = 'train_input.npy'
LABEL_DATA = 'label.json'
LABEL_DATA_SMALL = 'label_small.json'
# Evaluation inputs; unlike the names above, this is a complete relative path used as-is.
TEST_DATA = './application_data/professor.npy'

In [4]:
# Load the training inputs, the data configuration and both label tables.
# The .npy file is loaded by path (NumPy opens and closes it itself) and each JSON
# file is read inside a `with` block, so no file handle is left open afterwards.
train_X = np.load(DATA_IN_PATH + TRAIN_INPUT_DATA)
with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as config_file:
    data_configs = json.load(config_file)
with open(LABEL_IN_PATH + LABEL_DATA, 'r') as label_file:
    label_data = json.load(label_file)
with open(LABEL_IN_PATH + LABEL_DATA_SMALL, 'r') as small_label_file:
    small_label_data = json.load(small_label_file)

# One more than the stored vocabulary size.
# NOTE(review): presumably reserves an extra index (e.g. padding) — confirm.
vocab_size = data_configs['vocab_size'] + 1
print("vocab_size : ", vocab_size)
word_index = data_configs['vocab']

vocab_size :  30079


In [5]:
# Keyword arguments unpacked into model_build() when the CNN model is created.
cnn_kargs = dict(
    vocab_size=vocab_size,
    embedding_size=300,
    num_filters=128,
    dropout_rate=0.5,
    hidden_dimension=500,
    train_mode='non_static',
    output_dimension=43,
    trainable=True,
    optimizer='adam',
    model_name='cnn_non_static_adam_w2v',
)

In [6]:
# Keyword arguments unpacked into model_build() when the RNN (LSTM) model is created.
rnn_kargs = dict(
    vocab_size=vocab_size,
    embedding_size=300,
    dropout_rate=0.5,
    lstm_dimension=128,
    dense_dimension=64,
    train_mode='pt',
    output_dimension=43,
    optimizer='radam',
    model_name='lstm_pt_radam_pt_w2v',
)

In [7]:
from models.model_evaluation import model_build, return_two_calss_acc

In [8]:
# Build the CNN model from the training inputs, the vocabulary mapping and the loaded
# Word2Vec model, configured by cnn_kargs. The call prints the model summary shown below.
cnn_model = model_build(train_X, word_index, word_vectorizer, **cnn_kargs)

word index size :  30079
27423 2655
cnn
big_class
Model: "cnn_non_static_adam_w2v"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  9023700   
_________________________________________________________________
conv1d (Conv1D)              multiple                  115328    
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  153728    
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  192128    
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dropout (Dropout)            multiple                  0         
___________________________________________________________

In [69]:
# Benchmark spreadsheet; its '뉴스제목' column is later shown next to the predicted labels.
professor_df = pd.read_excel('./application_data/application_benchmark.xlsx')

In [68]:
# Load the evaluation inputs by path: NumPy opens and closes the file itself,
# so no file handle is left dangling. Then score the inputs with the CNN model.
evaluation_data = np.load(TEST_DATA)
pred = cnn_model.predict(evaluation_data)

In [70]:
# Sorting the negated scores ascending puts the highest-scoring classes first;
# keep the first two class indices of every row.
label_df = pd.DataFrame(np.argsort(-pred, axis=1)[:, :2], columns=['pred1', 'pred2'])
# Put the indices beside the benchmark rows, turn each index into its label name via
# label_data (keyed by the index as a string), and keep only the title and the two labels.
app_data = (
    pd.concat([professor_df, label_df], axis=1)
    .assign(
        pred2=lambda frame: frame['pred2'].apply(str).map(label_data),
        pred1=lambda frame: frame['pred1'].apply(str).map(label_data),
    )
    .loc[:, ['뉴스제목', 'pred1', 'pred2']]
)

### RNN model

In [71]:
# Build the RNN model with the same inputs as the CNN, configured by rnn_kargs.
# NOTE(review): rnn_kargs sets train_mode 'pt' and a "pt_w2v" model name, yet this call
# passes word_vectorizer rather than the separately loaded pre_trained_w2v — confirm
# which embedding model is intended here.
rnn_model = model_build(train_X, word_index, word_vectorizer, **rnn_kargs)

word index size :  30079
27423 2655
rnn
big_class
Model: "lstm_pt_radam_pt_w2v"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      multiple                  9023700   
_________________________________________________________________
bidirectional_2 (Bidirection multiple                  439296    
_________________________________________________________________
bidirectional_3 (Bidirection multiple                  394240    
_________________________________________________________________
dropout_2 (Dropout)          multiple                  0         
_________________________________________________________________
dense_4 (Dense)              multiple                  16448     
_________________________________________________________________
dense_5 (Dense)              multiple                  2795      
Total params: 9,876,479
Trainable params: 9,876,479
Non-traina

In [72]:
# Re-reads the same benchmark spreadsheet that the CNN section already loaded into
# professor_df; the RNN results below are joined onto this copy.
professor_df = pd.read_excel('./application_data/application_benchmark.xlsx')

In [73]:
# Load the evaluation inputs by path: NumPy opens and closes the file itself,
# so no file handle is left dangling. Then score the inputs with the RNN model.
evaluation_data = np.load(TEST_DATA)
pred = rnn_model.predict(evaluation_data)

In [74]:
# Sorting the negated scores ascending puts the highest-scoring classes first;
# keep the first two class indices of every row.
label_df = pd.DataFrame(np.argsort(-pred, axis=1)[:, :2], columns=['pred1', 'pred2'])
# Put the indices beside the benchmark rows, turn each index into its label name via
# label_data (keyed by the index as a string), and keep only the title and the two labels.
app_data = (
    pd.concat([professor_df, label_df], axis=1)
    .assign(
        pred2=lambda frame: frame['pred2'].apply(str).map(label_data),
        pred1=lambda frame: frame['pred1'].apply(str).map(label_data),
    )
    .loc[:, ['뉴스제목', 'pred1', 'pred2']]
)

In [75]:
# Display the table built in the previous cell: '뉴스제목' plus the two
# top-ranked labels (pred1, pred2) from the RNN model's predictions.
app_data

Unnamed: 0,뉴스제목,pred1,pred2
0,인하대 정보통신공학과 박인규 교수 연구팀 “미래동작 예측 인공지능기술개발 나서”,스마트미디어기기,로봇
1,세종대 교수 창업기업 자기센서 기술 세계 최초 개발,임베디드SW,스마트팩토리
2,UNIST 이상영-곽상규 교수팀 유기 골격 구조체 기반 고체 이온전도체 개발,스마트팩토리,정밀기계
3,한양대 장재영 교수 고분자로 만든 고성능 열전소재 개발,스마트팩토리,3D프린팅
4,울산과학기술원 박혜성-김건태-곽상규 교수팀 안정성 끝판왕 이종구조 수전해 촉매 개발,LED/광,디스플레이
...,...,...,...
728,전남대 윤경철 교수팀 안구건조 막는 콘택트렌즈 개발 나서,스마트시티,안전
729,우석대 양갑식 교수팀 인진쑥 이용 통풍 치료법 세계 첫 개발,반도체,바이오/헬스케어
730,화순전남대병원 민정준 교수 김동연 박사 연구팀 ‘악성 흑색종’ 탐지 초고감도 PET...,바이오/헬스케어,세라믹소재
731,이세중 대구한의대 교수팀 병원균 감염질환 제어 나노약물 전달시스템 개발,바이오/헬스케어,안전
