### Machine Translation (English to French)
#### 1. 데이터 불러오기

In [11]:
import collections
from jupyter_helpers import *
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [None]:
# GPU check: print the devices that TensorFlow reports as locally available.
from tensorflow.python.client import device_lib

local_devices = device_lib.list_local_devices()
print(local_devices)

In [13]:
import os

def load_data(path):
    """Read a UTF-8 text file and return its content split on newlines.

    :param path: Location of the text file to read.
    :return: List of strings, one per '\\n'-separated line. A file that
        ends with a newline therefore yields a trailing empty string.
    """
    source = os.path.join(path)
    with open(source, mode="r", encoding='utf-8') as handle:
        lines = handle.read().split('\n')
    return lines

In [15]:
# Load the English corpus first, then the French corpus, from their fixed locations.
english_sentences, french_sentences = (
    load_data(corpus_path)
    for corpus_path in (
        'C:/Users/mark/Desktop/machine_translator_project/data/small_vocab_en',
        'C:/Users/mark/Desktop/machine_translator_project/data/small_vocab_fr',
    )
)

print('Dataset Loaded')

Dataset Loaded


#### 2. small_vocab_en, small_vocab_fr 데이터 첫번째, 두번째 줄 확인

In [16]:
# Show the first two lines of each corpus side by side.
# The second label now names the French file (small_vocab_fr); it was
# previously mislabelled as small_vocab_kr.
for line_no, (en_sentence, fr_sentence) in enumerate(
        zip(english_sentences[:2], french_sentences[:2]), start=1):
    print('small_vocab_en {}번째 줄:  {}'.format(line_no, en_sentence))
    print('small_vocab_fr {}번째 줄:  {}'.format(line_no, fr_sentence))

small_vocab_en 1번째 줄:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_kr 1번째 줄:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en 2번째 줄:  the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_kr 2번째 줄:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


#### 학습하는 단어의 수가 많을수록(다양할수록) 각 특성을 파악하는 일이 많아진다.
#### 3. 데이터 셋을 파악한다.

In [17]:
# Count how often each whitespace-separated word occurs in each corpus.
english_words_counter = collections.Counter(
    word for sentence in english_sentences for word in sentence.split())
french_words_counter = collections.Counter(
    word for sentence in french_sentences for word in sentence.split())

# The total number of words equals the sum of the per-word counts, so the
# corpus does not have to be tokenized a second time just to be measured.
print('{}개의 영단어'.format(sum(english_words_counter.values())))
print('{}개의 유일한 영단어'.format(len(english_words_counter)))
print('가장 출현빈도가 높은 10개의 영단어: ')
# Unpack the (word, count) pairs directly so an empty corpus does not raise.
print('"' + '" "'.join(word for word, _ in english_words_counter.most_common(10)) + '"')
print()

print('{}개의 프랑스어단어'.format(sum(french_words_counter.values())))
print('{}개의 유일한 프랑스어단어'.format(len(french_words_counter)))
print('가장 출현빈도가 높은 10개의 프랑스어 단어: ')
print('"' + '" "'.join(word for word, _ in french_words_counter.most_common(10)) + '"')

1823250개의 영단어
227개의 유일한 영단어
가장 출현빈도가 높은 10개의 영단어: 
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295개의 프랑스어단어
355개의 유일한 프랑스어단어
가장 출현빈도가 높은 10개의 프랑스어 단어: 
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


#### 4. 단어의 토큰화(Tokenize)
#### keras의 tokenizer 기능을 사용하여 각 문장을 단어ID 값으로 변환한다.

In [19]:
def tokenize(x):
    """Fit a Keras ``Tokenizer`` on ``x`` and convert ``x`` to id sequences.

    :param x: Texts (sentences) to fit the tokenizer on and to convert.
    :return: Tuple ``(sequences, tokenizer)`` holding the result of
        ``texts_to_sequences(x)`` and the fitted tokenizer itself.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

# Small hand-written sample used to inspect what tokenize() produces.
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .',
]
text_tokenized, text_tokenizer = tokenize(text_sentences)

# Print the learned word -> id mapping, then each sentence next to its ids.
print(text_tokenizer.word_index)
print()
for sequence_no, (sent, token_sent) in enumerate(
        zip(text_sentences, text_tokenized), start=1):
    print('Sequence {} in x'.format(sequence_no))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


#### 5. Padding 
#### 다양한 문장의 길이를 동일한 길이로 통일시킨다. 각 시퀀스의 끝에 패딩을 추가하였다.

In [20]:
def pad(x, length=None):
    """Pad every sequence in ``x`` at its end so all share one length.

    :param x: List of sequences (lists of word ids).
    :param length: Target length. When ``None``, the length of the longest
        sequence in ``x`` is used.
    :return: NumPy array built from the output of Keras ``pad_sequences``
        called with ``padding='post'``.
    """
    # Identity check: `== None` can be overridden by the operand's __eq__.
    if length is None:
        length = max(map(len, x))

    # Both cases pad the same way once the target length is known.
    padded = pad_sequences(x, maxlen=length, padding='post')
    return np.array(padded)

# Pad the sample token sequences and print each one before and after padding.
test_pad = pad(text_tokenized)
for sequence_no, (token_sent, pad_sent) in enumerate(
        zip(text_tokenized, test_pad), start=1):
    # Example of the desired result: [1 2 4 5 6 7 8 9 0]
    print('Sequence {} in x'.format(sequence_no))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]
