In [1]:
!pip install -q tokenizers
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM, Embedding, Masking
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers, processors
from sklearn.model_selection import train_test_split
import pickle
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Base directory on the mounted drive; it is also used for every artifact saved later.
path = '/content/drive/MyDrive/praca/c_train2'

# Tab-separated corpus; keep only the first row for each distinct natural-language description.
data = (
    pd.read_csv(f'{path}/c_train2.csv', sep='\t', encoding='utf-8')
    .drop_duplicates(subset=['nl'])
    .reset_index(drop=True)
)

# Parallel arrays: x holds the descriptions ('nl' column), y the matching 'code' column.
x = data['nl'].to_numpy()
y = data['code'].to_numpy()

In [6]:
# WordPiece tokenizer for the natural-language descriptions (array `x`).
nl_tokenizer = Tokenizer(models.WordPiece(unk_token='[UNK]'))
nl_tokenizer.normalizer = normalizers.BertNormalizer(clean_text = False)
# Split on whitespace only, so punctuation stays attached to its neighbouring characters.
nl_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
nl_tokenizer.decoder = decoders.WordPiece(prefix='##')
trainer = trainers.WordPieceTrainer(
    vocab_size=8000,
    show_progress=True,
    # '[PAD]' is listed first; the padding/masking code in this notebook treats id 0 as padding.
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    continuing_subword_prefix='##'
)
nl_tokenizer.train_from_iterator(x, trainer=trainer)
# BertProcessing expects its arguments in the order (sep, cls). Passing the
# [CLS] pair first wraps every sequence as "[SEP] ... [CLS]" instead of
# "[CLS] ... [SEP]", so the pairs are given in the documented order here.
nl_tokenizer.post_processor = processors.BertProcessing(
    ('[SEP]', nl_tokenizer.token_to_id('[SEP]')),
    ('[CLS]', nl_tokenizer.token_to_id('[CLS]')),
)

In [7]:
# WordPiece tokenizer for the target source code (array `y`).
code_tokenizer = Tokenizer(models.WordPiece(unk_token='[UNK]'))
# NOTE(review): BertNormalizer lowercases by default, which is lossy for
# case-sensitive source code -- confirm this is intended for the targets.
code_tokenizer.normalizer = normalizers.BertNormalizer(clean_text = False)
# Split on whitespace only, so operators and brackets are not separated by the pre-tokenizer.
code_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
code_tokenizer.decoder = decoders.WordPiece(prefix='##')
trainer = trainers.WordPieceTrainer(
    vocab_size=8000,
    show_progress=True,
    # '[PAD]' is listed first; the padding/masking code in this notebook treats id 0 as padding.
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    continuing_subword_prefix='##'
)
code_tokenizer.train_from_iterator(y, trainer=trainer)
# BertProcessing expects its arguments in the order (sep, cls). Passing the
# [CLS] pair first wraps every sequence as "[SEP] ... [CLS]" instead of
# "[CLS] ... [SEP]", so the pairs are given in the documented order here.
code_tokenizer.post_processor = processors.BertProcessing(
    ('[SEP]', code_tokenizer.token_to_id('[SEP]')),
    ('[CLS]', code_tokenizer.token_to_id('[CLS]')),
)

In [9]:
# Persist both trained tokenizers next to the dataset so later runs can reload them.
tokenizer_targets = (
    (f'{path}/nl_tokenizer.json', nl_tokenizer),
    (f'{path}/code_tokenizer.json', code_tokenizer),
)
for target_file, trained_tokenizer in tokenizer_targets:
    trained_tokenizer.save(target_file)

In [None]:
# The length filter must look at unpadded encodings. Padding stays enabled on
# the tokenizer objects once this cell has run, so switch it off first to keep
# the cell re-runnable.
nl_tokenizer.no_padding()
code_tokenizer.no_padding()

# Keep only pairs whose description AND solution are shorter than this many tokens.
length=400

# Collect plain token counts. Wrapping the ragged id lists themselves in
# np.array() raises on recent NumPy releases, and only the lengths are needed.
nl_lengths = np.array([len(encoding.ids) for encoding in nl_tokenizer.encode_batch(x)])
code_lengths = np.array([len(encoding.ids) for encoding in code_tokenizer.encode_batch(y)])

# One shared mask keeps x and y aligned.
keep = (code_lengths < length) & (nl_lengths < length)
x = x[keep]
y = y[keep]
del nl_lengths
del code_lengths

# With padding enabled every batch is padded to a common length, so the id
# lists form rectangular integer arrays.
nl_tokenizer.enable_padding()
code_tokenizer.enable_padding()
x_tokenized = np.array([encoding.ids for encoding in nl_tokenizer.encode_batch(x)])
y_tokenized = np.array([encoding.ids for encoding in code_tokenizer.encode_batch(y)])

In [None]:
# First carve off 20 % of the pairs, then halve that part into test and validation sets.
X_train, X_rest, y_train, y_rest = train_test_split(x_tokenized, y_tokenized, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

# Store every split as an .npy file in the dataset directory.
split_files = (
    (f'{path}/X_train.npy', X_train),
    (f'{path}/y_train.npy', y_train),
    (f'{path}/X_test.npy', X_test),
    (f'{path}/y_test.npy', y_test),
    (f'{path}/X_val.npy', X_val),
    (f'{path}/y_val.npy', y_val),
)
for split_file, split_array in split_files:
    np.save(split_file, split_array)

# Vocabulary sizes of the trained tokenizers; the final expression displays them as the cell output.
input_vocab_size = nl_tokenizer.get_vocab_size()
output_vocab_size = code_tokenizer.get_vocab_size()
input_vocab_size, output_vocab_size

(8000, 4865)

In [None]:
# Sequence widths are taken from the second axis of the padded training arrays.
input_length = X_train.shape[1]
output_length = y_train.shape[1]

# Encoder -> repeated context vector -> stacked bidirectional decoder -> per-step logits.
model = Sequential()
# Time steps whose single feature equals 0 are masked.
model.add(Masking(mask_value=0, input_shape=(input_length, 1)))
# The encoder collapses the whole input sequence into one vector of size input_length.
model.add(LSTM(input_length, return_sequences=False))
# Feed that vector to the decoder once for every output position.
model.add(RepeatVector(output_length))
model.add(Bidirectional(LSTM(512, return_sequences=True), merge_mode='sum'))
model.add(Bidirectional(LSTM(768, return_sequences=True), merge_mode='sum'))
model.add(Dropout(0.2))
model.add(TimeDistributed(Dense(1024)))
model.add(Dropout(0.2))
# No softmax here: the layer emits one raw score per entry of the output vocabulary.
model.add(TimeDistributed(Dense(units=output_vocab_size)))

model.compile()
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_3 (Masking)         (None, 398, 1)            0         
                                                                 
 lstm_7 (LSTM)               (None, 398)               636800    
                                                                 
 repeat_vector_3 (RepeatVect  (None, 399, 398)         0         
 or)                                                             
                                                                 
 bidirectional_4 (Bidirectio  (None, 399, 512)         3731456   
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 399, 768)         7870464   
 nal)                                                            
                                                      

In [None]:
def loss_function(x, y):
  """Sparse categorical cross-entropy averaged over non-padding target tokens.

  Args:
    x: predicted logits (the loss is built with from_logits=True).
    y: integer target token ids; positions equal to 0 are treated as padding
      and do not contribute to the loss.

  Returns:
    Scalar tensor with the mean per-token loss over positions where y != 0,
    or 0 when the batch contains no such position.
  """
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  loss = cross_entropy(y_true=y, y_pred=x)
  mask = tf.cast(tf.not_equal(y, 0), dtype=loss.dtype)
  # Divide by the number of real tokens rather than by every position:
  # a plain reduce_mean over the masked loss shrinks with the amount of padding
  # in the batch, which makes losses incomparable across batches.
  return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

# Shuffle buffer and mini-batch size shared by the training and validation pipelines.
buffer_size = 32000
batch_size = 150

# Count of non-zero target ids per training sample; it travels with each batch as a third element.
Y_len = np.count_nonzero(y_train, axis=1)

train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train, Y_len))
train_ds = train_ds.shuffle(buffer_size=buffer_size)
train_ds = train_ds.batch(batch_size=batch_size)

valid_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val, np.count_nonzero(y_val, axis=1)))
valid_ds = valid_ds.shuffle(buffer_size=buffer_size)
valid_ds = valid_ds.batch(batch_size=batch_size)

# Adam with its default hyper-parameters.
optimizer = tf.keras.optimizers.Adam()

In [None]:
# Custom training loop: one optimizer step per batch, plus a validation pass
# and a checkpoint after every 10th epoch.
for epoch in range(5000):
    avg_loss = 0
    training_step = 0
    for x_train2, y_train2, data_len in train_ds:
        with tf.GradientTape() as tape:
            # training=True must be passed explicitly in a hand-written loop,
            # otherwise the Dropout layers run in inference mode and do nothing.
            loss = loss_function(model(x_train2, training=True), y_train2)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
        avg_loss += float(loss)
        training_step += 1
    avg_loss /= training_step

    if (epoch + 1) % 10 == 0:
        avg_val_loss = 0
        val_training_step = 0
        for x_valid2, y_valid2, data_len2 in valid_ds:
            val_loss = loss_function(model(x_valid2, training=False), y_valid2)
            # Accumulate the validation loss itself. The previous code added
            # `loss`, i.e. the last training batch's loss, so the reported
            # val_loss never reflected the validation data.
            avg_val_loss += float(val_loss)
            val_training_step += 1
        avg_val_loss /= val_training_step
        print('val_loss: {:.3f}'.format(avg_val_loss))
        # Report the actual 1-based epoch number (it was divided by 100 before).
        print('Epoch: {:3}, tr_loss: {:.3f}'.format(epoch + 1, avg_loss))
        model.save(f'{path}/c_model2.h5')

In [12]:
# Qualitative check on one held-out sample: greedy (arg-max) decoding of the model output.
sample_num = 12
sample_batch = np.expand_dims(X_test[sample_num], axis=0)  # add a leading batch axis of size 1
y_pred = model.predict(sample_batch, verbose=0)
y_pred = y_pred.argmax(axis=-1)  # most likely token id at every output position

print(nl_tokenizer.decode(X_test[sample_num]))    # input description
print(code_tokenizer.decode(y_pred[0]))           # predicted code
print(code_tokenizer.decode(y_test[sample_num]))  # reference code

there are n mountains in a circle called mountain 1 mountain 2 mountain n in clockwise order n is an odd number between these mountains there are n damc called dam 1 dam 2 dam n dam i 1 leq i leq n is located between mountain i and i+1 mountain n+1 is mountain 1 when mountain i 1 leq i leq n receives 50000 liters of rain dam i-1 and dam i each accumulates x liters of water dam 0 is dam n one day each of the mountains received a non-negative even number of liters of rain as a result dam i 1 leq i leq n associated a total of ai liters of water find the amount of rain each of the mountains received we can chords that the solution is unique under the constraints of this problem
# ( ) ( + ] id5 0, id6 ; id2 < id5 id5 id6 id6 id6 ; ( id6 ( ", id2 id0, id2 [ id2 ], id2 [ id2 ], id3 [ id6 ] id6 [ id6 id6 + id6 [ id6 id6 ( ] [ )? ] [ id4 ( ( ] [ id4 [ id8 ] id4 + id4 id4 id4 id4, id0 ( id0 ( id0 ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ) ) ) ) ) ) ) ) ) ) ) ) ) ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( += += 

In [13]:
# Same qualitative check on a training sample, to compare against the held-out behaviour.
sample_num = 0
sample_batch = np.expand_dims(X_train[sample_num], axis=0)  # add a leading batch axis of size 1
y_pred = model.predict(sample_batch, verbose=0)
y_pred = y_pred.argmax(axis=-1)  # most likely token id at every output position

print(nl_tokenizer.decode(X_train[sample_num]))    # input description
print(code_tokenizer.decode(y_pred[0]))            # predicted code
print(code_tokenizer.decode(y_train[sample_num]))  # reference code

for given three points p1 p2 p find the projection point x of p onto p1p2
# include < id4 > id10 ( ) { > double id0, id12, id7, id13, id2, id12 ; int, ( ) { scanf (, %lf%lf%lf%lf ;, & " id11 ; id13 ( " -c\"input();print('yneos'[,, id12 id5 ; id2 -=, & id12, & ; double = = = ; - id2 = id12 ; id2 = id5 ; id12 = id5 ; ; ; id5 ; scanf scanf " " " & & id1 id1 id1 ; ; ; ; = = = ; ; ; ; ; ; " " " " " " " & & & & id11 id11 ; ; ; ; = = * * * * * * * * * * * + + + + + + + + + + + + + + + + + + } } } } id12 id12 id12 id12 id12 id12 id12 id12 id12 id12 id12 id12 * * * * * * * * * * * * id12 id12 id12 id12 + + + + %.9llf\n %.9llf\n %.9llf\n %.9llf\n 557 557 ) )
# include < id3 > # include < id6 > # define id13 id5 double id0, id13, id2, id12 ; int id7 ( ) { scanf ( " %lf%lf%lf%lf ", & id0, & id13, & id2, & id12 ) ; id2 -= id0 ; id12 -= id13 ; double id8 = id10 ( id2, id12 ) ; id2 = id8 ; id12 = id8 ; int id1 ; scanf ( " %d ", & id1 ) ; while ( id1 -- ) { double id4, id11 ; scanf ( " %lf%lf ", & id4