In [None]:
!pip install -q transformers
!pip install -U -q tensorflow-text==2.9.0

[K     |████████████████████████████████| 5.8 MB 19.6 MB/s 
[K     |████████████████████████████████| 182 kB 67.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 26.3 MB/s 
[K     |████████████████████████████████| 4.6 MB 35.6 MB/s 
[?25h

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM, Embedding, Masking
from transformers import TFBertTokenizer, BertTokenizer, BertForMaskedLM
from sklearn.model_selection import train_test_split
import pickle
from google.colab import drive
# Mount Google Drive at /content/drive; the data files read and written below
# live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Two tokenizers built from the same pretrained checkpoint:
#   tokenizer   - TFBertTokenizer, used below to turn text into token ids
#   detokenizer - BertTokenizer, used at the end to decode ids back to text
bert_checkpoint = "bert-base-uncased"
tokenizer, detokenizer = (
    TFBertTokenizer.from_pretrained(bert_checkpoint),
    BertTokenizer.from_pretrained(bert_checkpoint),
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Load the two parallel corpora (x = inputs, y = targets) from Drive.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on files you trust.
drive_root = '/content/drive/MyDrive'
x, y = (
    np.load(f'{drive_root}/{stem}.npy', allow_pickle=True)
    for stem in ('x', 'y')
)
x.shape

(251820,)

In [None]:
# Every sample is joined into a single space-separated string and the whole
# corpus is tokenised in one call; only the 'input_ids' output is kept.
# The raw arrays are deleted as soon as they have been tokenised to free memory.
x_sentences = [" ".join(words) for words in x]
x_tokenized = tokenizer(x_sentences)['input_ids']
del x, x_sentences

y_sentences = [" ".join(words) for words in y]
y_tokenized = tokenizer(y_sentences)['input_ids']
del y, y_sentences

In [None]:
# Keep only the pairs whose target has fewer than `max_target_tokens` non-zero
# token ids, then cut each id matrix down to the width of its longest
# remaining row (measured as the number of non-zero ids per row).
max_target_tokens = 80
count = np.count_nonzero(y_tokenized, axis=1)
keep = count < max_target_tokens
x_tokenized = x_tokenized[keep]
y_tokenized = y_tokenized[keep]
x_tk_len = np.count_nonzero(x_tokenized, axis=1).max()
y_tk_len = np.count_nonzero(y_tokenized, axis=1).max()

# One column slice replaces the former row-by-row Python loop.
# .copy() gives each array its own contiguous buffer, which matters because
# the arrays are overwritten in place when the ids are remapped later.
test_x = x_tokenized.numpy()[:, :x_tk_len].copy()
test_y = y_tokenized.numpy()[:, :y_tk_len].copy()
del temp

In [None]:
def create_word_dict(tokens):
  """Give every distinct item of `tokens` a consecutive integer id.

  Ids start at 0 and are handed out in order of first appearance; an item
  that shows up again keeps the id from its first occurrence.

  Args:
    tokens: iterable of hashable items.

  Returns:
    dict mapping each distinct item to its id.
  """
  # dict.fromkeys de-duplicates while preserving first-seen order.
  first_seen = dict.fromkeys(tokens)
  return {token: index for index, token in enumerate(first_seen)}

# Build a compact vocabulary per side: each token id that occurs in the data
# is mapped to its rank among the occurring ids.
# np.unique on the full matrix already returns the sorted distinct values, so
# the extra row-wise np.unique(..., axis=0) pass is not needed.
x_vocab = np.unique(test_x)
y_vocab = np.unique(test_y)
x_dict = create_word_dict(x_vocab)
y_dict = create_word_dict(y_vocab)

# Because the vocab arrays are sorted and every value of test_x / test_y is
# present in them, np.searchsorted returns exactly the id stored in the
# dictionaries. A single vectorised lookup replaces the per-element Python
# loops. Assigning through [...] keeps the update in place and preserves dtype.
test_x[...] = np.searchsorted(x_vocab, test_x)
test_y[...] = np.searchsorted(y_vocab, test_y)

del x_tokenized
del y_tokenized

# Hold out 5% of the pairs, then split that hold-out evenly into test and
# validation sets. Both splits use a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(test_x, test_y, test_size=0.05, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Each distinct token got exactly one id, so the dictionary sizes are the
# vocabulary sizes. This avoids re-sorting the full arrays with np.unique.
input_vocab_size = len(x_dict)
output_vocab_size = len(y_dict)

# Inverse lookups (compact id -> original token id), used when decoding.
inv_x = {v: k for k, v in x_dict.items()}
inv_y = {v: k for k, v in y_dict.items()}
input_vocab_size, output_vocab_size

(11171, 11534)

In [None]:
path = '/content/drive/MyDrive/py_train'

# Persist the six dataset splits as <name>.npy under `path`.
splits = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
    'X_val': X_val,
    'y_val': y_val,
}
for split_name, split_array in splits.items():
    np.save(f'{path}/{split_name}.npy', split_array)

# Persist the token-id dictionaries as pickles alongside the arrays.
for dict_name, mapping in (('x_dict', x_dict), ('y_dict', y_dict)):
    with open(f'{path}/{dict_name}.pkl', 'wb') as f:
        pickle.dump(mapping, f)

In [None]:
input_length = X_train.shape[1]
output_length = y_train.shape[1]

# Encoder: masks timesteps equal to 0 and summarises the input sequence into
# one vector with a bidirectional LSTM (return_sequences=False).
# NOTE(review): input_shape=(input_length, 1) means each token id enters the
# LSTM as a single numeric feature; Embedding is imported but not used here.
# Confirm this is intended.
encoder_layers = [
    Masking(mask_value=0, input_shape=(input_length, 1)),
    Bidirectional(LSTM(input_length, return_sequences=False)),
]

# Decoder: repeats the summary vector once per output position, runs two
# stacked LSTMs and projects every position onto the output vocabulary.
# The last Dense layer has no activation, so it emits raw scores.
decoder_layers = [
    RepeatVector(output_length),
    LSTM(512, return_sequences=True),
    LSTM(1024, return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(1024)),
    Dropout(0.2),
    TimeDistributed(Dense(units=output_vocab_size)),
]

model = Sequential(encoder_layers + decoder_layers)
# compile() is called without a loss or optimizer.
model.compile()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 141, 1)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 282)              161304    
 l)                                                              
                                                                 
 repeat_vector (RepeatVector  (None, 79, 282)          0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 79, 512)           1628160   
                                                                 
 lstm_2 (LSTM)               (None, 79, 1024)          6295552   
                                                                 
 dropout (Dropout)           (None, 79, 1024)          0

In [None]:
def loss_function(x, y):
  """Sparse categorical cross-entropy that ignores positions where `y` is 0.

  Args:
    x: model outputs, passed to the loss as `y_pred` and treated as logits.
    y: integer targets, passed as `y_true`; positions equal to 0 contribute
      zero loss.

  Returns:
    Scalar tensor: the mean of the masked per-position losses.
  """
  per_position = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true=y, y_pred=x)
  keep = tf.cast(tf.logical_not(tf.math.equal(y, 0)), dtype=per_position.dtype)
  # NOTE(review): reduce_mean divides by every position, masked ones included,
  # so the value shrinks as the share of zeros in `y` grows. Confirm this is
  # the intended normalisation.
  return tf.reduce_mean(keep * per_position)

buffer_size = 32000
batch_size = 240

# Each dataset element is (inputs, targets, number of non-zero target ids).
Y_len = np.count_nonzero(y_train, axis=1)
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train, Y_len))
    .shuffle(buffer_size=buffer_size)
    .batch(batch_size=batch_size)
)

val_len = np.count_nonzero(y_val, axis=1)
valid_ds = (
    tf.data.Dataset.from_tensor_slices((X_val, y_val, val_len))
    .shuffle(buffer_size=buffer_size)
    .batch(batch_size=batch_size)
)

optimizer = tf.keras.optimizers.Adam()

In [None]:
# Custom training loop: one pass over train_ds per epoch. Every 10th epoch
# the validation loss is measured, both averages are printed and the model
# is saved.
for epoch in range(5000):
  avg_loss = 0
  training_step = 0
  for x_train2, y_train2, data_len in train_ds:
      with tf.GradientTape() as tape:
          # training=True is required when calling the model directly,
          # otherwise the Dropout layers stay inactive during training.
          loss = loss_function(model(x_train2, training=True), y_train2)
      grads = tape.gradient(loss, model.trainable_variables)
      optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
      avg_loss += loss
      training_step += 1
  avg_loss /= training_step

  if (epoch + 1) % 10 == 0:
      avg_val_loss = 0
      val_training_step = 0
      for x_valid2, y_valid2, data_len2 in valid_ds:
          val_loss = loss_function(model(x_valid2, training=False), y_valid2)
          # Accumulate the validation loss itself. The previous code added
          # the last training-batch loss here, so the printed figure was wrong.
          avg_val_loss += val_loss
          val_training_step += 1
      avg_val_loss /= val_training_step
      print('val_loss: {:.3f}'.format(avg_val_loss))
      # The "Epoch" value printed here is (epoch + 1) / 10, i.e. it counts
      # blocks of ten epochs.
      print('Epoch: {:3}, tr_loss: {:.3f}'.format((epoch+1)/10, avg_loss))
      # NOTE(review): this path is outside the mounted Drive folder;
      # presumably it does not survive the Colab session. Confirm.
      model.save('/content/python_model.h5')

In [None]:
def remove_padding(sentence):
  """Strip the leading token and everything from the first '[SEP]' onwards.

  The sentence is split on single spaces. The first piece is always dropped,
  and the remaining pieces are kept up to (not including) the first '[SEP]'.
  If there is no '[SEP]', all remaining pieces are kept.

  Args:
    sentence: space-separated string of tokens.

  Returns:
    The kept pieces joined with single spaces.
  """
  words = sentence.split(' ')[1:]
  if '[SEP]' in words:
    words = words[:words.index('[SEP]')]
  return ' '.join(words)

In [None]:
sample_num = 7
source_ids = X_test[sample_num]

# Predict on a batch of one and take the highest-scoring id at each position.
scores = model.predict(source_ids[np.newaxis, ...], verbose=0)
y_pred = scores.argmax(axis=-1)

# Printed in order: the input text, the model's prediction, the reference target.
# Compact ids are translated back to tokenizer ids before decoding.
for id_lookup, ids in (
    (inv_x, source_ids),
    (inv_y, y_pred[0]),
    (inv_y, y_test[sample_num]),
):
  print(remove_padding(detokenizer.decode([id_lookup[token] for token in ids])))

get the group s admins
def _ ( self ) : g sections _ shuffle _ models ) ) if self. next _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ p _ ) ) ) )
def get _ group _ admin ( self, group ) : data = {'name': group, } response = _ fix _ group ( self. post ('getgroupadmin ', data ) ) return response


In [None]:
sample_num = 4
source_ids = X_train[sample_num]

# Same check as for the held-out sample, but on a training example.
scores = model.predict(source_ids[np.newaxis, ...], verbose=0)
y_pred = scores.argmax(axis=-1)

# Printed in order: the input text, the model's prediction, the reference target.
for id_lookup, ids in (
    (inv_x, source_ids),
    (inv_y, y_pred[0]),
    (inv_y, y_train[sample_num]),
):
  print(remove_padding(detokenizer.decode([id_lookup[token] for token in ids])))

start query after a cursor with this collection as parent.
def action ( self task : conn ( self, : =s, * ) : if = self. get. self _ self _ self _ self _ _ self _ self _ self _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def start _ after ( self, document _ fields ) : query = query _ mod. query ( self ) return query. start _ after ( document _ fields )
