# Task 3: Embeddings for primary and secondary structures

## Install required libraries and auxiliary functions

In [None]:
# We install the required libraries.
# Older versions of TensorFlow and NumPy are pinned for compatibility.
# NOTE(review): bert-tensorflow is installed without a version pin, and TensorFlow
# is installed through `python3.7 -m pip` while the other lines use plain `pip`;
# these may target different interpreters -- confirm.
!pip install bert-tensorflow
!python3.7 -m pip install tensorflow-gpu==1.15.0
!pip install -U numpy==1.18.5

In [None]:
import os
import numpy as np

In [None]:
# Mount the Google account's Drive at /content/drive so the notebook can
# reach the project files when it runs in Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
'''
Move to the project directory on the mounted Drive. The relative paths
used by the cells below are resolved from this directory.
NOTE(review): the original note also mentioned cloning the BERT GitHub
repo to import its modeling, tokenization and optimization files, but
no clone command is run in this cell -- presumably the files are already
in the directory; confirm.
'''

os.chdir('/content/drive/MyDrive/Permed_Task_3/')

In [None]:
'''
Auxiliary functions to truncate sequences that are longer than 
a maximum length and to add the required padding 
if necessary
'''
def clip_long_seq(seq, max_len):
  """Return `seq` cut down to its first `max_len` items.

  A sequence whose length does not exceed `max_len` is returned unchanged.
  """
  return seq[:max_len] if len(seq) > max_len else seq
def pad_sequence(seq,max_len):
  """Right-pad the string `seq` with "0" characters up to `max_len`.

  Args:
    seq: the string to pad.
    max_len: the minimum length of the returned string.

  Returns:
    `seq` followed by as many "0" characters as needed to reach
    `max_len`; a string that is already long enough is returned unchanged.
  """
  # str.ljust pads in one step instead of re-building the string
  # one character at a time.
  return seq.ljust(max_len, "0")

## Primary structure

### Read the database and create windows of 30 amino acids

In [None]:
# Switch to the Primary_structure directory (relative to the current working directory)
%cd Primary_structure

In [None]:
'''
Load Database.txt, cut it into entries at every '-' character and
drop the newline characters found inside each entry
'''

with open('Database.txt') as fp:
    contents = fp.read()

# One entry per '-'-separated chunk, with its line breaks removed
entries = [chunk.replace('\n', '') for chunk in contents.split('-')]

# Discard the chunk that comes before the first '-'; being the last
# expression of the cell, the discarded chunk is also displayed
entries.pop(0)

In [None]:
'''
Split every entry into consecutive, non-overlapping windows of
n = 30 amino acids. The last window of an entry is kept as it is,
so it can be shorter than n (no padding is applied).
'''
n = 30
# strings_30 holds one list of windows per entry
strings_30 = [
  [entry[index : index + n] for index in range(0, len(entry), n)]
  for entry in entries
]

In [None]:
# Write one window per line. The `with` block closes (and therefore flushes)
# the file, so the cells that read database_30.txt afterwards see every line.
with open('database_30.txt','w') as file_write:
  for string in strings_30:
    for s in string:
      file_write.write(s + "\n")

### Add spaces between tokens for the BERT model

In [None]:
'''
For the regular BERT model, we need a space between each amino acid.
This spacing is crucial for creating a training dataset and
for running BERT over the database
'''
database_file = 'database_30.txt'
# Read every window (each line keeps its trailing newline); the `with`
# block closes the file once the lines are in memory.
with open(database_file, 'r') as file1:
  Lines = file1.readlines()
len(Lines)

In [None]:
# Put a single space between consecutive characters of every line.
# The trailing newline of a line is treated like any other character,
# so it stays at the end of the line, preceded by a space.
final_lines = [" ".join(line) for line in Lines]

In [None]:
# The lines already end with a newline, so they are written as they are.
# The `with` block closes (and therefore flushes) the file, so anything that
# reads database_30_spaces.txt afterwards sees the complete content.
with open('database_30_spaces.txt','w') as file_write:
  for line in final_lines:
    file_write.write(line)

### Obtaining feature vectors with TAPE library

In [None]:
# Install the tape_proteins package; the next cell imports `tape`.
# NOTE(review): the package is installed without a version pin.
!pip install tape_proteins
#!pip install awscli --ignore-installed six

In [None]:
'''
Import TAPE and create a model from the pretrained weights.
The TAPE tokenizer does not require spaces between
each amino acid
'''
import torch
from tape import ProteinBertModel, TAPETokenizer
from tape import UniRepModel  # not used in the cells shown here
import numpy as np
# Pretrained 'bert-base' model and a tokenizer built with the 'iupac' vocabulary
model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')

In [None]:
# Read the windows without spaces (each line keeps its trailing newline).
# The `with` block closes the file once the lines are in memory.
database_file ='database_30.txt'
with open(database_file,'r') as file1:
  Lines = file1.readlines()
len(Lines)

In [None]:
# Write one averaged embedding per window to output_primary.txt.
# Every window produces two entries in the file: its index, then the
# string form of the embedding tensor.
# The `with` block closes (and flushes) the file when the loop ends or fails,
# so the output is never left incomplete in an open handle.
# NOTE(review): the forward pass runs with autograd enabled; wrapping it in
# torch.no_grad() would save memory but may change the printed tensor text --
# confirm what the readers of output_primary.txt expect before changing it.

# a counter variable is set to write the number of the subsequence
ctr = 0
with open("output_primary.txt","w") as file_write:
  for s in Lines:
    # remove the \n character and clip the sequence if necessary
    # (the windows are at most 30 long, so the 1024 limit is not expected to trigger)
    s = s.replace('\n','')
    s = clip_long_seq(s, 1024)
    # define the ids for the BERT model (a batch holding this single window)
    token_ids = torch.tensor([tokenizer.encode(s)])
    # create the output of the model
    output = model(token_ids)
    # drop the batch dimension of output[0]
    sq = torch.squeeze(output[0],dim=0)
    # output[0] is averaged over its first remaining dimension to obtain the final embedding
    avg_output = torch.mean(sq,0)
    # limit every component of the embedding to the range [-1, 1]
    avg_output_corr = torch.clip(avg_output, min=-1,max=1)
    file_write.write(str(ctr)+"\n")
    file_write.write(str(avg_output_corr) + "\n")
    ctr+=1
    # the pooled output (output[1]) is not pretrained, so output[0] is averaged instead

### Generate pretraining data, training and running (if no TAPE)

In [None]:
# Build the pretraining examples from the space-separated windows.
# Every line but the last ends with a backslash so that all the flags
# belong to the same shell command.
!python create_pretraining_data2.py --input_file=./database_30_spaces.txt \
                                    --output_file=./tf_examples.tfrecord \
                                    --vocab_file=./protein_vocab.txt \
                                    --do_lower_case=True \
                                    --max_seq_length=30 \
                                    --max_predictions_per_seq=5 \
                                    --masked_lm_prob=0.15 \
                                    --random_seed=12345 \
                                    --dupe_factor=5

In [None]:
# Pretrain on the generated examples and write the checkpoints to ./outputs.
# Every line but the last ends with a backslash so that all the flags
# belong to the same shell command.
!python run_pretraining.py --input_file=./tf_examples.tfrecord \
                            --output_dir=./outputs \
                            --do_train=True \
                            --do_eval=True \
                            --bert_config_file=./bert_config.json \
                            --train_batch_size=32 \
                            --max_seq_length=30 \
                            --max_predictions_per_seq=5 \
                            --num_train_steps=14000000 \
                            --num_warmup_steps=10000 \
                            --learning_rate=1e-4 \
                            --save_checkpoints_steps=10000

In [None]:
# Extract the features of the last layer (--layers=-1) with the pretrained checkpoint.
# The flags are written with two ASCII hyphens (some were typed with an en dash)
# and every line but the last ends with a backslash so that all the flags
# belong to the same shell command.
# NOTE(review): the paths below are absolute paths under /home/pnunez and the input
# file (protein_subsequences.txt) is not one of the files written by this notebook --
# confirm they exist in the environment where this cell runs.
!python extract_features4.py --input_file=/home/pnunez/data/bert/sara/protein_subsequences.txt \
                              --output_file=/home/pnunez/data/bert/sara/output_primary.txt \
                              --vocab_file=/home/pnunez/data/bert/protein-vocab.txt \
                              --bert_config_file=/home/pnunez/data/bert/bert_config.json \
                              --init_checkpoint=/home/pnunez/data/bert/model2/model.ckpt-14000000 \
                              --max_seq_length=30 \
                              --layers=-1 \
                              --batch_size=32

## Secondary structure

### Read predictions file and remove unnecessary characters


In [None]:
# Switch to the sibling Secondary_structure directory
%cd ../Secondary_structure

In [None]:
'''
Read the prediction file from the U-NET and remove the newline
character from every line
'''

prediction_file = 'prediction.txt'
# The `with` block closes the file once the lines are in memory
with open(prediction_file, 'r') as file1:
  Lines = file1.readlines()
# Same lines without their newline characters
final_lines = [line.replace('\n','') for line in Lines]

In [None]:
# Show the number of raw lines and the number of cleaned lines, one per row
print(len(Lines), len(final_lines), sep="\n")

### Divide the predictions into windows of 30 and write a new file

In [None]:
'''
This cell splits the previous lines into consecutive, non-overlapping
windows of n = 30 characters. The last window of a line is kept as it is,
so it can be shorter than n (no padding is applied).
'''

n = 30
chars = []  # not used in this cell; kept in case other cells rely on the name
# strings_30 holds one list of windows per line
strings_30 = [
  [line[index : index + n] for index in range(0, len(line), n)]
  for line in final_lines
]

In [None]:
# Write one window per line. The `with` block closes (and therefore flushes)
# the file, so the cell that reads predictions_30.txt afterwards sees every line.
with open('predictions_30.txt','w') as file_write:
  for string in strings_30:
    for s in string:
      file_write.write(s + "\n")

### Add spaces between tokens for the BERT model

In [None]:
# Read every window (each line keeps its trailing newline); the `with`
# block closes the file once the lines are in memory.
prediction_file = 'predictions_30.txt'
with open(prediction_file, 'r') as file1:
  Lines = file1.readlines()
len(Lines)

In [None]:
# Put a single space between consecutive characters of every line.
# The trailing newline of a line is treated like any other character,
# so it stays at the end of the line, preceded by a space.
final_lines = [" ".join(line) for line in Lines]

In [None]:
# The lines already end with a newline, so they are written as they are.
# The `with` block closes (and therefore flushes) the file, so anything that
# reads predictions_30_spaces.txt afterwards sees the complete content.
with open('predictions_30_spaces.txt','w') as file_write:
  for line in final_lines:
    file_write.write(line)

### Generate pretraining data and training

In [None]:
# Build the pretraining examples from the space-separated prediction windows.
# Every line but the last ends with a backslash so that all the flags
# belong to the same shell command.
!python create_pretraining_data2.py --input_file=./predictions_30_spaces.txt \
                                    --output_file=./tf_examples.tfrecord \
                                    --vocab_file=./secondary-vocab.txt \
                                    --do_lower_case=True \
                                    --max_seq_length=30 \
                                    --max_predictions_per_seq=5 \
                                    --masked_lm_prob=0.15 \
                                    --random_seed=12345 \
                                    --dupe_factor=5

In [None]:
# Pretrain on the generated examples and write the checkpoints to ./output.
# Every line but the last ends with a backslash so that all the flags
# belong to the same shell command.
!python run_pretraining.py --input_file=./tf_examples.tfrecord \
                            --output_dir=./output \
                            --do_train=True \
                            --do_eval=True \
                            --bert_config_file=./bert_config.json \
                            --train_batch_size=32 \
                            --max_seq_length=30 \
                            --max_predictions_per_seq=5 \
                            --num_train_steps=500000 \
                            --num_warmup_steps=10000 \
                            --learning_rate=1e-4 \
                            --save_checkpoints_steps=10000

In [None]:
# Extract the features of the last layer (--layers=-1) for the prediction windows.
# Every line but the last ends with a backslash so that all the flags
# belong to the same shell command.
# NOTE(review): --init_checkpoint points at model.ckpt-500000 in the current directory,
# while the pretraining command in this notebook uses --output_dir=./output -- confirm
# where the checkpoint actually is.
!python extract_features4.py --input_file=predictions_30_spaces.txt \
                              --output_file=./output/output_secondary.txt \
                              --vocab_file=./secondary-vocab.txt \
                              --bert_config_file=./bert_config.json \
                              --init_checkpoint=model.ckpt-500000 \
                              --max_seq_length=30 \
                              --layers=-1 \
                              --batch_size=32