# **Protein Family Prediction**

This notebook is intended to be run on Amazon SageMaker. Sections 1) to 3) of this notebook are very similar to the Protein_Family_Prediction.ipynb file, so only a few comments have been included in those sections. More extensive comments are provided in section 4), which covers model deployment.

# Importing the dataset and libraries

In [None]:
import pandas as pd

import os
import io
import boto3


import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns


import tensorflow as tf
import keras
from keras import backend as K
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer


import sagemaker
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac


import warnings
warnings.filterwarnings("ignore")



## 1) Dataset loading and initial preprocessing

The two datasets loaded below were previously uploaded to an AWS S3 bucket ("awsml-159159159").

In [None]:
# 1) Dataset loading and initial preprocessing
# Name of the S3 bucket and the object keys of the two CSV files stored in it.
bucket = "awsml-159159159"
path1, path2 = "pdb_data_no_dups.csv", "pdb_data_seq.csv"

#Loading the dataset
def load_data(path1, path2):
  """Read two CSV files into DataFrames.

  Args:
    path1: Location of the first CSV file, handed unchanged to ``pd.read_csv``.
    path2: Location of the second CSV file, handed unchanged to ``pd.read_csv``.

  Returns:
    A ``(first, second)`` tuple of DataFrames, in the same order as the
    arguments.
  """
  first, second = (pd.read_csv(location) for location in (path1, path2))
  return first, second

# Read both CSV files from the bucket through their s3:// URLs.
data1, data2 = load_data(f"s3://{bucket}/{path1}", f"s3://{bucket}/{path2}")

#Merging the two datasets and dropping duplicates (according to the feature "structureId")
def merge_and_drop_duplicates_id(data1, data2):
  """Join two tables on ``structureId`` and keep one row per id.

  Args:
    data1: Left DataFrame; must contain a ``structureId`` column.
    data2: Right DataFrame; must contain a ``structureId`` column.

  Returns:
    The merged DataFrame in which only the first row of every
    ``structureId`` is retained (row labels of the merge are preserved).
  """
  joined = data1.merge(data2, on=["structureId"])
  return joined.drop_duplicates(subset=['structureId'])

# Merged table with a single row per structureId.
data_merged = merge_and_drop_duplicates_id(data1, data2)


#Selection of the most common 3 classes of the output variable
def classes_selection(data):
  """Keep the three target protein classes and encode them as integers.

  Rows whose ``classification`` is not HYDROLASE, TRANSFERASE or
  OXIDOREDUCTASE are discarded. A count plot of the remaining class names is
  drawn as a side effect, then the names are replaced by 0, 1 and 2
  respectively.

  Args:
    data: DataFrame with a ``classification`` column.

  Returns:
    A new DataFrame holding only the selected rows, with ``classification``
    encoded as integers. The input frame is left untouched.
  """
  class_codes = {"HYDROLASE": 0, "TRANSFERASE": 1, "OXIDOREDUCTASE": 2}

  # Explicit copy: writing into the result of a boolean filter is a
  # chained-assignment hazard (SettingWithCopyWarning) otherwise.
  selected = data[data["classification"].isin(list(class_codes))].copy()

  # Plot before encoding so the axis shows the class names.
  sns.countplot(x=selected["classification"])

  # Encode the label column only; a frame-wide replace would also rewrite any
  # other column that happened to contain one of these strings.
  selected["classification"] = selected["classification"].map(class_codes)
  return selected

# Restrict the merged table to the three selected classes (labels become 0/1/2).
data_final = classes_selection(data_merged)


## 2) Main preprocessing

In [None]:
#2) Main preprocessing

#Dropping duplicates according to sequence
def dropping_duplicates_sequences(data):
  """Return unique, non-missing sequences together with their class label.

  Only the ``classification`` and ``sequence`` columns are kept. For repeated
  sequences the first occurrence wins, and rows without a sequence are removed.

  Args:
    data: DataFrame with ``classification`` and ``sequence`` columns.

  Returns:
    A new two-column DataFrame. The input frame is not modified (the previous
    in-place calls silently altered the caller's data).
  """
  trimmed = data[["classification", "sequence"]]
  trimmed = trimmed.drop_duplicates(subset='sequence')
  return trimmed.dropna(subset=["sequence"])

# Keep only the label and sequence columns, without repeated or missing sequences.
data_final=dropping_duplicates_sequences(data_final)



#Plotting lengths sequences of proteins
def plot_sequence_count(data_final):
    """Draw a histogram of the sequence lengths.

    The x axis of the plot is limited to the range 0-1000.

    Args:
        data_final: DataFrame exposing a ``sequence`` column of strings.

    Returns:
        A ``(sequences, val_lengths)`` tuple: the raw values of the
        ``sequence`` column and a list with the length of each of them.
    """
    sequences = data_final.sequence.values
    val_lengths = [len(seq) for seq in sequences]

    sns.histplot(val_lengths, color='g')
    plt.xlabel('Sequence Length')
    plt.ylabel('Frequency')
    plt.title('Sequence Character Count')
    plt.xlim(0, 1000)

    return sequences, val_lengths

# Plot the length histogram and keep the sequences and their lengths.
sequences, lengths=plot_sequence_count(data_final)



#Conversion of amino acids sequences into an integer matrix (x)
def conv_to_matrix_and_delete_wrong_measur(max_length, lengths, sequences):
  """Tokenize sequences per character and drop those with non-standard letters.

  Every sequence is turned into a row of integer tokens of width
  ``max_length`` via ``pad_sequences``. Rows that contain the token of one of
  the letters u, x, z, b or o are removed.

  Args:
    max_length: Width of the integer matrix (``maxlen`` of ``pad_sequences``).
    lengths: Not used by this function; kept so existing callers keep working.
    sequences: Collection of amino-acid sequence strings.

  Returns:
    A ``(x, values_kept, length_dict, max_length)`` tuple where ``x`` is the
    filtered integer matrix, ``values_kept`` lists the positions (in
    ``sequences``) of the rows that were kept, ``length_dict`` is the
    vocabulary size without the non-standard letters, and ``max_length`` is
    returned unchanged.
  """
  tokenizer = Tokenizer(char_level=True)
  tokenizer.fit_on_texts(sequences)
  x = pad_sequences(tokenizer.texts_to_sequences(sequences), maxlen=max_length)

  # Tokens of the non-standard letters that actually occur in the corpus.
  # Guarding the lookup avoids the KeyError that indexing word_index directly
  # raised whenever one of these letters was absent from the data.
  wrong_letters = ("u", "x", "z", "b", "o")
  wrong_tokens = [tokenizer.word_index[letter]
                  for letter in wrong_letters
                  if letter in tokenizer.word_index]

  # Single vectorised pass; calling np.delete inside a loop copied the whole
  # matrix once per removed row.
  keep = ~np.isin(x, wrong_tokens).any(axis=1)
  values_kept = list(np.flatnonzero(keep))
  x = x[keep]

  # Equals len(word_index) - 5 whenever all five letters are present.
  length_dict = len(tokenizer.word_index) - len(wrong_tokens)

  return x, values_kept, length_dict, max_length

# Build the integer matrix with rows of width 350 and drop the sequences that
# contain non-standard letters.
x, values_kept, length_dict, max_length = conv_to_matrix_and_delete_wrong_measur(
    350, lengths, sequences)



#One hot encoding of the output variable and dataset splitting
def ohe_and_dataset_splitting(x, data, values_kept):
  """Binarize the labels and split the data into train/validation/test sets.

  Args:
    x: Feature matrix whose rows correspond to ``values_kept``.
    data: DataFrame with a ``classification`` column.
    values_kept: Positional indices of the rows of ``data`` that match ``x``.

  Returns:
    ``(x_train, x_val, x_test, y_train, y_val, y_test)``. The test set is 20%
    of the data; the validation set is 25% of the remainder. Both splits use
    ``random_state=15``.
  """
  kept_labels = data["classification"].iloc[values_kept]
  y_encoded = LabelBinarizer().fit_transform(kept_labels)

  # First split: hold out the test set.
  x_rest, x_test, y_rest, y_test = train_test_split(
      x, y_encoded, test_size=0.2, random_state=15)
  # Second split: carve the validation set out of what is left.
  x_train, x_val, y_train, y_val = train_test_split(
      x_rest, y_rest, test_size=0.25, random_state=15)

  return x_train, x_val, x_test, y_train, y_val, y_test

# Final train/validation/test arrays used in the rest of the notebook.
x_train, x_val, x_test, y_train, y_val, y_test = ohe_and_dataset_splitting(
    x, data_final, values_kept)

## 3) Metrics

In [None]:
#F1-score metrics (NOTE: computed on output column 0 only, i.e. not macro-averaged over the classes)
def recall(y_true, y_pred):
    """Recall of the first output column.

    Only column 0 of both tensors is used. Products and targets are clipped to
    [0, 1] and rounded before being summed.

    Args:
        y_true: 2-D tensor of target values.
        y_pred: 2-D tensor of predicted values.

    Returns:
        True positives divided by actual positives (plus ``K.epsilon()``).
    """
    target = y_true[:, 0]
    guess = y_pred[:, 0]
    hits = K.sum(K.round(K.clip(target * guess, 0, 1)))
    actual = K.sum(K.round(K.clip(target, 0, 1)))
    return hits / (actual + K.epsilon())

def precision(y_true, y_pred):
    """Precision of the first output column.

    Only column 0 of both tensors is used. Products and predictions are
    clipped to [0, 1] and rounded before being summed.

    Args:
        y_true: 2-D tensor of target values.
        y_pred: 2-D tensor of predicted values.

    Returns:
        True positives divided by predicted positives (plus ``K.epsilon()``).
    """
    target = y_true[:, 0]
    guess = y_pred[:, 0]
    hits = K.sum(K.round(K.clip(target * guess, 0, 1)))
    flagged = K.sum(K.round(K.clip(guess, 0, 1)))
    return hits / (flagged + K.epsilon())

def f1score(y_true, y_pred):
    """F1 score built from ``precision`` and ``recall``.

    Args:
        y_true: 2-D tensor of target values.
        y_pred: 2-D tensor of predicted values.

    Returns:
        ``2 * p * r / (p + r + K.epsilon())`` where ``p`` and ``r`` are the
        values returned by ``precision`` and ``recall``.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

## 4) Model Deployment

The pretrained model is downloaded from the AWS S3 bucket, where it was previously uploaded, and then loaded using the code below.

In [None]:
# Source object in S3 and the local file it is copied to.
bucket_name = 'awsml-159159159'
file_key = 'LSTM_model_protein.h5'
local_file_path = 'LSTM_model_protein.h5'

# Copy the model file from the bucket to the local file system.
s3 = boto3.client('s3')
s3.download_file(bucket_name, file_key, local_file_path)

# custom_objects makes the f1score function defined above available to Keras
# while the saved model is being loaded.
LSTM_model_protein = tf.keras.models.load_model(
    local_file_path,
    custom_objects={'f1score': f1score},
)

The script below creates a SageMaker session and gets the IAM execution role. It then specifies the S3 location of the LSTM model, which is stored in h5 format, and defines an S3 location for storing the output artifacts, which will be used during the deployment process.

Finally, it creates a SageMaker Model object and deploys the model using the specified parameters.

In [None]:
# Session and IAM role used for every SageMaker call below.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# S3 locations of the saved model and of the output artifacts.
# (output_location is defined here but not passed to any call in this cell.)
model_path = f's3://{bucket}/LSTM_model_protein.h5'
output_location = f's3://{bucket}/model-artifact/'

# NOTE(review): image_uri names a "tensorflow-training" image and model_data
# points at a raw .h5 file. SageMaker hosting normally expects an inference
# image and a model.tar.gz archive - confirm that this combination can serve
# requests before relying on the endpoint.
model = sagemaker.Model(model_data=model_path,
                        role=role,
                        image_uri="763104351884.dkr.ecr.us-east-2.amazonaws.com/tensorflow-training:2.12.0-cpu-py310-ubuntu20.04-sagemaker",
                        # Without predictor_cls, deploy() returns None and
                        # there is no object to send requests through.
                        predictor_cls=sagemaker.Predictor,
                        sagemaker_session=sagemaker_session)

# Keep the returned predictor: the prediction cell sends its requests
# through `protein_predictor`.
protein_predictor = model.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

The following code configures the protein_predictor endpoint to accept input data in CSV format using CSVSerializer. It also sets the JSONDeserializer to handle the JSON-formatted predictions returned by the endpoint. Predictions on the test set are stored in the variable results. Finally, the variable predictions is filled by extracting the 'predicted_label' values from the results variable, which contains the predictions made by the protein_predictor endpoint on the test set.

In [None]:
# Requests are sent as CSV and the endpoint's reply is parsed as JSON.
# `protein_predictor` must be the predictor object bound to the deployed endpoint.
protein_predictor.serializer = sagemaker.serializers.CSVSerializer()
protein_predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

results = protein_predictor.predict(x_test)

# NOTE(review): assumes the reply looks like
# {"predictions": [{"predicted_label": ...}, ...]} - confirm against the
# container that actually serves the model.
predictions = np.array([r['predicted_label'] for r in results['predictions']])