# 

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt

### Convert AA string to array of vectors
###   This includes distance to RNA 

In [None]:
def build_input_vector(sequence):

    """
    Convert an amino-acid string into a DataFrame of per-residue vectors.

    Every recognised character of ``sequence`` produces one row whose single
    ``AA_vector`` cell holds a 23-element integer numpy array:

    * indices 0-20 one-hot encode the residue, in the order
      R H K D E S T N Q G P C U A V I L M F Y W;
    * index 21 is always 0;
    * index 22 is the capitalisation flag: 1 for an upper-case letter,
      0 for a lower-case letter.

    ``'-'`` (no amino acid at this position) yields an all-zero vector.
    Any other character is skipped and produces no row.

    Returns a DataFrame with one object column, ``AA_vector``, and one row per
    encoded character (zero rows for an empty sequence).
    """

    residue_order = 'RHKDESTNQGPCUAVILMFYW'
    vector_length = len(residue_order) + 2

    # Exact-match lookup: character -> (one-hot slot, capitalisation flag).
    encoding = {}
    for slot, letter in enumerate(residue_order):
        encoding[letter] = (slot, 1)
        encoding[letter.lower()] = (slot, 0)

    vectors = []
    for symbol in sequence:
        if symbol == '-':
            vectors.append(np.zeros(vector_length, dtype=int))
            continue

        if symbol not in encoding:
            # Unknown characters contribute no row.
            continue

        slot, capital_flag = encoding[symbol]
        vector = np.zeros(vector_length, dtype=int)
        vector[slot] = 1
        vector[-1] = capital_flag
        vectors.append(vector)

    # Fill an object array element by element so that each cell holds a whole
    # vector and pandas never tries to interpret the data as a 2-D block.
    column = np.empty(len(vectors), dtype=object)
    for row, vector in enumerate(vectors):
        column[row] = vector

    return pd.DataFrame({'AA_vector': column})

### Original function without dist_to_RNA 

In [None]:
def return_AA_vector_noDist(AA):

    """
    Return the vector for a single upper-case amino-acid letter.

    The result is a 22-element integer numpy array: indices 0-20 one-hot
    encode the residue in the order
    R H K D E S T N Q G P C U A V I L M F Y W (selenocysteine, 'U', is
    included) and index 21 is always 0. No capitalisation flag is appended.

    Raises ValueError when ``AA`` is not one of the 21 recognised letters.
    """
    residue_order = 'RHKDESTNQGPCUAVILMFYW'
    slot_of = {letter: slot for slot, letter in enumerate(residue_order)}

    if AA not in slot_of:
        # Fail with a clear error instead of falling through to the return
        # statement with no vector defined.
        raise ValueError(f"Unrecognised amino acid code: {AA!r}")

    i_vector = np.zeros(len(residue_order) + 1, dtype=int)
    i_vector[slot_of[AA]] = 1

    return i_vector

### Stage Training data in a variable

In [None]:
# Load the tab-delimited training table into memory.
training_data = pd.read_csv(
    'All_combined_results.txt',
    delimiter='\t',
)

### Take a list of IDs and generate an amino acid sequence per ID, with capitalisation based on distance to RNA (lower case when Distance_to_RNA > 4.2)

In [None]:
# Protein ID of every row, in file order.
IDs_only = training_data['ID'].to_list()

unique_IDs = set(IDs_only)

# Maps each ID to its residue string. Residues whose Distance_to_RNA exceeds
# 4.2 are written in lower case; all others keep their original case.
sequences = {}

# dict.fromkeys de-duplicates while keeping first-appearance order, so the
# resulting dictionary has the same key order on every run.
for ID in dict.fromkeys(IDs_only):
    search_list = [ID]
    # Explicit copy: the case change below must edit an independent frame,
    # not a slice of training_data.
    temp_df = training_data.loc[training_data['ID'].isin(search_list)].copy()
    temp_df.loc[temp_df['Distance_to_RNA'] > 4.2, 'amino_acid'] = temp_df['amino_acid'].str.lower()
    sequences[ID] = temp_df['amino_acid'].astype(str).str.cat(sep='')

In [None]:
# Build a separate dictionary of vectorised sequences so that `sequences`
# keeps its residue strings and this cell can be re-run safely.
vectorised_sequences = {
    ID: build_input_vector(sequence) for ID, sequence in sequences.items()
}

### Get Tripeptide presence 

In [None]:
# Columns carried forward into the tripeptide-tagging step.
req_columns = ["No", "ID", 'pdb_id', 'residue_number', 'amino_acid', 'Distance_to_RNA']

# Copy the column subset so the placeholder column is added to an independent
# frame rather than to a slice of training_data.
tripeptide_ready_data = training_data[req_columns].copy()
tripeptide_ready_data['tripeptide_significance'] = pd.NA

In [None]:
# Read the ordered and disordered top-20 tripeptide tables (in that order),
# then stack them into one frame with a fresh 0..n-1 index.
top20_tripep_ordered, top20_tripep_disordered = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Top20_tripeptides_ordered.csv',
        'Top20_tripeptides_disordered.csv',
    )
)

top20_tripep_combined = pd.concat(
    objs=[top20_tripep_ordered, top20_tripep_disordered],
    ignore_index=True,
)

In [None]:
# Give the column labelled '0' a descriptive name.
top20_tripep_combined = top20_tripep_combined.rename(
    columns={'0': 'tripeptides'},
)

In [None]:
# Single-element list holding the string 'MRA'.
# NOTE(review): not referenced anywhere else in the visible part of the file.
test = ['MRA']

# Plain Python list of the values in the 'tripeptides' column.
significant_tripeptides = top20_tripep_combined['tripeptides'].values.tolist()




### Tripeptide significance Tagging Function

In [None]:
def getTestTripeptide(df, tripeptide_list):
    """
    Tag every residue with the significance of the tripeptides it belongs to.

    Rows are grouped by ``ID``; within a group the ``amino_acid`` values, in
    row order, are treated as one sequence. Every window of three consecutive
    residues is looked up in ``tripeptide_list``. Each residue then receives a
    list of 0/1 flags, one per window that contains it, ordered as: the window
    ending at the residue, the window centred on it, the window starting at
    it. Residues near either end belong to fewer windows and get shorter
    lists, e.g. for A-T-C-Y-K:

        A -> [ATC]   T -> [ATC, TCY]   C -> [ATC, TCY, CYK]
        Y -> [TCY, CYK]   K -> [CYK]

    Groups with fewer than three residues contain no window, so their
    residues receive an empty list.

    Parameters
    ----------
    df : DataFrame with at least the columns ``ID`` and ``amino_acid``.
        It is not modified.
    tripeptide_list : iterable of significant tripeptides. Entries may be
        plain strings ('ATC') or sequences of strings (['ATC'] or
        ['A', 'T', 'C']), which are joined before comparison.

    Returns
    -------
    A new DataFrame holding the rows of ``df`` ordered by sorted ``ID`` (row
    order within an ID is kept), with a fresh 0..n-1 index and the flag lists
    stored in the ``tripeptide_significance`` column.
    """
    column = 'tripeptide_significance'
    window = 3

    # Normalise entries to plain strings and use a set for O(1) membership.
    significant = {
        entry if isinstance(entry, str) else ''.join(entry)
        for entry in tripeptide_list
    }

    tagged_groups = []
    for _, uniprot_df in df.groupby('ID', sort=True):
        # Work on a copy so the caller's frame is never written to.
        uniprot_df = uniprot_df.copy()
        residues = [str(residue) for residue in uniprot_df['amino_acid']]

        # hits[s] is 1 when the window starting at position s is significant.
        window_count = max(len(residues) - window + 1, 0)
        hits = [
            int(''.join(residues[start:start + window]) in significant)
            for start in range(window_count)
        ]

        # A residue at position p sits in the windows starting at
        # p-2, p-1 and p, clipped to the windows that actually exist.
        flags = np.empty(len(residues), dtype=object)
        for position in range(len(residues)):
            first = max(position - window + 1, 0)
            last = min(position, len(hits) - 1)
            flags[position] = hits[first:last + 1]

        uniprot_df[column] = flags
        tagged_groups.append(uniprot_df)

    if not tagged_groups:
        # No rows to tag: return an empty frame that still has the column.
        empty = df.iloc[0:0].copy()
        if column not in empty.columns:
            empty[column] = pd.Series(dtype=object)
        return empty

    return pd.concat(tagged_groups, ignore_index=True)
                        

In [None]:
# Run the tripeptide tagging over the prepared training rows.
post_tripeptide_df = getTestTripeptide(
    df=tripeptide_ready_data,
    tripeptide_list=significant_tripeptides,
)

### Finalised AA vectorisation

In [None]:
# Placeholder column, filled with pd.NA.
post_tripeptide_df = post_tripeptide_df.assign(amino_acid_vector=pd.NA)

In [None]:
def getAA_vector(df):
    """
    Attach a 20-element one-hot vector to every residue row.

    Rows are processed per ``ID``. For each row the ``amino_acid`` letter is
    encoded as an integer numpy array of length 20 with a single 1, using the
    order R H K D E S T N Q G P C A V I L M F Y W. ``'-'`` (empty position)
    becomes an all-zero vector. For any other value a message is printed and
    the cell keeps whatever it held before (``pd.NA`` if the column did not
    exist).

    Parameters
    ----------
    df : DataFrame with at least the columns ``ID`` and ``amino_acid``.
        It is not modified.

    Returns
    -------
    A new DataFrame holding the rows of ``df`` ordered by sorted ``ID`` (row
    order within an ID is kept), with a fresh 0..n-1 index and the vectors
    stored in the ``amino_acid_vector`` column.
    """
    column = 'amino_acid_vector'
    residue_order = 'RHKDESTNQGPCAVILMFYW'
    width = len(residue_order)
    slot_of = {letter: slot for slot, letter in enumerate(residue_order)}

    vectorised_groups = []
    for _, uniprot_df in df.groupby('ID', sort=True):
        # Work on a copy so the caller's frame is never written to.
        uniprot_df = uniprot_df.copy()

        if column in uniprot_df.columns:
            previous = list(uniprot_df[column])
        else:
            previous = [pd.NA] * len(uniprot_df)

        # Object array filled cell by cell so each cell holds a whole vector.
        cells = np.empty(len(uniprot_df), dtype=object)
        for position, (residue, old_value) in enumerate(
            zip(uniprot_df['amino_acid'], previous)
        ):
            if residue == '-':
                cells[position] = np.zeros(width, dtype=int)
            elif residue in slot_of:
                i_vector = np.zeros(width, dtype=int)
                i_vector[slot_of[residue]] = 1
                cells[position] = i_vector
            else:
                # Unrecognised residue: report it and leave the cell as it was.
                print("Mahoraga HELP ME")
                cells[position] = old_value

        uniprot_df[column] = cells
        vectorised_groups.append(uniprot_df)

    if not vectorised_groups:
        # No rows to encode: return an empty frame that still has the column.
        empty = df.iloc[0:0].copy()
        if column not in empty.columns:
            empty[column] = pd.Series(dtype=object)
        return empty

    return pd.concat(vectorised_groups, ignore_index=True)

In [None]:
# Encode the amino acid of every tagged row as a vector.
aa_vectorised_df = getAA_vector(df=post_tripeptide_df)

In [None]:
# Placeholder column, filled with pd.NA.
aa_vectorised_df = aa_vectorised_df.assign(binding_ground_truth=pd.NA)

In [None]:
def getGT_column(df, threshold=4.2):
    """
    Derive the binding ground-truth label of every residue row.

    Rows are processed per ``ID``. A row whose ``Distance_to_RNA`` is greater
    than ``threshold`` is labelled ``[0]``; a row at or below ``threshold`` is
    labelled ``[1]``. When neither comparison holds (for example a NaN
    distance) a message is written to stderr and the cell keeps whatever it
    held before (``pd.NA`` if the column did not exist).

    Parameters
    ----------
    df : DataFrame with at least the columns ``ID`` and ``Distance_to_RNA``.
        It is not modified.
    threshold : distance cut-off; defaults to 4.2.

    Returns
    -------
    A new DataFrame holding the rows of ``df`` ordered by sorted ``ID`` (row
    order within an ID is kept), with a fresh 0..n-1 index and the labels,
    each a one-element list, stored in the ``binding_ground_truth`` column.
    """
    column = 'binding_ground_truth'

    labelled_groups = []
    for uniprot_id, uniprot_df in df.groupby('ID', sort=True):
        # Work on a copy so the caller's frame is never written to.
        uniprot_df = uniprot_df.copy()

        if column in uniprot_df.columns:
            previous = list(uniprot_df[column])
        else:
            previous = [pd.NA] * len(uniprot_df)

        # Object array filled cell by cell so each cell holds its own list.
        cells = np.empty(len(uniprot_df), dtype=object)
        rows = zip(uniprot_df.index, uniprot_df['Distance_to_RNA'], previous)
        for position, (label, distance, old_value) in enumerate(rows):
            if distance > threshold:
                cells[position] = [0]
            elif distance <= threshold:
                cells[position] = [1]
            else:
                # Both comparisons are False for NaN distances.
                sys.stderr.write(
                    f"Missing or non-comparable Distance_to_RNA for ID "
                    f"{uniprot_id!r} at row {label!r}; ground truth left unset\n"
                )
                cells[position] = old_value

        uniprot_df[column] = cells
        labelled_groups.append(uniprot_df)

    if not labelled_groups:
        # No rows to label: return an empty frame that still has the column.
        empty = df.iloc[0:0].copy()
        if column not in empty.columns:
            empty[column] = pd.Series(dtype=object)
        return empty

    return pd.concat(labelled_groups, ignore_index=True)
            

In [None]:
# Add the binding ground-truth labels to the vectorised rows.
finalised_gt_df = getGT_column(df=aa_vectorised_df)

In [None]:
def training_data_preprocess(df, implicated_tripeptide_list):
    """
    Run the full preprocessing pipeline and return the model input table.

    Steps, in order:
      1. keep the required columns of ``df`` and add three placeholder
         columns filled with ``pd.NA``;
      2. tag tripeptide significance with ``getTestTripeptide``;
      3. encode residues with ``getAA_vector``;
      4. derive binding labels with ``getGT_column``;
      5. sort the rows by ``No`` (ascending).

    Parameters
    ----------
    df : DataFrame containing the columns ``No``, ``ID``, ``pdb_id``,
        ``residue_number``, ``amino_acid`` and ``Distance_to_RNA``.
        It is not modified.
    implicated_tripeptide_list : tripeptides passed through to
        ``getTestTripeptide``.

    Returns
    -------
    DataFrame with the columns ``ID``, ``tripeptide_significance``,
    ``amino_acid_vector`` and ``binding_ground_truth``.
    """
    req_columns = ["No", "ID", 'pdb_id', 'residue_number', 'amino_acid', 'Distance_to_RNA']

    # Copy the column subset so the placeholder columns are added to an
    # independent frame rather than to a slice of the caller's DataFrame.
    tripeptide_ready_data = df[req_columns].copy()
    tripeptide_ready_data['tripeptide_significance'] = pd.NA
    tripeptide_ready_data['amino_acid_vector'] = pd.NA
    tripeptide_ready_data['binding_ground_truth'] = pd.NA

    post_tripeptide_df = getTestTripeptide(tripeptide_ready_data, implicated_tripeptide_list)

    aa_vectorised_df = getAA_vector(post_tripeptide_df)

    finalised_gt_df = getGT_column(aa_vectorised_df)

    # The per-ID steps return rows grouped by sorted ID; order them by 'No'.
    resorted_finalsed_df = finalised_gt_df.sort_values(by='No', ascending = True)

    final_columns = ['ID', 'tripeptide_significance', 'amino_acid_vector', 'binding_ground_truth']

    model_input_final = resorted_finalsed_df[final_columns]

    return model_input_final

In [None]:
# Run the whole preprocessing pipeline on the training table.
completed_input_df = training_data_preprocess(
    df=training_data,
    implicated_tripeptide_list=significant_tripeptides,
)

In [None]:
# Display the final model-input table.
print(completed_input_df)