## Take only the fifteen residues around the cleavage site (13 upstream, 2 downstream) of every positive sequence in the five training sets and use them to build the Position-Specific Weight Matrix.

In [1]:
import pandas as pd
import numpy as np
import math

## Define the function that builds and returns the Position-Specific Weight Matrix (W)

In [6]:
def get_pswm(training,upstream_position,downstream_position):
    """
    Build a Position Specific Weight Matrix (PSWM) from protein sequences.

    For every row of ``training`` the window
    ``Sequence[SPEnd - upstream_position : SPEnd + downstream_position]``
    is examined and the residues found at each window position are counted.
    Counts start at 1 (pseudocount) and are turned into natural-log odds
    against the SwissProt background frequencies.

    Parameters
    ----------
    training : pandas.DataFrame
        Table with a "Sequence" column (amino-acid strings) and an "SPEnd"
        column (cleavage position, convertible to ``int``). The frame is
        only read; it is never modified.
    upstream_position : int
        Number of residues taken before the cleavage position.
    downstream_position : int
        Number of residues taken from the cleavage position onwards.

    Returns
    -------
    dict[str, list[float]]
        One entry per standard amino acid; each list holds
        ``upstream_position + downstream_position`` log-odds scores.
    """
    # Background frequencies from SwissProt
    swiss_frequencies = { "A" : 0.0825 , "R" : 0.0552 , "N" : 0.0406 , "D" : 0.0546 , "C" : 0.0138 , "Q" : 0.0393, "E" : 0.0671, "G" : 0.0707, "H" : 0.0227, "I" : 0.0590 ,"L" : 0.0964, "K" : 0.0579, "M" : 0.0241, "F" : 0.0386, "P" : 0.0474, "S" : 0.0665, "T" : 0.0536, "W" : 0.0110, "Y" : 0.0292 , "V": 0.0685}

    # The window length follows from the parameters, so it does not depend on
    # the content (or even the existence) of any particular row.
    window_length = upstream_position + downstream_position

    # Initialize counts with 1 (pseudocounts); every residue gets its own list.
    counts = {aa: [1] * window_length for aa in swiss_frequencies}

    # Update the counts for each window position and amino acid. Columns are
    # read directly so the caller's DataFrame is left untouched.
    for sequence, sp_end in zip(training["Sequence"], training["SPEnd"]):
        window_start = int(sp_end) - upstream_position
        for offset in range(window_length):
            residue_index = window_start + offset
            # Positions outside the sequence are skipped instead of letting a
            # negative index wrap around and misalign the window.
            if not 0 <= residue_index < len(sequence):
                continue
            column = counts.get(sequence[residue_index])
            # 'X' and any other non-standard symbol carry no information.
            if column is not None:
                column[offset] += 1

    # Number of sequences plus one pseudocount per amino acid.
    total = len(training) + len(swiss_frequencies)

    # Compute log-odds scores for the PSWM
    return {
        aa: [math.log((count / total) / swiss_frequencies[aa]) for count in column]
        for aa, column in counts.items()
    }

In [7]:
if __name__=='__main__':
    # Load the combined training/benchmark table (tab-separated).
    benchmark_table=pd.read_csv("../Data_Preparation/train_bench.tsv", sep="\t")
    # Keep only the positive examples that belong to the sets labelled '1'..'5'.
    positive_training_rows=benchmark_table.query("Class=='Positive' and (Set=='1' or Set=='2' or Set=='3' or Set=='4' or Set=='5')")
    # Build the matrix from 13 upstream and 2 downstream residues and show it.
    weight_matrix=get_pswm(positive_training_rows,13,2)
    print(weight_matrix)

{'A': [0.24254599273799143, 0.37208004492989377, 0.2531281020685284, 0.21011071698483755, 0.7554970158162956, 0.7426763273872343, 0.4177500817630821, 0.6192321362961126, 0.6480111008461558, 0.13006800931130125, 1.228680297979411, 0.13006800931130125, 1.7995701624880516, 0.6550782680692484, -0.53954867383845], 'R': [-1.9530033004191132, -3.898913449474427, -2.800301160806317, -2.2894755370403264, -2.512619088354536, -2.800301160806317, -1.259856119859168, -2.2894755370403264, -0.8078709961161109, -0.4649262449892804, -2.1071539802463715, 0.17862399443129262, -1.1908632483722166, -0.2879955368302023, -0.11472381555616558], 'N': [-3.591718562799064, -2.205424201679173, -2.8985713822391186, -1.9822806503649635, -1.512277021119228, -1.799959093571009, -1.2891334698050183, -1.799959093571009, -0.6472795836326234, -1.512277021119228, -1.6458084137437505, -0.5006761094407481, -1.799959093571009, -0.4562243468699143, -0.1577313583139176], 'D': [-2.0962249097141816, -3.194837198382291, -3.887984