In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from Bio import SeqIO
import os
from collections import Counter



In [2]:
SQ =[]
ID = []
for record in SeqIO.parse("uniprot-Humanproteome%3AUP000005640+reviewed%3Ayes.fasta", "fasta"):
    SQ.append(record.seq)
    ID.append('> '+record.id) 

In [3]:
from collections import Counter
AA = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

total = sum([len(SQ[i]) for i in range(len(SQ))])
print('AA total:',total)

AA_freq = []
for key in AA:
    a = 0
    for i in range(len(SQ)):
        counter = Counter(list(SQ[i]))
        a+=counter[key]
    
    AA_freq.append(a/total)
    print(key,' freq:',a/total)



AA total: 11367425
A  freq: 0.07009846117304491
C  freq: 0.02300617774034137
D  freq: 0.047393670950105236
E  freq: 0.07104757673791558
F  freq: 0.03649771166293158
G  freq: 0.06571470671678062
H  freq: 0.026222033574006428
I  freq: 0.0433837038731287
K  freq: 0.05733523643217351
L  freq: 0.0996521199831976
M  freq: 0.02131810854261189
N  freq: 0.035888514769175955
P  freq: 0.06314860225600785
Q  freq: 0.04767913577613224
R  freq: 0.05635031680437742
S  freq: 0.08331535066208925
T  freq: 0.0535221477159515
V  freq: 0.059646050006927694
W  freq: 0.01214408716134041
Y  freq: 0.026633120517619426


In [4]:
AA_freq_df = pd.DataFrame({'Frequency':AA_freq},index=AA)
AA_freq_df

Unnamed: 0,Frequency
A,0.070098
C,0.023006
D,0.047394
E,0.071048
F,0.036498
G,0.065715
H,0.026222
I,0.043384
K,0.057335
L,0.099652


In [5]:
AA_freq_df.to_csv('AA_frequency_human.csv')

## Methionine (M) Correction 

In [18]:
SQ =[]
ID = []
for record in SeqIO.parse("uniprot-Humanproteome%3AUP000005640+reviewed%3Ayes.fasta", "fasta"):
    SQ.append(record.seq[1:])
    ID.append('> '+record.id) 
    
    
AA = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

total = sum([len(SQ[i]) for i in range(len(SQ))])
print('AA total:',total)

AA_freq = []
for key in AA:
    a = 0
    for i in range(len(SQ)):
        counter = Counter(list(SQ[i]))
        a+=counter[key]
    
    AA_freq.append(a/total)
    print(key,' freq:',a/total)
    
    
AA_freq_df = pd.DataFrame({'Frequency':AA_freq},index=AA)
AA_freq_df.to_csv('AA_frequency_human_Methionine.csv')
AA_freq_df

AA total: 11347064
A  freq: 0.0702233635061898
C  freq: 0.02304745967767521
D  freq: 0.04747836092226148
E  freq: 0.07117497530638763
F  freq: 0.03656320260465615
G  freq: 0.06583183103576397
H  freq: 0.026269085994403488
I  freq: 0.04346146280659032
K  freq: 0.05743811791314476
L  freq: 0.09983093423990558
M  freq: 0.019567793043204832
N  freq: 0.03595167877787593
P  freq: 0.06326182702415356
Q  freq: 0.04776460236762567
R  freq: 0.056451342831943134
S  freq: 0.08346414543885537
T  freq: 0.053618010791161484
V  freq: 0.0597530779768229
W  freq: 0.012164732656835283
Y  freq: 0.02668082245768597


Unnamed: 0,Frequency
A,0.070223
C,0.023047
D,0.047478
E,0.071175
F,0.036563
G,0.065832
H,0.026269
I,0.043461
K,0.057438
L,0.099831


In [5]:
# Folder Path
path = "/Users/danieltadros/Desktop/PhD/Chapter_2/data/Peptides_all/class1_9"
  
# Change the directory
os.chdir(path)
  
    
# iterate through all file

file_path = []
files = []
for file in os.listdir():
    # Check whether file is in text format or not
    if file.endswith(".txt"):
        files.append(file)
        file_path.append(f"{path}/{file}")

        
files = sorted(files)
file_path = sorted(file_path)

Alleles = [files[i][0:-4] for i in range(len(files))]
Alleles_H = Alleles[:-8]
Alleles_M = Alleles[-8:]
Alleles_M,len(Alleles_M)

(['H2-Db', 'H2-Dd', 'H2-Dq', 'H2-Kb', 'H2-Kd', 'H2-Kk', 'H2-Kq', 'H2-Ld'], 8)

In [6]:
AAs=['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

PWM_o = []
PWM_c = []
for q in Alleles_H:

    PWM = pd.read_csv('/Users/danieltadros/Desktop/PhD/Chapter_2/data/Peptides_all/class1_9/PWMs/2_PWMs_blosum/PWM_'+q+'.csv')
    PWM_o.append(PWM)
    PWM_p = PWM.copy()
    
    for i in range(20):
        PWM_p.iloc[i,1:] = PWM_p.iloc[i,1:]/AA_freq[i]
        
    for i in range(9):
        PWM_p[str(i+1)] = PWM_p[str(i+1)].to_numpy()/sum(PWM_p[str(i+1)].to_numpy())   
        
    PWM_c.append(PWM_p)
    
    
    
    PWM_p.to_csv('/Users/danieltadros/Desktop/PhD/Chapter_2/data/Peptides_all/class1_9/PWMs/4_PWMs_corr/PWM_'+q+'.csv',index=False)
    
    
    
    
    

# Mouse AA frequency

In [12]:
SQ =[]
ID = []
for record in SeqIO.parse("/Users/danieltadros/Desktop/PhD/Chapter_2/data/proteome/uniprot-Mouseproteome%3AUP000000589+reviewed%3Ayes.fasta", "fasta"):
    SQ.append(record.seq)
    ID.append('> '+record.id) 

In [15]:
SQ[1]

Seq('MAQAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDADAAPLGAAPTPGIFSFQPE...GHK')

In [8]:
from collections import Counter
AA = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

total = sum([len(SQ[i]) for i in range(len(SQ))])
print('AA total:',total)

AA_freq = []
for key in AA:
    a = 0
    for i in range(len(SQ)):
        counter = Counter(list(SQ[i]))
        a+=counter[key]
    
    AA_freq.append(a/total)
    print(key,' freq:',a/total)

AA total: 9689333
A  freq: 0.06993432881293274
C  freq: 0.02221700915842195
D  freq: 0.048823278134831366
E  freq: 0.07045025699911439
F  freq: 0.03672585099510978
G  freq: 0.065101798028822
H  freq: 0.025623951617722292
I  freq: 0.043092646315283
K  freq: 0.056516274133627156
L  freq: 0.10046222995948224
M  freq: 0.02163461612889143
N  freq: 0.035629697111245945
P  freq: 0.0619610245617526
Q  freq: 0.04761968651505733
R  freq: 0.05628075740610835
S  freq: 0.08411672919075028
T  freq: 0.053535160779384916
V  freq: 0.06121793935661
W  freq: 0.012180095368793704
Y  freq: 0.026865626354259885


In [9]:
AA_freq_df = pd.DataFrame({'Frequency':AA_freq},index=AA)
AA_freq_df

Unnamed: 0,Frequency
A,0.069934
C,0.022217
D,0.048823
E,0.07045
F,0.036726
G,0.065102
H,0.025624
I,0.043093
K,0.056516
L,0.100462


In [11]:
AA_freq_df.to_csv('/Users/danieltadros/Desktop/PhD/Chapter_2/data/proteome/AA_frequency_Mouse.csv')

## Methionine (M) Correction 

In [19]:
SQ =[]
ID = []
for record in SeqIO.parse("/Users/danieltadros/Desktop/PhD/Chapter_2/data/proteome/uniprot-Mouseproteome%3AUP000000589+reviewed%3Ayes.fasta", "fasta"):

    SQ.append(record.seq[1:])
    ID.append('> '+record.id) 
    
    
AA = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

total = sum([len(SQ[i]) for i in range(len(SQ))])
print('AA total:',total)

AA_freq = []
for key in AA:
    a = 0
    for i in range(len(SQ)):
        counter = Counter(list(SQ[i]))
        a+=counter[key]
    
    AA_freq.append(a/total)
    print(key,' freq:',a/total)
    
    
AA_freq_df = pd.DataFrame({'Frequency':AA_freq},index=AA)
AA_freq_df.to_csv('AA_frequency_Mouse_Methionine.csv')
AA_freq_df

AA total: 9672231
A  freq: 0.07005725979869587
C  freq: 0.022256292265972557
D  freq: 0.04890495274564886
E  freq: 0.07057006806392445
F  freq: 0.036790787978492244
G  freq: 0.06521659790796973
H  freq: 0.025669051948821322
I  freq: 0.043168633999746284
K  freq: 0.05661579009020773
L  freq: 0.10063965593873844
M  freq: 0.019919602829998582
N  freq: 0.03569248914754
P  freq: 0.06206996090147144
Q  freq: 0.047702851596493095
R  freq: 0.05637985693269733
S  freq: 0.08426525379718496
T  freq: 0.05362971583288282
V  freq: 0.061325975361837405
W  freq: 0.012201321494492842
Y  freq: 0.02691312893581636


Unnamed: 0,Frequency
A,0.070057
C,0.022256
D,0.048905
E,0.07057
F,0.036791
G,0.065217
H,0.025669
I,0.043169
K,0.056616
L,0.10064


In [10]:
AAs=['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

PWM_o = []
PWM_c = []
for q in Alleles_M:

    PWM = pd.read_csv('/Users/danieltadros/Desktop/PhD/Chapter_2/data/Peptides_all/class1_9/PWMs/2_PWMs_blosum/PWM_'+q+'.csv')
    PWM_o.append(PWM)
    PWM_p = PWM.copy()
    
    for i in range(20):
        PWM_p.iloc[i,1:] = PWM_p.iloc[i,1:]/AA_freq[i]
        
    for i in range(9):
        PWM_p[str(i+1)] = PWM_p[str(i+1)].to_numpy()/sum(PWM_p[str(i+1)].to_numpy())   
        
    PWM_c.append(PWM_p)
    
    
    
    PWM_p.to_csv('/Users/danieltadros/Desktop/PhD/Chapter_2/data/Peptides_all/class1_9/PWMs/4_PWMs_corr/PWM_'+q+'.csv',index=False)
    
    
    
    

    

In [11]:
len()

SyntaxError: unexpected EOF while parsing (3256977527.py, line 1)

In [16]:
def Freq_Corr(Alleles,Alleles_H,AA_Human_freq,AA_Mouse_freq,PWMs):
    PWM_c = []
    for q in range(len(Alleles)):
        if Alleles[q] in Alleles_H:
            PWM = PWMs[q]
            
            for i in range(20):
                PWM.iloc[i,:] = PWM.iloc[i,:]/AA_Human_freq[i]

            for i in range(9):
                PWM[str(i+1)] = PWM[str(i+1)].to_numpy()/sum(PWM[str(i+1)].to_numpy())   

            PWM_c.append(PWM)
            
        else:
            PWM = PWMs[q]
            
            for i in range(20):
                PWM.iloc[i,:] = PWM.iloc[i,:]/AA_Mouse_freq[i]

            for i in range(9):
                PWM[str(i+1)] = PWM[str(i+1)].to_numpy()/sum(PWM[str(i+1)].to_numpy())   

            PWM_c.append(PWM)
            
    return PWM_c
            
            

In [8]:
AAs=['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']


test = pd.DataFrame({'numbers': [i for i in range(len(AAs))],'numbers2': [i+2 for i in range(len(AAs))],'numbers3': [i+5 for i in range(len(AAs))]})
test.index = AAs

In [9]:
test

Unnamed: 0,numbers,numbers2,numbers3
A,0,2,5
C,1,3,6
D,2,4,7
E,3,5,8
F,4,6,9
G,5,7,10
H,6,8,11
I,7,9,12
K,8,10,13
L,9,11,14


numbers     0.5
numbers2    1.5
numbers3    3.0
Name: C, dtype: float64