In [2]:
import jaxlib
from jax_unirep import get_reps
import pandas as pd
import os
import numpy as np

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')



In [4]:
# Read the secreted, cytoplasmic and transmembrane protein tables of the human
# proteome, tagging every row of each table with its location label in `Class`.
# Note: the transmembrane table is labelled 'membrane', not 'transmembrane'.
secreted = pd.read_csv('secreted.csv').assign(Class='secreted')
cytoplasm = pd.read_csv('cytoplasm.csv').assign(Class='cytoplasm')
transmembrane = pd.read_csv('transmembrane.csv').assign(Class='membrane')

In [8]:
# Load the precomputed table of UniRep representations. Judging by the preview
# shown below, it has one row per (ID, location) annotation with the columns
# ID, name, organism, location, seq, seq_len and UniRep (a vector per protein).
# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
df = pd.read_pickle('combined_human_UniRep_dataset.pkl')
# Preview the first rows (last expression of the cell, so it is displayed).
df.head()

Unnamed: 0,ID,name,organism,location,seq,seq_len,UniRep
0,Q96PX8,SLIK1_HUMAN,human,secreted,MLLWILLLETSLCFAAGNVTGDVCKEKICSCNEIEGDLHVDCEKKG...,696,"[0.004320841282606125, -0.16935111582279205, 0..."
1,Q9BQ16,TICN3_HUMAN,human,secreted,MLKVSAVLCVCAAAWCSQSLAAAAAVAAAGGRSDGGNFLDDKQWLT...,436,"[0.003699700115248561, -0.170448899269104, 0.0..."
2,Q9H0E2,TOLIP_HUMAN,human,secreted,MATTVSTQRGPVYIGELPQDFLRITPTQQQRQVQLDAQAAQQLQYG...,274,"[0.006434731185436249, -0.030111152678728104, ..."
3,P62328,TYB4_HUMAN,human,secreted,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,44,"[0.014509799890220165, -0.10900980979204178, 0..."
4,Q9UBV4,WNT16_HUMAN,human,secreted,MDRAALLGLARLCALWAALLVLFPYGAQGNWMWLGIASFGVPEKLG...,365,"[0.003950728569179773, -0.02248261496424675, 0..."


In [34]:
# Build `unirep_secreted`: one row per protein of `secreted` that has a UniRep
# vector stored in `df` under location 'secreted'. Columns are ID, Class and the
# vector spread over Vec_0 .. Vec_1899; row labels are those of `secreted`.
#
# The lookup is vectorised instead of filtering `df` once per protein and
# writing 1900 cells one at a time, so no per-row progress counter is printed
# and the Vec_* columns come out as numeric (float) columns.
# NOTE(review): the same steps are repeated for the other location classes in
# this notebook; consider factoring them into one shared helper.
id_key = 'Entry'      # column of the per-class tables holding the protein ID
vec_prefix = 'Vec_{}'
# Names of the vector columns; only the first 1900 vector entries are used.
vec_column_names = [vec_prefix.format(i) for i in range(1900)]

# ID -> UniRep vector for rows annotated 'secreted'. When an ID occurs more
# than once only its first row is kept. Missing IDs are dropped so that a
# missing ID in `secreted` can never match.
secreted_reps = (
    df.loc[df['location'] == 'secreted']
    .dropna(subset=['ID'])
    .drop_duplicates(subset='ID', keep='first')
    .set_index('ID')['UniRep']
)
secreted_rep_by_id = secreted_reps.to_dict()

# Proteins without a stored vector for this location are skipped.
secreted_matched = secreted.loc[secreted[id_key].isin(secreted_reps.index)]

# Stack the vectors into an (n_proteins, 1900) float array; the reshape keeps
# the column count right even when nothing matched.
secreted_vectors = np.array(
    [
        np.asarray(secreted_rep_by_id[protein_id])[:len(vec_column_names)]
        for protein_id in secreted_matched[id_key]
    ],
    dtype=float,
).reshape(len(secreted_matched), len(vec_column_names))

unirep_secreted = pd.DataFrame(
    secreted_vectors, index=secreted_matched.index, columns=vec_column_names
)
# Put the identifying columns in front: ID first, then Class.
unirep_secreted.insert(0, 'Class', secreted_matched['Class'].to_numpy())
unirep_secreted.insert(0, 'ID', secreted_matched[id_key].to_numpy())

unirep_secreted.head()

row 1675

Unnamed: 0,ID,Class,Vec_0,Vec_1,Vec_2,Vec_3,Vec_4,Vec_5,Vec_6,Vec_7,...,Vec_1890,Vec_1891,Vec_1892,Vec_1893,Vec_1894,Vec_1895,Vec_1896,Vec_1897,Vec_1898,Vec_1899
0,Q8IWL1,secreted,0.001589,-0.088675,0.072082,-0.00778,-0.405418,0.004649,-0.142137,-0.04666,...,0.041924,0.071974,0.0465,0.009601,-0.2152,0.076984,0.037502,0.033136,0.13122,0.164174
1,Q8IWL2,secreted,0.001504,-0.09187,0.073,-0.007296,-0.410786,0.007046,-0.139491,-0.04483,...,0.040061,0.070792,0.044514,0.007404,-0.211015,0.077411,0.037028,0.038521,0.115775,0.152708
2,P35247,secreted,-0.005619,0.155841,0.061187,-0.011045,-0.292977,0.001631,-0.166412,-0.047132,...,0.046157,0.061953,0.041291,0.000301,-0.183375,0.066759,0.061715,0.017694,0.341299,0.159945
3,Q9BQ16,secreted,0.0037,-0.170449,0.070491,-0.015611,-0.260699,-0.001906,-0.065601,-0.026869,...,-0.000768,-0.003258,0.042792,0.00599,-0.077984,0.02641,0.061212,-0.009403,-0.055176,-0.015667
4,Q9UBV4,secreted,0.003951,-0.022483,0.093345,-0.027903,-0.014826,0.063857,-0.009284,-0.064604,...,0.000366,-0.009532,-0.066172,-0.020427,-0.14753,0.080772,0.15546,-0.113068,-0.084242,0.020746


In [37]:
# Build `unirep_cytoplasm`: one row per protein of `cytoplasm` that has a UniRep
# vector stored in `df` under location 'cytoplasm'. Columns are ID, Class and
# the vector spread over `vec_column_names`; row labels are those of `cytoplasm`.
#
# The lookup is vectorised instead of filtering `df` once per protein and
# writing every vector cell one at a time, so no per-row progress counter is
# printed and the vector columns come out as numeric (float) columns.
# Relies on `id_key` and `vec_column_names` defined in an earlier cell.

# ID -> UniRep vector for rows annotated 'cytoplasm'. When an ID occurs more
# than once only its first row is kept. Missing IDs are dropped so that a
# missing ID in `cytoplasm` can never match.
cytoplasm_reps = (
    df.loc[df['location'] == 'cytoplasm']
    .dropna(subset=['ID'])
    .drop_duplicates(subset='ID', keep='first')
    .set_index('ID')['UniRep']
)
cytoplasm_rep_by_id = cytoplasm_reps.to_dict()

# Proteins without a stored vector for this location are skipped.
cytoplasm_matched = cytoplasm.loc[cytoplasm[id_key].isin(cytoplasm_reps.index)]

# Stack the vectors into an (n_proteins, n_vector_columns) float array; the
# reshape keeps the column count right even when nothing matched.
cytoplasm_vectors = np.array(
    [
        np.asarray(cytoplasm_rep_by_id[protein_id])[:len(vec_column_names)]
        for protein_id in cytoplasm_matched[id_key]
    ],
    dtype=float,
).reshape(len(cytoplasm_matched), len(vec_column_names))

unirep_cytoplasm = pd.DataFrame(
    cytoplasm_vectors, index=cytoplasm_matched.index, columns=vec_column_names
)
# Put the identifying columns in front: ID first, then Class.
unirep_cytoplasm.insert(0, 'Class', cytoplasm_matched['Class'].to_numpy())
unirep_cytoplasm.insert(0, 'ID', cytoplasm_matched[id_key].to_numpy())

unirep_cytoplasm.head()

row 4820

Unnamed: 0,ID,Class,Vec_0,Vec_1,Vec_2,Vec_3,Vec_4,Vec_5,Vec_6,Vec_7,...,Vec_1890,Vec_1891,Vec_1892,Vec_1893,Vec_1894,Vec_1895,Vec_1896,Vec_1897,Vec_1898,Vec_1899
4,A0MZ66,cytoplasm,0.008685,-0.031609,0.037371,-0.006343,-0.145975,-0.019995,-0.071183,-0.00738,...,0.063738,0.014633,-0.017102,0.038875,-0.039739,-0.058235,0.041217,-0.008407,0.16155,0.080602
6,Q6ZVD7,cytoplasm,0.005687,-0.046836,0.0252,-0.008284,-0.050151,0.0176,-0.378956,-0.018484,...,0.031221,0.066671,-0.099216,0.03595,-0.033755,-0.043766,0.052715,0.001847,0.249152,0.030248
7,Q9NZ72,cytoplasm,0.005726,-0.045255,0.026491,-0.006399,0.01086,-0.006465,0.056769,-0.028981,...,0.052808,0.051692,-0.105064,0.040413,-0.065614,-0.012532,0.118546,-0.012773,0.065826,-0.00384
10,Q9BQE3,cytoplasm,0.004798,0.054965,0.141611,-0.011815,-0.232411,0.030892,0.013953,-0.042677,...,0.015042,0.029746,-0.098209,0.101909,-0.115275,0.120788,-0.005355,-0.598971,-0.060351,0.146853
12,Q9H0E2,cytoplasm,0.006435,-0.030111,0.027193,-0.001028,0.110938,0.016725,-0.276211,-0.016337,...,0.059971,0.148068,-0.13021,0.038499,-0.122797,0.037896,0.09022,0.020691,0.160191,-0.034112


In [38]:
# Build `unirep_transmembrane`: one row per protein of `transmembrane` that has
# a UniRep vector stored in `df` under location 'membrane'. Columns are ID,
# Class and the vector spread over `vec_column_names`; row labels are those of
# `transmembrane`.
#
# The lookup is vectorised instead of filtering `df` once per protein and
# writing every vector cell one at a time, so no per-row progress counter is
# printed and the vector columns come out as numeric (float) columns.
# Relies on `id_key` and `vec_column_names` defined in an earlier cell.

# ID -> UniRep vector for rows annotated 'membrane' (the location label used
# for transmembrane proteins). When an ID occurs more than once only its first
# row is kept. Missing IDs are dropped so that a missing ID in `transmembrane`
# can never match.
transmembrane_reps = (
    df.loc[df['location'] == 'membrane']
    .dropna(subset=['ID'])
    .drop_duplicates(subset='ID', keep='first')
    .set_index('ID')['UniRep']
)
transmembrane_rep_by_id = transmembrane_reps.to_dict()

# Proteins without a stored vector for this location are skipped.
transmembrane_matched = transmembrane.loc[
    transmembrane[id_key].isin(transmembrane_reps.index)
]

# Stack the vectors into an (n_proteins, n_vector_columns) float array; the
# reshape keeps the column count right even when nothing matched.
transmembrane_vectors = np.array(
    [
        np.asarray(transmembrane_rep_by_id[protein_id])[:len(vec_column_names)]
        for protein_id in transmembrane_matched[id_key]
    ],
    dtype=float,
).reshape(len(transmembrane_matched), len(vec_column_names))

unirep_transmembrane = pd.DataFrame(
    transmembrane_vectors,
    index=transmembrane_matched.index,
    columns=vec_column_names,
)
# Put the identifying columns in front: ID first, then Class.
unirep_transmembrane.insert(0, 'Class', transmembrane_matched['Class'].to_numpy())
unirep_transmembrane.insert(0, 'ID', transmembrane_matched[id_key].to_numpy())

unirep_transmembrane.head()

row 1475

Unnamed: 0,ID,Class,Vec_0,Vec_1,Vec_2,Vec_3,Vec_4,Vec_5,Vec_6,Vec_7,...,Vec_1890,Vec_1891,Vec_1892,Vec_1893,Vec_1894,Vec_1895,Vec_1896,Vec_1897,Vec_1898,Vec_1899
0,Q6ZRP7,membrane,0.003171,0.019411,0.086529,-0.011284,0.062487,0.017807,-0.072231,-0.025005,...,0.002591,0.044981,-0.084792,0.040491,-0.072879,0.040209,0.095638,-0.005442,-0.118015,0.090927
5,P43629,membrane,0.010063,-0.117323,0.113134,-0.001146,-0.351542,0.013152,-0.150625,0.007325,...,0.035266,-0.021426,0.061176,0.027453,-0.044761,-0.015849,-0.125025,0.101305,0.052323,0.055527
6,Q68D85,membrane,0.005351,-0.129567,0.098927,-0.00086,-0.417339,0.004033,-0.227764,-0.017276,...,0.046959,-0.005532,0.029854,0.041634,-0.069794,0.012702,-0.086968,0.091976,0.060564,-0.019211
8,P23469,membrane,0.01088,-0.053088,0.048134,-0.014136,0.028206,0.010817,-0.071357,-0.065103,...,0.021367,0.079362,-0.006473,-0.009364,-0.20894,0.066936,0.10282,0.050409,-0.04965,-0.110688
10,Q68DV7,membrane,0.007554,-0.031791,0.037692,-0.021195,-0.049626,0.025078,-0.408869,-0.042369,...,-0.005876,0.039868,-0.02642,-0.001342,-0.0095,-0.030475,0.046674,0.016381,0.164941,0.097364


In [47]:
# Stack the three per-class tables into one data set. Each table keeps the row
# labels of its source table, so labels can repeat across classes here.
human_unirep_part = pd.concat([unirep_secreted, unirep_cytoplasm, unirep_transmembrane])

# Persist the combined table; the row labels are written as the first,
# unnamed CSV column.
human_unirep_part.to_csv('human_testing_data.csv')

# Total number of rows. Kept as the last expression of the cell so the
# notebook actually displays it (mid-cell expressions are silently discarded).
len(human_unirep_part.index)