In [2]:
import jaxlib
from jax_unirep import get_reps
import pandas as pd
import os
import numpy as np

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')



In [3]:
# Human-proteome protein tables (secreted, cytoplasmic, transmembrane), one CSV
# per localisation. Each frame gets a 'Class' column holding its label.
secreted = pd.read_csv('secreted.csv').assign(Class='secreted')
cytoplasm = pd.read_csv('cytoplasm.csv').assign(Class='cytoplasm')
transmembrane = pd.read_csv('transmembrane.csv').assign(Class='membrane')

In [4]:
# Load the combined human dataset; the preview below shows one row per protein
# with its ID, location, sequence and a 'UniRep' vector column.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
df = pd.read_pickle('combined_human_UniRep_dataset.pkl')
df.head()

Unnamed: 0,ID,name,organism,location,seq,seq_len,UniRep
0,Q96PX8,SLIK1_HUMAN,human,secreted,MLLWILLLETSLCFAAGNVTGDVCKEKICSCNEIEGDLHVDCEKKG...,696,"[0.004320841282606125, -0.16935111582279205, 0..."
1,Q9BQ16,TICN3_HUMAN,human,secreted,MLKVSAVLCVCAAAWCSQSLAAAAAVAAAGGRSDGGNFLDDKQWLT...,436,"[0.003699700115248561, -0.170448899269104, 0.0..."
2,Q9H0E2,TOLIP_HUMAN,human,secreted,MATTVSTQRGPVYIGELPQDFLRITPTQQQRQVQLDAQAAQQLQYG...,274,"[0.006434731185436249, -0.030111152678728104, ..."
3,P62328,TYB4_HUMAN,human,secreted,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,44,"[0.014509799890220165, -0.10900980979204178, 0..."
4,Q9UBV4,WNT16_HUMAN,human,secreted,MDRAALLGLARLCALWAALLVLFPYGAQGNWMWLGIASFGVPEKLG...,365,"[0.003950728569179773, -0.02248261496424675, 0..."


In [52]:
def add_unirep_vector(unirep_df, source_df, id_key='Entry', vec_dim=1900):
    """Attach precomputed UniRep vectors to the proteins listed in ``source_df``.

    Each row of ``source_df`` is looked up in ``unirep_df`` by its identifier,
    restricted to the ``unirep_df`` rows whose ``'location'`` equals the class
    label of ``source_df``. Source rows without a match are dropped; when
    several ``unirep_df`` rows match, the first one is used.

    Parameters
    ----------
    unirep_df : pandas.DataFrame
        Must contain the columns ``'ID'``, ``'location'`` and ``'UniRep'``.
        Every ``'UniRep'`` entry that gets matched must be a sequence of
        exactly ``vec_dim`` numbers.
    source_df : pandas.DataFrame
        Must contain the columns ``id_key``, ``'Sequence'`` and ``'Class'``.
        The class label is read from the first row only, so all rows are
        assumed to share the same label.
    id_key : str, default 'Entry'
        Column of ``source_df`` holding the identifier that is compared with
        ``unirep_df['ID']``. It is also the value written to the output
        ``'ID'`` column.
    vec_dim : int, default 1900
        Number of components per vector, i.e. how many ``Vec_*`` columns
        are produced.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Class``, ``Sequence``, ``Vec_0`` ... ``Vec_<vec_dim-1>``,
        indexed by the index labels of the matched ``source_df`` rows. The
        frame is empty (but has all columns) when ``source_df`` is empty or
        nothing matches.

    Raises
    ------
    ValueError
        If a matched vector does not have exactly ``vec_dim`` components.
    """
    vec_column_names = ['Vec_{}'.format(i) for i in range(vec_dim)]
    columns = ['ID', 'Class', 'Sequence'] + vec_column_names

    if source_df.empty:
        return pd.DataFrame(columns=columns)

    # Positional access: the source index is not guaranteed to contain label 0.
    classification = source_df['Class'].iloc[0]
    print(classification)

    # Build the ID -> vector lookup once instead of rescanning unirep_df for
    # every source row. setdefault keeps the first vector seen for an ID.
    in_class = unirep_df[unirep_df['location'] == classification]
    vector_by_id = {}
    for protein_id, vec in zip(in_class['ID'], in_class['UniRep']):
        vector_by_id.setdefault(protein_id, vec)

    row_labels, ids, sequences, vectors = [], [], [], []
    rows = zip(source_df.index, source_df[id_key], source_df['Sequence'])
    for label, uniprotid, sequence in rows:
        # Carriage return keeps the progress counter on a single output line.
        print('row {}'.format(label), end='\r')

        # A missing identifier can never match; skip it like any other miss.
        if pd.isna(uniprotid) or uniprotid not in vector_by_id:
            continue

        vec = np.asarray(vector_by_id[uniprotid])
        if vec.shape != (vec_dim,):
            raise ValueError(
                'UniRep vector for {} has shape {}, expected ({},)'.format(
                    uniprotid, vec.shape, vec_dim
                )
            )

        row_labels.append(label)
        ids.append(uniprotid)
        sequences.append(sequence)
        vectors.append(vec)

    if not vectors:
        return pd.DataFrame(columns=columns)

    # Assemble the whole table in one go; growing a DataFrame row by row
    # reallocates on every insertion.
    output_df = pd.DataFrame(
        np.vstack(vectors), index=row_labels, columns=vec_column_names
    )
    output_df.insert(0, 'ID', ids)
    output_df.insert(1, 'Class', classification)
    output_df.insert(2, 'Sequence', sequences)

    return output_df

In [53]:
# Attach UniRep vectors to the secreted proteins; rows with no match in df are skipped.
unirep_secreted = add_unirep_vector(df, secreted)

secreted
row 1675

In [54]:
# Attach UniRep vectors to the cytoplasmic proteins; rows with no match in df are skipped.
unirep_cytoplasm = add_unirep_vector(df, cytoplasm)

cytoplasm
row 4820

In [55]:
# Attach UniRep vectors to the transmembrane proteins (class label 'membrane');
# rows with no match in df are skipped.
unirep_transmembrane = add_unirep_vector(df, transmembrane)

membrane
row 1475

In [42]:
# Preview the first rows of the secreted feature table.
unirep_secreted.head()

Unnamed: 0,ID,Class,Sequence,Vec_0,Vec_1,Vec_2,Vec_3,Vec_4,Vec_5,Vec_6,...,Vec_1890,Vec_1891,Vec_1892,Vec_1893,Vec_1894,Vec_1895,Vec_1896,Vec_1897,Vec_1898,Vec_1899
0,Q8IWL1,secreted,MWLCPLALNLILMAASGAACEVKDVCVGSPGIPGTPGSHGLPGRDG...,0.001589,-0.088675,0.072082,-0.00778,-0.405418,0.004649,-0.142137,...,0.041924,0.071974,0.0465,0.009601,-0.2152,0.076984,0.037502,0.033136,0.13122,0.164174
1,Q8IWL2,secreted,MWLCPLALNLILMAASGAVCEVKDVCVGSPGIPGTPGSHGLPGRDG...,0.001504,-0.09187,0.073,-0.007296,-0.410786,0.007046,-0.139491,...,0.040061,0.070792,0.044514,0.007404,-0.211015,0.077411,0.037028,0.038521,0.115775,0.152708
2,P35247,secreted,MLLFLLSALVLLTQPLGYLEAEMKTYSHRTMPSACTLVMCSSVESG...,-0.005619,0.155841,0.061187,-0.011045,-0.292977,0.001631,-0.166412,...,0.046157,0.061953,0.041291,0.000301,-0.183375,0.066759,0.061715,0.017694,0.341299,0.159945
3,Q9BQ16,secreted,MLKVSAVLCVCAAAWCSQSLAAAAAVAAAGGRSDGGNFLDDKQWLT...,0.0037,-0.170449,0.070491,-0.015611,-0.260699,-0.001906,-0.065601,...,-0.000768,-0.003258,0.042792,0.00599,-0.077984,0.02641,0.061212,-0.009403,-0.055176,-0.015667
4,Q9UBV4,secreted,MDRAALLGLARLCALWAALLVLFPYGAQGNWMWLGIASFGVPEKLG...,0.003951,-0.022483,0.093345,-0.027903,-0.014826,0.063857,-0.009284,...,0.000366,-0.009532,-0.066172,-0.020427,-0.14753,0.080772,0.15546,-0.113068,-0.084242,0.020746


In [56]:
# Stack the three per-class feature tables into one frame and save it as CSV.
human_unirep_part = pd.concat([unirep_secreted, unirep_cytoplasm, unirep_transmembrane])
# NOTE(review): this row count is not displayed, because it is not the cell's
# last expression.
len(human_unirep_part.index)
# NOTE(review): the index is written to the file as well; its labels come from
# the three concatenated frames and may repeat — confirm downstream readers
# expect that.
human_unirep_part.to_csv('human_testing_data.csv')