In [1]:
import numpy as np
import pandas as pd
import functions

### pET data

In [18]:
#classes for pET15 and 21
classes = pd.read_csv('datasets_used/class.txt', sep='\t')

#pET15 
pET15_ =  pd.read_csv('datasets_used/pET15_NESG.txt', sep='\t', header=None)

pET15_.rename(columns={0:'Accession', 4:'Sequence', 6:'Type'}, inplace=True)
pET15_['Sequence'] = pET15_['Sequence'].apply(lambda x:'ATGGGCCATCACCATCACCATCACAGCCAT' + x) #His tag
#copy sequence and accession
pET15_seq = pET15_[['Accession', 'Sequence']].copy()
pET15 = pd.merge(pET15_seq, classes, on="Accession").dropna()
pET15 = pET15.drop_duplicates(keep=False, inplace=False).copy()
pET15['Accession'] = pET15['Accession'] + '_pET15'

#pET21
pET21_ =  functions.fasta_reader('datasets_used/pET21_NESG.fas')

pET21 = pd.merge(pET21_, classes, on="Accession").dropna()
pET21 = pET21.drop_duplicates(keep=False, inplace=False).copy()
pET21['Accession'] = pET21['Accession'] + '_pET21'

pET_merged = pd.concat([pET15, pET21], sort=False, keys=['pET15', 'pET21']) #keys make easier to reference later
#check for unknown bases
pET_merged['unknown_bases'] = pET_merged['Sequence'].apply(lambda x: 'X' in x or 'Z' in x or '*' in x or 'N' in x)
pET = pET_merged.loc[pET_merged['unknown_bases'] == False].drop('unknown_bases', 1)

#Solubility
solubility_test = pET.loc[pET['Class'] != 0].copy()
solubility_test['Solubility'] = solubility_test['Class'].replace(1,0)
solubility_test['Solubility'] = solubility_test['Solubility'].replace(2,1)

solubility_test['Protein'] = solubility_test['Sequence'].apply(functions.translate)

#remove seq with premature stop codons
solubility_test['stop'] = solubility_test['Protein'].apply(lambda x: 'stop' in x)
solubility_ = solubility_test.loc[solubility_test['stop'] == False].drop('stop', 1)


final_df = solubility_.drop_duplicates(keep=False, inplace=False).copy()
final_df.to_pickle("results/pET_complete.pkl.gz", compression='gzip')
final_df.shape

(12216, 5)

In [30]:
final_df.head()

Unnamed: 0,Unnamed: 1,Accession,Sequence,Class,Solubility,Protein,Accession_fasta
pET15,2,BbCD00584211_pET15,ATGGGCCATCACCATCACCATCACAGCCATATGATTTTTGTAACTA...,2,1,MIFVTKLNGDGYYLNPYHIESIEANPDTTILLMNGKKLIVKEKVEE...,>BbCD00584211_pET15
pET15,3,BcCD00331746_pET15,ATGGGCCATCACCATCACCATCACAGCCATATGGATAAAGAGAATC...,1,0,MDKENPRVGDKYITVQKVGKKIFEAEVEILEYDAPHIISLGSEMKQ...,>BcCD00331746_pET15
pET15,4,BcCD00331747_pET15,ATGGGCCATCACCATCACCATCACAGCCATATGGCACATACTACTA...,2,1,MAHTTTSMEIFGSPEQVWQLIGGFNSLPDWLPYIPSSKLTEGGRVR...,>BcCD00331747_pET15
pET15,5,BcCD00341383_pET15,ATGGGCCATCACCATCACCATCACAGCCATATGGATAAAGAGAATC...,1,0,MDKENPRVGDKYITVQKVGKKIFEAEVEILEYDAPHIISLGSEMKQ...,>BcCD00341383_pET15
pET15,6,BcCD00591009_pET15,ATGGGCCATCACCATCACCATCACAGCCATATGAGTTTGAAGGGGA...,2,1,MSLKGKRIGFGFTGSHCTYEEVMPHLEKLIAEGAEVRPVVSYTVQS...,>BcCD00591009_pET15


### Making fasta for clustering


In [21]:
#Fasta for clustering and cross validation

final_df = pd.read_pickle('results/pET_complete.pkl.gz', compression='infer')
final_df['Accession_fasta'] = '>' + final_df['Accession']
final_df['Protein']['pET15'] = final_df['Protein']['pET15'].apply(lambda x:x[10:])
final_df['Protein']['pET21'] = final_df['Protein']['pET21'].apply(lambda x:x[:-8])
final_df.to_csv('results/pET_full_without_his_tag.fa', columns=['Accession_fasta', 'Protein'],\
                index=None, sep='\n', header=None)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### eSOL data

In [24]:
#ecoli dataset

ecol = functions.fasta_reader('datasets_used/ecoli_W3110.faa')
ecol['ECK number'] = ecol['Accession'].apply(lambda x:x.split(' ')[2])
ecol['JW'] = ecol['Accession'].apply(lambda x:x.split(' ')[0])

#get solubility
sol_values = pd.read_csv('datasets_used/all_data_esol.tab', sep='\t')
sol_values.head()
sol = sol_values[['ECK number', 'Solubility(%)']]

#merge with solubility
ecol_merg = ecol.merge(sol, on='ECK number', how='left').dropna()

#filter unknown bases
ecol_merg['unknown_bases'] = ecol_merg['Sequence'].apply(lambda x: 'X' in x or 'Z' in x or '*' in x )
ecoli_w3110 = ecol_merg.loc[ecol_merg['unknown_bases'] == False]

#add tags
#https://www.pnas.org/content/106/11/4201.long

ecoli_w3110['Sequence'] = ecoli_w3110['Sequence'].apply(lambda x: 'MRGSHHHHHHTDPALRA' + x + 'GLCGR')
ecoli_w3110 =  ecoli_w3110.dropna()

ecoli_w3110 = ecoli_w3110[['Accession', 'Sequence', 'ECK number', 'JW', 'Solubility(%)']].copy()

ecoli_w3110.to_pickle('results/ecoli.pkl.gz', compression='gzip')
ecoli_w3110.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Accession,Sequence,ECK number,JW,Solubility(%)
2,JW0002 thrB ECK0003,MRGSHHHHHHTDPALRAMVKVYAPASSANMSVGFDVLGAAVTPVDG...,ECK0003,JW0002,32.0
3,JW0003 thrC ECK0004,MRGSHHHHHHTDPALRAMKLYNLKDHNEQVSFAQAVTQGLGKNQGL...,ECK0004,JW0003,18.0
4,JW0004 yaaX ECK0005,MRGSHHHHHHTDPALRAMKKMQSIVLALSLVLVAPMAAQAAEITLV...,ECK0005,JW0004,78.0
5,JW0005 yaaA ECK0006,MRGSHHHHHHTDPALRAMLILISPAKTLDYQSPLTTTRYTLPELLD...,ECK0006,JW0005,7.0
7,JW0007 talB ECK0008,MRGSHHHHHHTDPALRAMTDKLTSLRQYTTVVADTGDIAAMKLYQP...,ECK0008,JW0007,85.0


In [33]:
aaa = 'MRGSHHHHHHTDPALRA'


'GSHHHHHHTD'