In [1]:
import os
import pandas as pd
import numpy as np
np.random.seed(12345)

# os.makedirs(os.path.join(os.getcwd(),'clusters','')) #use this to make clusters folder on this dir

In [2]:
def fasta_reader(file):
    '''Converts .fasta to a pandas dataframe with accession as index
    and sequence in a column 'sequence'
    '''
    fasta_df = pd.read_csv(file, sep='>', lineterminator='>', header=None)
    fasta_df[['Accession', 'Sequence']] = fasta_df[0].str.split('\n', 1, \
                                        expand=True)
    fasta_df['Accession'] = fasta_df['Accession']
    fasta_df['Sequence'] = fasta_df['Sequence'].replace('\n', '', regex=True).\
                            astype(str).str.upper().replace('U', 'T')
    total_seq = fasta_df.shape[0]
    fasta_df.drop(0, axis=1, inplace=True)
    fasta_df = fasta_df[fasta_df.Sequence != '']
    fasta_df = fasta_df[fasta_df.Sequence != 'NONE']
    final_df = fasta_df.dropna()
    remained_seq = final_df.shape[0]
    if total_seq != remained_seq:
        print("{} sequences were removed due to inconsistencies in"
                      "provided file.".format(total_seq-remained_seq))
    return final_df


In [3]:
#path to the clusters folder. Change this to your own
path = '/home/bikash/Documents/Solubility/results/cross_val/clusters'
path = '/home/bikash/Documents/Solubility/results/cross_val/clusters'
clusters = []

#remove usearch and pET fasta from cluster directory before running this

for file in os.listdir(path):
    try:
        clusters.append(fasta_reader(os.path.abspath(os.path.join('clusters/', file))))
    except Exception:
        pass
#     print(os.path.abspath(os.path.join('clusters/', file)))


In [6]:
len(clusters)

4368

In [7]:
#remove character ----- from alignment

for index, c in enumerate(clusters):
    c['Prot'] = c['Sequence'].str.replace('-','')
    c['Cluster'] = index

In [8]:
#merge all clusters because we now have cluster number for each sequence
all_clusters = pd.concat(clusters)
all_clusters = all_clusters.reset_index(drop=True)
all_clusters.tail()

Unnamed: 0,Accession,Sequence,Prot,Cluster
12211,CtCD00384670_pET21,---------MSGPLRSVAILAQEKLGDCVLLTPLVRNLRQAFPDLG...,MSGPLRSVAILAQEKLGDCVLLTPLVRNLRQAFPDLGIHLITFSRA...,4365
12212,CtCD00384671_pET21,MRTRSGSAEFSGPLRSVAILAQEKLGDCVLLTPLVRNLRQAFPDLE...,MRTRSGSAEFSGPLRSVAILAQEKLGDCVLLTPLVRNLRQAFPDLE...,4365
12213,BjCD00540580_pET21,MPKWPDDDVILFDGVCIFCSRWVRFVAKRDTAKRFRFTPIQSDYGA...,MPKWPDDDVILFDGVCIFCSRWVRFVAKRDTAKRFRFTPIQSDYGA...,4366
12214,PmCD00584186_pET15,MPYLVIEHLEDISEWLLLEYKHVAQWWGDKLIFTNVKPKERKILAE...,MPYLVIEHLEDISEWLLLEYKHVAQWWGDKLIFTNVKPKERKILAE...,4367
12215,PmCD00344016_pET21,MPYLVIEHLEDISEWLLLEYKHVAQWWGDKLIFTNVKPKERKILAE...,MPYLVIEHLEDISEWLLLEYKHVAQWWGDKLIFTNVKPKERKILAE...,4367


In [9]:
#merge all clusters because we now have cluster number for each sequence
all_clusters = pd.concat(clusters)
all_clusters = all_clusters.reset_index(drop=True)
all_clusters.tail()

Unnamed: 0,Accession,Sequence,Prot,Cluster
12211,HsCD00598174_pET15,------MPPKDTIVKNITLNFGPQHPAAHGVLRLVMELSGEMVRKC...,MPPKDTIVKNITLNFGPQHPAAHGVLRLVMELSGEMVRKCDPHIGL...,4366
12212,HsCD00600517_pET15,-----------MIVKKITLNFGPQHPAAHGVLRLVMELSGEMVRKC...,MIVKKITLNFGPQHPAAHGVLRLVMELSGEMVRKCDPHIGLLHRGT...,4366
12213,SsCD00535991_pET21,----------------------------------------------...,MSLERQTLKQKLSTLIQPLQTAKRGAPLTNRTLSATTCQQIESLVT...,4366
12214,SsCD00535993_pET21,----------------------------------------------...,MTAKRGAPLTNRTLSATTCQQIESLVTAIEALNPNLSPLLYSPQLL...,4366
12215,AaCD00540113_pET21,MVTVIVQEGEPIEKVLKRFKARVEQEQILTELKRREYYEPPSERKK...,MVTVIVQEGEPIEKVLKRFKARVEQEQILTELKRREYYEPPSERKK...,4367


In [10]:
#we will merge this with full pET sequences based on accession, so we dont need these
#protein sequecnes from here because tags were removed before clustering
#After merging with pET sequences based on accession, we wil get tag added sequences

df_ = all_clusters[['Accession', 'Cluster']].copy()

In [11]:
#We want 10 sets each covering almost 10% of sequences (~1221 sequences per cluster)

#First split the df to 10 parts
test = np.array_split(df_, 10) 
#then check these parts and fix dfs by merging based on cluster at start and end of each df
#by checkiing tail and head of each df.
#based on that we make a list:

df0 = df_.loc[0:1223]
df1 = df_.loc[1224:2443]
df2 = df_.loc[2444:3666]
df3 = df_.loc[3667:4909]
df4 = df_.loc[4910:6113]
df5 = df_.loc[6114:7333]
df6 = df_.loc[7334:8554]
df7 = df_.loc[8555:9785]
df8 = df_.loc[9786:10998]
df9 = df_.loc[10999:12216]

dfs_cv = [df0, df1, df2, df3, df4, df5, df6, df7, df8, df9]

In [12]:
#check for seperation between clusters in these dfs

for i, d in enumerate(dfs_cv):
    try:
        #check if tail of one overlaps head of next cluster
        if d.tail(1)['Cluster'].values != d.head(1)['Cluster'].values:
            print('Pass')
        else:
            print('Overlapped dfs! : ', i, ' and ', i + 1)
    except Exception as exp: #catch for off by one error
        print(str(exp))

Pass
Pass
Pass
Pass
Pass
Pass
Pass
Pass
Pass
Pass


In [13]:
cv_test = [] #one df per item in list
cv_train = [] #nine remiaining df per item in list
for i, v in enumerate(dfs_cv):
    cv_test.append(v) #pick for testing
    cv_train.append([x for j, x in enumerate(dfs_cv) if j!=i ]) #pick remaining for training

In [14]:
cv_train_concat_nine = [pd.concat(i) for i in cv_train]

pET = pd.read_pickle('/home/bikash/Documents/Solubility/Solubility_manuscript/results/pET_complete.pkl.gz')

#merge with cleaned pET to remove problematic sequences

training_merged = [i.merge(pET, on='Accession') for i in cv_train_concat_nine]
testing_merged = [i.merge(pET, on='Accession') for i in cv_test]

Unnamed: 0,Accession,Cluster,Sequence,Class,Solubility,Protein
0,PpCD00338974_pET21,415,ATGGCTAACCACAAGCCGGAAATCGTCATCACCTATTGCACCCAGT...,2,1,MANHKPEIVITYCTQCQWLLRAAWLAQELLSTFADDLGRVALEPGT...
1,PfCD00344501_pET21,415,ATGACTGAACGTAAACCCGAAGTGATCATCACCTATTGCACCCAAT...,2,1,MTERKPEVIITYCTQCQWLLRAAWLAQELLSTFSDDLGKVSLEPAT...
2,HsCD00338620_pET21,416,ATGACTGAGCAGATGACCCTTCGTGGCACCCTCAAGGGCCACAACG...,2,1,MTEQMTLRGTLKGHNGWVTQIATTPQFPDMILSASRDKTIIMWKLT...
3,SsCD00540290_pET21,417,ATGCCCAGTATCTACATCTCAGGTGTCGATCCCACTATAGTTAAAA...,2,1,MPSIYISGVDPTIVKIHEIQGTGTASPLVNNVVTIEAIVVGDFQDG...
4,SsCD00540281_pET21,417,ATGACAGGTGTCGATCCCACTATAGTTAAAATCCATGAGATTCAAG...,1,0,MTGVDPTIVKIHEIQGTGTASPLVNNVVTIEAIVVGDFQDGDGDIS...


In [105]:
# Find pET15 sequences
df = all_clusters.copy()
df['pET15_tag'] = df['Accession'].apply(lambda x: 'pET15' in x)


In [125]:
#add tags
df1 = df.copy()
df1['P'] = df1.loc[df1['pET15_tag'] == True]['Prot'].apply(lambda x:'MGHHHHHHSH' + x)
df1.dropna(inplace=True)

df2 = df.copy()
df2['P'] = df2.loc[df2['pET15_tag'] == False]['Prot'].apply(lambda x:x + 'LEHHHHHH')
df2.dropna(inplace=True)

df_tags = pd.concat([df1, df2])
df_tags.sort_values(['Cluster'], inplace=True)
df_tags = df_tags.reset_index(drop=True)

In [127]:
df_tags = df_tags[['Accession', 'Cluster', 'P']].copy()
df_tags.shape

(12216, 3)

In [18]:
import pickle

with open('/home/bikash/Documents/Solubility/Solubility_manuscript/results/training.pkl', 'wb') as handle:
    pickle.dump(training_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('/home/bikash/Documents/Solubility/Solubility_manuscript/results/testing.pkl', 'wb') as handle:
    pickle.dump(testing_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)