In [1]:
import os
import pickle
import pandas as pd
import numpy as np
np.random.seed(12345)

def fasta_reader(file):
    '''Converts .fasta to a pandas dataframe with accession as index
    and sequence in a column 'sequence'
    '''
    fasta_df = pd.read_csv(file, sep='>', lineterminator='>', header=None)
    fasta_df[['Accession', 'Sequence']] = fasta_df[0].str.split('\n', 1, \
                                        expand=True)
    fasta_df['Accession'] = fasta_df['Accession']
    fasta_df['Sequence'] = fasta_df['Sequence'].replace('\n', '', regex=True).\
                            astype(str).str.upper().replace('U', 'T')
    total_seq = fasta_df.shape[0]
    fasta_df.drop(0, axis=1, inplace=True)
    fasta_df = fasta_df[fasta_df.Sequence != '']
    fasta_df = fasta_df[fasta_df.Sequence != 'NONE']
    final_df = fasta_df.dropna()
    remained_seq = final_df.shape[0]
    if total_seq != remained_seq:
        print("{} sequences were removed due to inconsistencies in"
                      "provided file.".format(total_seq-remained_seq))
    return final_df


root_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))

In [2]:
path = root_dir + '/Cross_validation/Clustering/10'
clusters = []



for file in os.listdir(path):
    try:
        clusters.append(fasta_reader(os.path.join(path, file)))
    except Exception:
        pass

len(clusters) #Number of clusters

4368

In [24]:
#remove character ----- from alignment

for index, c in enumerate(clusters):
    c['Prot'] = c['Sequence'].str.replace('-','')
    c['Cluster'] = index

all_clusters = pd.concat(clusters)
all_clusters = all_clusters.reset_index(drop=True)

#we will merge this with full pET sequences based on accession, so we dont need these
#protein sequecnes from here because tags were removed before clustering
#After merging with pET sequences based on accession, we will get tag added sequences

df_ = all_clusters[['Accession', 'Cluster']].copy()

#We want 10 sets each covering almost 10% of sequences (~1221 sequences per cluster)
df0 = df_.loc[0:1224]
df1 = df_.loc[1225:2440]
df2 = df_.loc[2441:3673]
df3 = df_.loc[3674:4915]
df4 = df_.loc[4916:6116]
df5 = df_.loc[6117:7335]
df6 = df_.loc[7336:8555]
df7 = df_.loc[8556:9787]
df8 = df_.loc[9788:10997]
df9 = df_.loc[10998:12216]

dfs_cv = [df0, df1, df2, df3, df4, df5, df6, df7, df8, df9]
#check for seperation between clusters in these dfs

for i, d in enumerate(dfs_cv):
    try:
        #check if tail of one overlaps head of next cluster
        if d.tail(1)['Cluster'].values != dfs_cv[i+1].head(1)['Cluster'].values:
            print('Pass')
        else:
            print('Overlapped dfs! : ', i, ' and ', i + 1)
    except Exception as exp: #catch for off by one error
        print('Last item reached.')

cv_test = [] #one df per item in list
cv_train = [] #nine remiaining df per item in list
for i, v in enumerate(dfs_cv):
    cv_test.append(v) #pick for testing
    cv_train.append([x for j, x in enumerate(dfs_cv) if j!=i ]) #pick remaining for training


    
cv_train_concat_nine = [pd.concat(i) for i in cv_train]


Pass
Pass
Pass
Pass
Pass
Pass
Pass
Pass
Pass
Last item reached.


In [25]:

pET = pd.read_pickle(root_dir + '/results/pET_complete.pkl.gz')

#merge with cleaned pET to remove problematic sequences


training_merged = [i.merge(pET, on='Accession') for i in cv_train_concat_nine]
testing_merged = [i.merge(pET, on='Accession') for i in cv_test]


with open(root_dir + '/results/training_10_with_tag.pkl', 'wb') as handle:
    pickle.dump(training_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(root_dir + '/results/testing_10_with_tag.pkl', 'wb') as handle:
    pickle.dump(testing_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Export tags without tags

In [26]:
pET_notag = pET.copy()
pET_notag['Protein']['pET15'] = pET_notag['Protein']['pET15'].apply(lambda x:x[10:])
pET_notag['Protein']['pET21'] = pET_notag['Protein']['pET21'].apply(lambda x:x[:-8])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [27]:
training_merged_notag = [i.merge(pET_notag, on='Accession') for i in cv_train_concat_nine]
testing_merged_notag = [i.merge(pET_notag, on='Accession') for i in cv_test]

In [28]:
with open(root_dir + '/results/training_10_without_tag.pkl', 'wb') as handle:
    pickle.dump(training_merged_notag, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(root_dir + '/results/testing_10_without_tag.pkl', 'wb') as handle:
    pickle.dump(testing_merged_notag, handle, protocol=pickle.HIGHEST_PROTOCOL)