In [2]:
import pandas as pd
import numpy as np

#### Explore the dataset from the Copenhagen Bioinformatics Hackathon

In [3]:
#extract .csv file from the archive
#!gunzip train115k.csv.gz

In [4]:
# Load the hackathon training table; the first CSV column becomes the index.
old_set = pd.read_csv("train115k.csv", index_col=0)
print("Number of unique sequences in the dataset from the Hackaton: ", len(old_set))

# Sanity check: the number of distinct sequences must equal the number of
# rows, i.e. no sequence is listed twice in this file.
assert len(old_set) == old_set['sequence'].nunique()

old_set.head()

Number of unique sequences in the dataset from the Hackaton:  115803


Unnamed: 0,mean_growth_PH,sequence
0,7.0,MKKRAHIISFILILALLFTGCSGNKENTSKEPVKETTEKGTGNIKT...
1,7.0,MGKGKRKKRIALYFKRAAVAMLVMVMLLQPIPGTAGSSVKSVEAAV...
2,7.0,MKVNNKNSARKLLSLFLGLVLIFSTLSFSNQAAAADKGTWAPNTTY...
3,7.0,MRKKVTAVLTALVLTVSSILANPFAYPDAVKADTEGNPAAASNSNG...
4,7.0,MKRQSRIISFLVAVIMIATVIMPATVVQANASGVFIRVNQVGYKPS...


#### Explore the gold_sign_recent dataset

In [5]:
#extract .csv file from the archive
#!gunzip gold_sign_recent.csv.gz

In [6]:
# Load the gold table; the first CSV column becomes the index.
gold_set = pd.read_csv("gold_sign_recent.csv", index_col=0)
print("Number of sequences in the gold dataset: ", len(gold_set))
# A row counts as duplicated when its sequence_hashkey already appeared earlier.
print("Number of duplicated sequences in the gold dataset: ",
      gold_set['sequence_hashkey'].duplicated().sum())

gold_set.head()

Number of sequences in the gold dataset:  247651
Number of duplicated sequences in the gold dataset:  1


Unnamed: 0,key,sequence,sequence_hashkey,pfam_hits,value_cur_count,value_cur_mean,value_cur_std,species_taxid
0,TREMBL:A0A1M7YMF3,MKKRAHIISFILILALLFTGCSGNKENTSKEPVKETTEKGTGNIKT...,4140c1bfe158c54bf8341936919bfbdc,PF00128,1.0,6.0,,100134
1,TREMBL:A0A1M7YE49,MGKGKRKKRIALYFKRAAVAMLVMVMLLQPIPGTAGSSVKSVEAAV...,f0234bf7d0ae0e73f36e583b04330e11,"PF00722,PF02018,PF00041",1.0,6.0,,100134
2,TREMBL:A0A1M7Y718,MNNIPYKKCIIIAMLFICAYFLGGGVAMLVQAYGKDSVKEEANWGL...,74a64ff43b3ab606df759f33df2f53ca,PF01522,1.0,6.0,,100134
3,TREMBL:A0A1M7YIY6,MKVNNKNSARKLLSLFLGLVLIFSTLSFSNQAAAADKGTWAPNTTY...,afa9ff2d974a4d6c30a1fee00749bed7,"PF13290,PF13290,PF13287,PF13287,PF00704",1.0,6.0,,100134
4,TREMBL:A0A1M7YIF7,MKRQSRIISFLVAVIMIATVIMPATVVQANASGVFIRVNQVGYKPS...,980e447cfab03b9ff6d8e3ebe33fd8c5,"PF00759,PF02927",1.0,6.0,,100134


In [7]:
# Count the distinct sequences that occur in both the gold and the old table.
print("Intersection of the old and gold sets: ",
      len(set(gold_set['sequence'].unique()) & set(old_set['sequence'].unique())))

Intersection of the old and gold sets:  108571


#### Merge both datasets

In [8]:
# Expose the old pH column under the gold table's column name so that the
# values of both tables end up in a single 'value_cur_mean' column.
old_set['value_cur_mean'] = old_set['mean_growth_PH']

# Stack gold rows first, then old rows, with a fresh 0..n-1 index; the
# original 'mean_growth_PH' column is redundant after the copy above.
merged_set = (
    pd.concat([gold_set, old_set], ignore_index=True)
    .drop(columns=['mean_growth_PH'])
)
print("Number of entries after merging: ", len(merged_set))
merged_set.head()

Number of entries after merging:  363454


Unnamed: 0,key,sequence,sequence_hashkey,pfam_hits,value_cur_count,value_cur_mean,value_cur_std,species_taxid
0,TREMBL:A0A1M7YMF3,MKKRAHIISFILILALLFTGCSGNKENTSKEPVKETTEKGTGNIKT...,4140c1bfe158c54bf8341936919bfbdc,PF00128,1.0,6.0,,100134.0
1,TREMBL:A0A1M7YE49,MGKGKRKKRIALYFKRAAVAMLVMVMLLQPIPGTAGSSVKSVEAAV...,f0234bf7d0ae0e73f36e583b04330e11,"PF00722,PF02018,PF00041",1.0,6.0,,100134.0
2,TREMBL:A0A1M7Y718,MNNIPYKKCIIIAMLFICAYFLGGGVAMLVQAYGKDSVKEEANWGL...,74a64ff43b3ab606df759f33df2f53ca,PF01522,1.0,6.0,,100134.0
3,TREMBL:A0A1M7YIY6,MKVNNKNSARKLLSLFLGLVLIFSTLSFSNQAAAADKGTWAPNTTY...,afa9ff2d974a4d6c30a1fee00749bed7,"PF13290,PF13290,PF13287,PF13287,PF00704",1.0,6.0,,100134.0
4,TREMBL:A0A1M7YIF7,MKRQSRIISFLVAVIMIATVIMPATVVQANASGVFIRVNQVGYKPS...,980e447cfab03b9ff6d8e3ebe33fd8c5,"PF00759,PF02927",1.0,6.0,,100134.0


#### Analyze duplicates and problem sequences

In [9]:
# Group entries by sequence. For every distinct sequence keep
#   * value_cur_count - the count found in the first row of the group, and
#   * value_cur_mean  - the array of distinct pH values reported for it.
# The two columns are aggregated directly instead of running
# groupby(...).apply over whole sub-frames (deprecated in recent pandas when
# the grouping column is part of the sub-frame). The previous
# `len(x) > 1` guard tested the 2-tuple rather than the array and was
# therefore always true; taking the first value is what it actually did.
NEUTRAL_PH = 7  # values strictly below / above this are on opposite sides

by_sequence = merged_set.groupby('sequence')
grouped_set = pd.DataFrame({
    'value_cur_count': by_sequence['value_cur_count'].agg(lambda counts: counts.iloc[0]),
    'value_cur_mean': by_sequence['value_cur_mean'].unique(),
}).reset_index()

# Check if the proteins start with methionine (vectorised string test).
grouped_set['starts_with_Met'] = grouped_set['sequence'].str.startswith('M')

# Check the inconsistencies between the pH values reported for a sequence.
# NOTE: despite its name, 'is_duplicated' is True only when a sequence has
# more than one *distinct* value_cur_mean; repeated rows that agree on the
# value are not flagged.
grouped_set['is_duplicated'] = grouped_set['value_cur_mean'].apply(len) > 1

# Spread between the largest and the smallest distinct pH. For exactly two
# values this equals |a - b|; with three or more values no value is ignored.
# NaN inputs propagate to a NaN delta.
grouped_set['delta'] = grouped_set['value_cur_mean'].apply(
    lambda phs: phs.max() - phs.min() if len(phs) > 1 else np.nan
)

# True when the reported values lie on both sides of NEUTRAL_PH (a single
# value can never satisfy both conditions, so no length check is needed).
grouped_set['transition'] = grouped_set['value_cur_mean'].apply(
    lambda phs: bool((phs < NEUTRAL_PH).any() and (phs > NEUTRAL_PH).any())
)

In [10]:
# Preview the first rows of the per-sequence table built from merged_set.
grouped_set.head()

Unnamed: 0,sequence,value_cur_count,value_cur_mean,starts_with_Met,is_duplicated,delta,transition
0,AAAACIPLLLGSAPLYAQTSAVQQKLAALEKSSGGRLGVALIDTAD...,21.0,"[5.557142857142856, 7.0]",False,True,1.442857,False
1,AAADGGGAGARAPASSTFWFLLHALCCLISLFLGFRFSRLLFFLLF...,1.0,[6.0],False,False,,False
2,AAAFAVVGFCSTASAVTYPLPTDGSRLVGQNQVITIPEGNTQPLEY...,21.0,"[5.557142857142856, 7.0]",False,True,1.442857,False
3,AAAIAVVYLSLLLLLLHGAAPAVLGYTRGDFPEDFVFGSATSSYQY...,2.0,[5.0],False,False,,False
4,AAATSPARAAAWTFAAATCVKLLLVPTYRSTDFDVHRYWLALTHAL...,2.0,[5.0],False,False,,False


In [11]:
# Summary counts over the per-sequence table. Each count is the number of
# True entries in a boolean mask, which equals the number of rows the mask
# would select.
print("Number of sequences without Methionine at the beginning: ",
      (~grouped_set['starts_with_Met']).sum())
print("Number of duplicated sequences with different pHs: ",
      ((grouped_set['is_duplicated'] == True) & (grouped_set['delta'] > 0)).sum())
print("Number of duplicated sequences with |old_pH - new_pH|> 2 : ",
      ((grouped_set['is_duplicated'] == True) & (grouped_set['delta'] > 2)).sum())
print("Number of duplicated sequences with value_cur_count == 1 : ",
      ((grouped_set['is_duplicated'] == True) & (grouped_set['value_cur_count'] == 1)).sum())

Number of sequences without Methionine at the beginning:  2073
Number of duplicated sequences with different pHs:  90528
Number of duplicated sequences with |old_pH - new_pH|> 2 :  17968
Number of duplicated sequences with value_cur_count == 1 :  30296
