In [None]:
!pip install pandas
!pip install dask

In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import os
# Create a local cluster
# Get the number of physical cores
n_cores = os.cpu_count()
print(n_cores)
# Create a local cluster with one worker per core
cluster = LocalCluster(n_workers=int(n_cores/5), threads_per_worker=1, memory_limit='80GB')

# Connect to the cluster
client = Client(cluster)

# Your code here

import os
import csv

Count_GO = True
E_filter = False
root_terms = set(['GO:0008150', 'GO:0003674', 'GO:0005575'])

128


## read-in result of MMseqs search

In [3]:
# os.chdir('./preprocessing')
#(1,2) identifiers for query and target sequences/profiles, (3) sequence identity, 
#(4) alignment length, (5) number of mismatches, (6) number of gap openings, 
#(7-8, 9-10) domain start and end-position in query and in target, (11) E-value, and (12) bit score.
alnRes = dd.read_csv('./data/alnRes.m8',sep='\t',header=None)
alnRes.columns = ['EntryID', 'cluster', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'Evalue', 'bitscore']

alnRes.head(5)

Unnamed: 0,EntryID,cluster,col3,col4,col5,col6,col7,col8,col9,col10,Evalue,bitscore
0,Q6ENB0,7f9o_Q,0.931,735,50,0,1,730,43,777,0.0,1354
1,Q6ENB0,7eu3_F,0.931,735,50,0,1,730,43,777,0.0,1354
2,Q6ENB0,7wg5_F,0.685,735,229,0,1,729,1,735,1.667e-320,990
3,Q6ENB0,7wff_F,0.685,735,229,0,1,729,1,735,1.667e-320,990
4,Q6ENB0,6khi_F,0.442,717,355,0,1,717,1,638,3.163e-173,563


In [3]:
alnRes.compute().describe([.30])

Unnamed: 0,col3,col4,col5,col6,col7,col8,col9,col10,Evalue,bitscore
count,13781890.0,13781890.0,13781890.0,13781891.0,13781890.0,13781890.0,13781890.0,13781890.0,13781890.0,13781890.0
mean,0.4587226,283.9119,148.2073,0.0,185.7246,464.4306,52.42068,328.2708,1.286453e-05,229.3885
std,0.2129429,249.9623,146.758,0.0,547.1455,592.8421,174.3324,305.9957,7.926572e-05,292.9405
min,0.165,13.0,0.0,0.0,1.0,15.0,1.0,15.0,0.0,36.0
30%,0.319,160.0,68.0,0.0,17.0,248.0,4.0,182.0,6.822e-65,90.0
50%,0.383,241.0,124.0,0.0,46.0,356.0,11.0,275.0,6.366e-36,146.0
max,1.0,5332.0,3559.0,0.0,35277.0,35375.0,4865.0,5148.0,0.0009999,10468.0


In [4]:
alnRes_clu = alnRes.cluster.unique().compute().size
print("add clusters:",alnRes_clu)
alnRes_entry = alnRes.EntryID.unique().compute().size
print("Entry:",alnRes_entry)

add clusters: 570450
Entry: 112542


## read-in kaggle train_terms for expanding

In [5]:
train_terms = dd.read_csv('./data/train_terms.tsv',sep='\t')
print('add GO terms',train_terms.term.unique().compute().size)

add GO terms 31466


In [6]:
GOBPO = list(np.load('./data/train_targets_BPO_top21285.npy',allow_pickle=True))
GOCCO = list(np.load('./data/train_targets_CCO_top2957.npy',allow_pickle=True))
GOMFO = list(np.load('./data/train_targets_MFO_top7224.npy',allow_pickle=True))
GOs = GOBPO + GOCCO + GOMFO
len(GOs)

31466

In [7]:
train_terms = train_terms[train_terms['term'].isin(GOs)]

## Apply filters

In [8]:
alnRes = alnRes[alnRes['Evalue']<1e-6]

In [4]:
if Count_GO:
    try:
        clu_go_count = dd.read_csv('./data/clu_go_count_all.csv')
        clu_go_count = clu_go_count.compute()
    except:
        merged_clu = dd.merge(alnRes, train_terms, on='EntryID',how='left')
        clu_go_count = merged_clu.groupby(['cluster', 'term']).size().reset_index().rename(columns={0: 'freq'})
        clu_go_count = clu_go_count.compute()
        clu_go_count.to_csv('clu_go_count_all.csv')

In [7]:
if Count_GO:
    clu_annot = clu_go_count.cluster.unique().size
    print('add clusters:',clu_annot)
    term_annot = clu_go_count.term.unique().size
    print('add terms:', term_annot)

add clusters: 560066
add terms: 30453


In [5]:
if Count_GO:
    print('del clusters:',clu_go_count[clu_go_count.freq<=1].cluster.unique().size)
    print('del clusters:',clu_go_count[clu_go_count.freq<=2].cluster.unique().size)
    print('add terms:',clu_go_count[clu_go_count.freq>2].term.unique().size)
    print('add terms:',clu_go_count[clu_go_count.freq>5].term.unique().size)
    
    print(clu_go_count[(clu_go_count.freq>1)&(~clu_go_count.term.isin(root_terms))].describe())
    print(clu_go_count[(clu_go_count.freq>2)&(~clu_go_count.term.isin(root_terms))].describe())

del clusters: 549338
del clusters: 557660
add terms: 16376
add terms: 9676
         Unnamed: 0          freq
count  6.718676e+07  6.718676e+07
mean   5.013073e+07  6.755435e+00
std    3.372147e+07  1.221350e+01
min    6.000000e+00  2.000000e+00
25%    2.159522e+07  2.000000e+00
50%    4.526052e+07  3.000000e+00
75%    7.485525e+07  7.000000e+00
max    1.396529e+08  7.680000e+02
         Unnamed: 0          freq
count  4.398507e+07  4.398507e+07
mean   4.036014e+07  9.263881e+00
std    2.884000e+07  1.447874e+01
min    6.000000e+00  3.000000e+00
25%    1.672544e+07  3.000000e+00
50%    3.488898e+07  5.000000e+00
75%    5.965619e+07  9.000000e+00
max    1.396448e+08  7.680000e+02


In [None]:
if E_filter:    
    # Group the data by the id column
    result = alnRes[alnRes['Evalue']<1e-6]
    grouped = result.groupby('cluster')

    # Define a function to get the two rows with the smallest values in column 10 for each group
    def get_smallest_two(group):
        return group.nsmallest(5, 'Evalue')
    print('filter')
    # Apply the function to each group

    result = grouped.apply(get_smallest_two, meta=result)

    # Reset the index
    result = result.compute()

    result = result.reset_index(drop=True)
    # Display the result
    # print(result)
    result.describe([.30])

In [7]:
if E_filter:
    result.to_csv('./data/alnRes_5byCLU.csv')

In [None]:
if E_filter:
    filter_clu = result.col1.unique().size
    print("add clusters:",filter_clu)
    print('del clusters:',alnRes_clu-filter_clu)
    filter_entry = result.EntryID.unique().size
    print("Entry:",filter_entry)
    print('del Entry:',alnRes_entry-filter_entry)

add clusters: 560066
del clusters: 10384
Entry: 70669
del Entry: 41873


In [15]:
if E_filter:  
    result2 = alnRes[alnRes['Evalue']<=1e-60]
    result2.compute().describe([.30])



Unnamed: 0,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11
count,4429860.0,4429860.0,4429860.0,4429860.0,4429860.0,4429860.0,4429860.0,4429860.0,4429860.0,4429860.0
mean,0.6287138,430.3339,176.2776,0.0,119.698,544.6187,30.89267,452.8418,6.994112e-63,480.3307
std,0.22964,353.1053,214.3038,0.0,399.7497,533.5888,140.6221,386.2059,5.8698170000000005e-62,410.9411
min,0.191,84.0,0.0,0.0,1.0,84.0,1.0,84.0,0.0,205.0
30%,0.455,278.0,59.0,0.0,4.0,329.0,2.0,288.0,1.7520000000000001e-153,289.0
50%,0.58,351.0,134.0,0.0,22.0,428.0,5.0,363.0,6.183e-109,365.0
max,1.0,5332.0,3559.0,0.0,33147.0,33567.0,4714.0,5148.0,1.0000000000000001e-60,10468.0


In [6]:
# result = pd.read_csv('./data/alnRes_2byCLU.csv',sep='\t')
# result.describe()

In [7]:
if E_filter:  
    fluster_clu = result2.col1.unique().compute().size
    print("fluster_clus:",fluster_clu)
    fluster_entry = result2.EntryID.unique().compute().size
    print("fluster_Entry:",fluster_entry)

fluster_clus: 405560
fluster_Entry: 70827


In [8]:
if E_filter:  
    print('del clusters:',alnRes_clu-fluster_clu)
    print('del Entry:',alnRes_entry-fluster_entry)

del clusters: 164890
del Entry: 41715


In [7]:
if E_filter:  
    train_chain_go = pd.merge(result[['EntryID','cluster']], train_terms[['EntryID', 'term']],how='left')
if Count_GO:
    train_chain_go = clu_go_count[clu_go_count.freq>2][['cluster', 'term']]

In [8]:
old_data = pd.read_csv('./data/pdb_chain_go.tsv',sep='\t',skiprows=1)
old_data.head(5)

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,WITH_STRING,EVIDENCE,GO_ID
0,101m,A,P02185,UniProtKB-KW:KW-0479,IEA,GO:0046872
1,101m,A,P02185,UniProtKB-KW:KW-0561,IEA,GO:0005344
2,101m,A,P02185,UniProtKB-KW:KW-0561,IEA,GO:0015671
3,102m,A,P02185,UniProtKB-KW:KW-0479,IEA,GO:0046872
4,102m,A,P02185,UniProtKB-KW:KW-0561,IEA,GO:0005344


In [12]:
_ = old_data.PDB + old_data.CHAIN
print('original clus:',_.unique().size)

original clus: 418330


In [13]:
print('add clus from fluster_clu:' ,len(set(result.col1.unique().compute()).difference(_.unique())))
print('more clus from alnRes:' ,len(set(alnRes.col1.unique().compute()).difference(_.unique())))

AttributeError: 'DataFrame' object has no attribute 'unique'

In [15]:
new_data = pd.DataFrame({
    'PDB': train_chain_go['cluster'].str.split('_').str[0],
    'CHAIN': train_chain_go['cluster'].str.split('_').str[1],
    'SP_PRIMARY': 'Virtul' if Count_GO else train_chain_go['EntryID'],
    'WITH_STRING': 'Kaggle',
    'EVIDENCE': 'train',
    'GO_ID': train_chain_go['term']
})

# Write the new DataFrame to a TSV file
new_data.head(5)

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,WITH_STRING,EVIDENCE,GO_ID
0,11gs,B,V9HWE9,Kaggle,train,GO:0005829
1,11gs,B,V9HWE9,Kaggle,train,GO:0005622
2,11gs,B,V9HWE9,Kaggle,train,GO:0043229
3,11gs,B,V9HWE9,Kaggle,train,GO:0043226
4,11gs,B,V9HWE9,Kaggle,train,GO:0110165


In [17]:
pdb_chain_go_new = pd.concat([old_data,new_data], axis=0)
pdb_chain_go_new.to_csv('./data/pdb_chain_go_new_5byclu.tsv',index=False,sep='\t')