In [None]:
# NOTE(review): the star import presumably supplies the names used unqualified
# in the cells below (e.g. `pd`, `np`, `os`, `tqdm`, `itertools`, `root`,
# `gid2uid`, `name2id` and the helper functions/classes) -- none of them is
# imported explicitly in the visible cells; confirm against utils.py.
from utils import *
import warnings
# Silences every warning category for the rest of the session, so library
# deprecation warnings triggered by later cells are not shown.
warnings.filterwarnings('ignore')

In [6]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('/sharedata/home/daihzh/protein/ESM4SL/data/SLKB/all_id_seq.pkl', 'rb') as f:
    all_id_seq = pickle.load(f)

# Collect the distinct characters that occur in any sequence. Each item of
# all_id_seq is unpacked as a pair whose second element is the sequence.
# Feeding the set directly avoids first building one list that holds every
# character of every sequence.
combined_seq = set()
for _, seq in all_id_seq:
    combined_seq.update(seq)

print(len(combined_seq), combined_seq)

21 {'R', 'H', 'V', 'P', 'U', 'D', 'Y', 'Q', 'F', 'N', 'W', 'E', 'T', 'S', 'L', 'G', 'K', 'C', 'M', 'I', 'A'}


**UniProt Search**

In [17]:
# Write every key of gid2uid, stringified, as one comma-separated line.
# NOTE(review): presumably this text file is the query submitted to the
# UniProt ID-mapping service -- confirm.
entrez_ids = list(map(str, gid2uid.keys()))
with open(f'{root}/mapping/gid_text.txt', 'w') as f:
    f.write(','.join(entrez_ids))

In [27]:
# Load the raw tab-separated mapping and keep only the first row for each
# value of the 'From' column, then persist the de-duplicated table as CSV.
gid2protein_raw = pd.read_csv(f'{root}/mapping/gid2protein_raw.tsv', sep='\t')
is_repeated_gid = gid2protein_raw.duplicated(subset='From', keep='first')
gid2protein = gid2protein_raw[~is_repeated_gid]
gid2protein.to_csv(f'{root}/mapping/gid2protein.csv', index=False)

In [2]:
# Reload the de-duplicated mapping and keep the distinct values of its 'From'
# column; later cells use this set to filter gene IDs.
gid2protein = pd.read_csv(f'{root}/mapping/gid2protein.csv')
protein_coding_gids = {gid for gid in gid2protein['From']}

**Cell-line Embeddings**

In [9]:
import re

cell_path = '/home/qingyuyang/SynergyX/data/0_cell_data/4079g/985_cellGraphs_exp_mut_cn_eff_dep_met_4079_genes_norm.npy'
# .item() extracts the single Python object stored in the loaded array; it is
# indexed by DepMap ID below.
cell_embedding = np.load(cell_path, allow_pickle=True).item()

cell_info = pd.read_csv('/home/qingyuyang/SynergyX/data/raw_data/cell_info.csv')
depmap_id = cell_info['depmap_id'].tolist()
# Normalise cell-line names: strip spaces and hyphens, then upper-case.
cell_line_names = [
    re.sub(r'[ -]', '', raw_name).upper()
    for raw_name in cell_info['cell_line_name']
]
# If two names normalise to the same string, the later DepMap ID wins.
clname2id = dict(zip(cell_line_names, depmap_id))

# Re-key the embeddings by normalised cell-line name and persist the dict.
clname2embed = {
    cl_name: cell_embedding[cl_id]
    for cl_name, cl_id in clname2id.items()
}
np.save(f'{root}/mapping/clname2embed.npy', clname2embed, allow_pickle=True)

In [3]:
# Reload the saved name -> embedding dict; .item() unwraps it from the array
# that np.load returns.
clname2embed_fp = f'{root}/mapping/clname2embed.npy'
clname2embed = np.load(clname2embed_fp, allow_pickle=True).item()

**Preprocess SLKB**

In [4]:
sl_table_fp = os.path.join(root, 'SLKB/SLKB_original_scores.csv')

# Keep the four columns of interest and encode the textual label as 0/1
# ('Not SL' -> 0, 'SL' -> 1); any other value is left untouched by replace.
df = (
    pd.read_csv(sl_table_fp)[['cell_line_origin', 'gene_1', 'gene_2', 'SL_or_not']]
    .replace({"SL_or_not": {'Not SL': 0, 'SL': 1}})
)
df.columns = ['cell_line', 'g1', 'g2', 'label']

# Report the number of rows and the number of distinct genes on either side.
raw_gene_names = set(df['g1']).union(df['g2'])
print(len(df), len(raw_gene_names) )
df.head()

280483 6125


Unnamed: 0,cell_line,g1,g2,label
0,RPE1,AKT1,AMBRA1,0
1,RPE1,AKT3,AMBRA1,0
2,RPE1,ARF6,AMBRA1,0
3,RPE1,ATF4,AMBRA1,0
4,RPE1,ATG10,AMBRA1,0


In [5]:
# Apply the two project helpers in sequence (format_df first, then
# df_gene2id). The printed output shows the columns are named '0'..'3'
# afterwards, with '0' and '1' holding the two genes.
df = df_gene2id(format_df(df))

mapped_genes = set(df['0']).union(df['1'])
print(len(df), len(mapped_genes))
df.head()

253811 5906


Unnamed: 0,0,1,2,3
0,207,55626,0,RPE1
1,10000,55626,0,RPE1
2,382,55626,0,RPE1
3,468,55626,0,RPE1
4,83734,55626,0,RPE1


In [6]:
def _count_items_and_genes(frame):
    """Return (number of rows, number of distinct values across columns '0' and '1')."""
    return len(frame), len(set(frame['0']) | set(frame['1']))


shared_gids = set(protein_coding_gids) & set(gid2uid.keys())
print("Gene num after alignment", len(shared_gids))

# 1st alignment: both genes of a pair must be keys of gid2uid.
both_in_gid2uid = df['0'].isin(gid2uid.keys()) & df['1'].isin(gid2uid.keys())
df = df[both_in_gid2uid]
n_items, n_genes = _count_items_and_genes(df)
print("After 1st alignment, SLKB has", n_items, "items and", n_genes, "unique genes")

# 2nd alignment: both genes must also be in protein_coding_gids.
both_protein_coding = df['0'].isin(protein_coding_gids) & df['1'].isin(protein_coding_gids)
df = df[both_protein_coding]
n_items, n_genes = _count_items_and_genes(df)
print("After 2nd alignment, SLKB has", n_items, "items and", n_genes, "unique genes")

df = remove_rows_with_condition(df)  # NOTE: Remove those with contradicting labels!
n_items, n_genes = _count_items_and_genes(df)
print("After all alignment, SLKB has", n_items, "items and", n_genes, "unique genes")

Gene num after alignment 9807
After 1st alignment, SLKB has 211688 items and 3024 unique genes
After 2nd alignment, SLKB has 211640 items and 3016 unique genes
After all alignment, SLKB has 211602 items and 3016 unique genes


In [7]:
# Persist the aligned table so later sessions can start from the reload cell.
slkb_processed_fp = f'{root}/SLKB/SLKB_processed.csv'
df.to_csv(slkb_processed_fp, index=False)

In [2]:
# Reload the processed table written by the previous cell.
slkb_processed_fp = f'{root}/SLKB/SLKB_processed.csv'
df = pd.read_csv(slkb_processed_fp)

**Count Statistics**

In [27]:
# Output locations under the SLKB folder: one directory for the per-cell-line
# tables and one for the gene-frequency figures.
root_path = f'{root}/SLKB'
table_save_path = f'{root_path}/cell_line/table'
fig_save_path = f'{root_path}/cell_line/gene_count'

for out_dir in (table_save_path, fig_save_path):
    os.makedirs(out_dir, exist_ok=True)

In [8]:
# Distinct values of column '3' (the cell-line column in the table printed
# above). Sorted so the order is reproducible: the iteration order of a set of
# strings changes between interpreter sessions, and later cells depend on row
# positions derived from this list. key=str keeps the sort well-defined even if
# a non-string value is present.
cell_lines = sorted(set(df['3']), key=str)
print('In total', len(cell_lines), 'cell lines')

In total 22 cell lines


In [9]:
column_names = ['cell_line', '# pairs', '# pos', '# neg', 'n/p ratio', '# genes', '# unique genes', 'cell_embed']

# Collect one record per cell line and build the frame once at the end.
# Writing mixed-type rows into a frame pre-filled with integer zeros relies on
# silent dtype upcasting, which pandas has deprecated.
stat_rows = []
for cl in cell_lines:
    path = f'{table_save_path}/{cl}.csv'

    # Extract and save the rows belonging to this cell line.
    cl_df = select_cl_from_slkb(df, cl)
    # cl_df = remove_rows_with_condition(cl_df)
    cl_df.to_csv(path, index=False)

    total, pos, neg, ratio = count_pn_ratio(cl_df)
    _gene_list, gene_set, gene_count = count_gene_freq(cl_df)
    # Number of entries in gene_count whose value is exactly 1.
    # NOTE(review): presumably gene_count maps gene -> occurrence count, so
    # this is the number of genes seen in only one pair -- confirm in utils.
    unique_gene_num = list(gene_count.values()).count(1)

    # 1 if an embedding exists for this cell-line name, else 0.
    if_cell_embed = int(cl in clname2embed)
    stat_rows.append([cl, total, pos, neg, ratio, len(gene_set), unique_gene_num, if_cell_embed])

    visualize_gene_freq(gene_count, save_path=fig_save_path, title=cl)

stat_df = pd.DataFrame(stat_rows, columns=column_names)
stat_df = stat_df.sort_values(by=["# unique genes", "# genes"], ascending=False, ignore_index=True)
stat_df.to_csv(f'{table_save_path}/overall_statistics.csv', index=False)
stat_df

Unnamed: 0,cell_line,# pairs,# pos,# neg,n/p ratio,# genes,# unique genes,cell_embed
0,MEWO,2538,279,2259,8.0968,2139,1138,1
1,IPC298,1952,315,1637,5.1968,1587,779,1
2,PK1,1952,372,1580,4.2473,1587,779,0
3,MEL202,1952,372,1580,4.2473,1587,779,0
4,GI1,1952,238,1714,7.2017,1587,779,0
5,HS936T,1952,338,1614,4.7751,1587,779,0
6,PATU8988S,1952,331,1621,4.8973,1587,779,0
7,HSC5,1952,370,1582,4.2757,1587,779,0
8,HS944T,1952,278,1674,6.0216,1587,779,0
9,MELJUSO,2230,279,1951,6.9928,1593,776,0


From the table above, we observe that:
1. HT29 and 786O are unsuitable for SL prediction because they have too few positive samples;
2. The cell lines in rows 15-19 have no 'Tail' scene;
3. The cell lines in rows 1-8 share the same genes, which makes them suitable for transfer scenes;
4. PC9's genes are all unique, so its C1, C2, and Tail scenes are equivalent.

**CCLE Filtering (for MVGCN-iSL) (No need to run)**

In [3]:
# Invert name2id (name -> id) into id -> name; if several names share an id,
# the last one in iteration order wins.
gid2name = dict(zip(name2id.values(), name2id.keys()))

In [None]:
cl_feat_path = '/home/qingyuyang/MVGCNiSL/data/cellline_feats'


def _read_first_column(path):
    """Return the first whitespace-separated token of every non-blank line in `path`."""
    # The context manager closes the file handle as soon as reading is done;
    # blank lines are skipped because they have no first token.
    with open(path, 'r') as handle:
        return [line.split()[0] for line in handle if line.strip()]


for cl in ['A549', 'A375', 'IPC298', '22RV1', 'OVCAR8']:  #'K562', 'JURKAT', 
    # A gene is valid for this cell line only if it is listed in all four
    # feature files (cnv, ess, exp, mut).
    genes_per_feature = [
        set(_read_first_column(f'{cl_feat_path}/{cl}_{feat}.txt'))
        for feat in ('cnv', 'ess', 'exp', 'mut')
    ]
    valid_gene_names = set.intersection(*genes_per_feature)
    # Convert names to integer IDs; raises KeyError for a name missing from name2id.
    all_valid_genes = {int(name2id[gene_name]) for gene_name in valid_gene_names}
    print(cl, len(all_valid_genes))

    sl_table = pd.read_csv(f'/home/qingyuyang/ESM4SL/data/SLKB/cell_line/table/{cl}.csv')
    sl_genes = set(sl_table['0']) | set(sl_table['1'])
    # Genes used by the SL table that lack at least one of the four features.
    invalid_sl_genes = sl_genes.difference(all_valid_genes)
    print(cl, len(sl_genes), len(invalid_sl_genes))

    # Optional in-place filtering of the SL table (left disabled):
    # if len(invalid_sl_genes) > 0:
    #     sl_table_new = sl_table[sl_table['0'].isin(all_valid_genes) & sl_table['1'].isin(all_valid_genes)]
    #     sl_table_new.to_csv(f'/home/qingyuyang/ESM4SL/data/SLKB/cell_line/table/{cl}.csv', index=False)

**Cut C1/C2/Tail scenes**

In [4]:
# Root directory for the per-cell-line C1/C2/Tail splits.
specific_save_path = f'{root_path}/specific'
os.makedirs(name=specific_save_path, exist_ok=True)

In [28]:
stat_df = pd.read_csv(f'{table_save_path}/overall_statistics.csv')
# Positional slice: every cell line except the last two rows of the saved
# statistics table.
# NOTE(review): presumably those two rows are the cell lines judged unsuitable
# in the notes above -- confirm that the table's sort order still places them last.
new_cell_lines = stat_df['cell_line'].iloc[:-2]

for cl in tqdm(new_cell_lines):
    cl_table_fp = f'{table_save_path}/{cl}.csv'
    cl_df = pd.read_csv(cl_table_fp)

    # Write this cell line's splits under its own sub-directory.
    split_instance = SpecificSplit(cl_df, f'{specific_save_path}/{cl}')
    split_instance.cell_line_specific_all()

 80%|████████  | 16/20 [00:46<00:31,  7.87s/it]

This cell line is unsuitable for long-tail scene!


 85%|████████▌ | 17/20 [00:58<00:27,  9.26s/it]

This cell line is unsuitable for long-tail scene!


 90%|█████████ | 18/20 [00:59<00:13,  6.69s/it]

This cell line is unsuitable for long-tail scene!


 95%|█████████▌| 19/20 [00:59<00:04,  4.75s/it]

This cell line is unsuitable for long-tail scene!


100%|██████████| 20/20 [00:59<00:00,  2.98s/it]

This cell line is unsuitable for long-tail scene!





In [29]:
# Summarise the generated splits; the trailing bare name displays the result.
specific_stats = count_specific_statistics(specific_save_path)
specific_stats

60it [00:24,  2.50it/s]


Unnamed: 0,cell_line,scene,fold,set,# pairs,# pos,# neg,n/p ratio,# genes,# unique genes
0,MEWO,C1,0,0,1624,179,1445,8.0726,1643,1005
1,MEWO,C1,0,1,406,44,362,8.2273,632,508
2,MEWO,C1,0,2,508,56,452,8.0714,767,611
3,MEWO,C1,1,0,1624,179,1445,8.0726,1662,1025
4,MEWO,C1,1,1,406,44,362,8.2273,633,511
...,...,...,...,...,...,...,...,...,...,...
820,22RV1,C2,3,1,196,54,142,2.6296,35,0
821,22RV1,C2,3,2,252,55,197,3.5818,37,0
822,22RV1,C2,4,0,378,122,256,2.0984,28,0
823,22RV1,C2,4,1,224,43,181,4.2093,36,0


**Form Transfer scenes**

In [20]:
def count_SL_intersection(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[int, int]:
    """Count the rows two SL tables have in common.

    Args:
        df1: Table with columns '0', '1' (the gene pair) and '2' (the label).
        df2: Table with the same columns.

    Returns:
        A pair of row counts ``(n_pairs, n_pairs_and_labels)``: the size of the
        inner join on columns '0' and '1', and the size of the inner join on
        columns '0', '1' and '2'.

    Note:
        The join is on exact column values, so a pair stored as (a, b) in one
        table and (b, a) in the other is not counted, and duplicated keys are
        counted once per matching combination.
    """
    merged_pair = pd.merge(df1, df2, on=['0', '1'])
    merged_pair_label = pd.merge(df1, df2, on=['0', '1', '2'])
    return len(merged_pair), len(merged_pair_label)

column_names = ['source', 'target', 'source len', 'target len', '# common SL pairs', '# common SL pairs & labels']
cell_names = ['A549', 'PK1', 'PC9', '22RV1', 'MEWO'] #'MELJUSO', 
combinations = list(itertools.combinations(cell_names, 2))

# Read each cell-line table once instead of once per combination.
cell_tables = {name: pd.read_csv(f'{table_save_path}/{name}.csv') for name in cell_names}

# Collect one record per (source, target) combination and build the frame in a
# single step rather than appending rows to a DataFrame one at a time.
overlap_rows = []
for source, target in tqdm(combinations):
    source_df = cell_tables[source]
    target_df = cell_tables[target]
    num1, num2 = count_SL_intersection(source_df, target_df)
    overlap_rows.append([source, target, len(source_df), len(target_df), num1, num2])

datasets_stat = pd.DataFrame(overlap_rows, columns=column_names)
datasets_stat

100%|██████████| 10/10 [00:00<00:00, 120.44it/s]


Unnamed: 0,source,target,source len,target len,# common SL pairs,# common SL pairs & labels
0,A549,PK1,5769,1952,1942,1592
1,A549,PC9,5769,358,51,38
2,A549,22RV1,5769,946,49,37
3,A549,MEWO,5769,2538,1955,1599
4,PK1,PC9,1952,358,51,38
5,PK1,22RV1,1952,946,3,1
6,PK1,MEWO,1952,2538,1947,1536
7,PC9,22RV1,358,946,0,0
8,PC9,MEWO,358,2538,83,65
9,22RV1,MEWO,946,2538,5,4


In [24]:
transfer_save_path = f'{root_path}/transfer'
os.makedirs(name=transfer_save_path, exist_ok=True)

cell_names = ['A549', 'PK1', 'PC9', '22RV1', 'MEWO']
# Disabled: enumeration of ordered (source, target) pairs.
# combinations = list(itertools.combinations(cell_names, 2))
# new_comb = [(i[1], i[0]) for i in combinations]
# whole_comb = combinations + new_comb

# Build the transfer split for each source cell line in its own sub-directory.
for source in tqdm(cell_names):
    source_table_fp = f'{table_save_path}/{source}.csv'
    source_df = pd.read_csv(source_table_fp)
    onetoone = TransferSplit(source_df, f'{transfer_save_path}/{source}')
    onetoone.Transfer()

100%|██████████| 5/5 [00:00<00:00, 17.65it/s]


**SLGNNCT: convert all gene IDs back to gene names to match its KG database**

In [26]:
# Invert name2id into an int-keyed id -> name mapping; if several names share
# an id, the last one in iteration order wins.
id2name = dict(zip(map(int, name2id.values()), name2id.keys()))

In [None]:
slgnnct_specific_save_path = f'{root_path}/slgnnct/specific'
os.makedirs(name=slgnnct_specific_save_path, exist_ok=True)

# Mirror every existing <cell line>/<scene> split folder, replacing the IDs in
# columns '0' and '1' with names looked up in id2name.
for cl in cell_lines:
    for scene in ['C1', 'C2', 'Tail']:
        raw_path = f'{specific_save_path}/{cl}/{scene}'
        if not os.path.exists(raw_path):
            # This scene was not generated for this cell line.
            continue

        save_path = f'{slgnnct_specific_save_path}/{cl}/{scene}'
        os.makedirs(save_path, exist_ok=True)

        for fn in os.listdir(raw_path):
            sl = pd.read_csv(f'{raw_path}/{fn}')
            old_len = len(sl)
            for gene_col in ('0', '1'):
                sl[gene_col] = sl[gene_col].map(id2name)
            # IDs missing from id2name map to NaN; dropping those rows must
            # not change the row count, otherwise the mapping is incomplete.
            sl = sl.dropna()
            new_len = len(sl)
            assert old_len == new_len
            sl.to_csv(f'{save_path}/{fn}', index=False)

In [28]:
slgnnct_transfer_save_path = f'{root_path}/slgnnct/transfer'
os.makedirs(slgnnct_transfer_save_path, exist_ok=True)

cell_names = ['A549', 'PK1', 'PC9', '22RV1', 'MEWO']


def _save_with_gene_names(src_csv, dst_csv):
    """Copy the table at `src_csv` to `dst_csv` with columns '0' and '1' mapped through `id2name`.

    Raises:
        AssertionError: if dropping rows that contain NaN changes the row
            count, i.e. an ID had no entry in `id2name` or the source table
            already contained a missing value.
    """
    sl = pd.read_csv(src_csv)
    old_len = len(sl)
    for gene_col in ('0', '1'):
        sl[gene_col] = sl[gene_col].map(id2name)
    sl = sl.dropna()
    new_len = len(sl)
    assert old_len == new_len, (
        f'{src_csv}: {old_len - new_len} of {old_len} rows contain NaN after mapping IDs to names'
    )
    sl.to_csv(dst_csv, index=False)


# Convert every file of each cell line's transfer split.
for cl in tqdm(cell_names):
    save_path = f'{slgnnct_transfer_save_path}/{cl}'
    os.makedirs(save_path, exist_ok=True)

    raw_path = f'{transfer_save_path}/{cl}'
    for fn in os.listdir(raw_path):
        _save_with_gene_names(f'{raw_path}/{fn}', f'{save_path}/{fn}')

# Convert each cell line's full table as well.
for cl in tqdm(cell_names):
    _save_with_gene_names(f'{table_save_path}/{cl}.csv', f'{slgnnct_transfer_save_path}/{cl}.csv')

100%|██████████| 5/5 [00:01<00:00,  4.69it/s]
100%|██████████| 5/5 [00:00<00:00, 53.92it/s]
