In [5]:
import pandas as pd

# 读取txt文件，假设文件名为'genes.txt'
# 使用制表符\t作为分隔符，没有表头
scg_dict = pd.read_csv('./data/scgpt_gene2idx.txt', sep='\t', header=None, names=['gene_name', 'scg_idx'])

# 显示读取的数据
print(scg_dict)

            gene_name  scg_idx
0        RP5-973N23.5    60693
1      RP11-182N22.10    60689
2          CTB-53D8.3    60687
3       RP11-348N17.2    60685
4       RP11-205M20.8    60682
...               ...      ...
60692    RP11-390P2.2    42033
60693       SGSM3-AS1    31087
60694   RP11-812I20.2    42036
60695      AC022154.7      828
60696          RBMXP1    42041

[60697 rows x 2 columns]


In [6]:
import pandas as pd

# 读取整个CSV文件
sl_data = pd.read_csv('./data/SLKB_rawSL.csv')

# 查看前5行
print(sl_data.head())


   Unnamed: 0     gene_pair  study_origin cell_line_origin gene_1  gene_2  \
0           1   AKT1|AMBRA1      33956155             RPE1   AKT1  AMBRA1   
1           2   AKT3|AMBRA1      33956155             RPE1   AKT3  AMBRA1   
2           3   AMBRA1|ARF6      33956155             RPE1   ARF6  AMBRA1   
3           4   AMBRA1|ATF4      33956155             RPE1   ATF4  AMBRA1   
4           5  AMBRA1|ATG10      33956155             RPE1  ATG10  AMBRA1   

  SL_or_not  SL_score  statistical_score  SL_score_cutoff  \
0    Not SL -0.010982                0.0             -1.0   
1    Not SL  2.159344                0.0             -1.0   
2    Not SL -0.564699                0.0             -1.0   
3    Not SL  0.999030                0.0             -1.0   
4    Not SL  3.916281                0.0             -1.0   

   statistical_score_cutoff  
0                       0.0  
1                       0.0  
2                       0.0  
3                       0.0  
4                   

In [9]:
# 将 df 转换为字典，方便快速查找
gene_to_number = dict(zip(scg_dict['gene_name'], scg_dict['scg_idx']))

# 为 gene1 和 gene2 添加对应的数值列
sl_data['gene_1_scg_idx'] = sl_data['gene_1'].map(gene_to_number)
sl_data['gene_2_scg_idx'] = sl_data['gene_2'].map(gene_to_number)

# 显示结果
print(sl_data.head())

   Unnamed: 0     gene_pair  study_origin cell_line_origin gene_1  gene_2  \
0           1   AKT1|AMBRA1      33956155             RPE1   AKT1  AMBRA1   
1           2   AKT3|AMBRA1      33956155             RPE1   AKT3  AMBRA1   
2           3   AMBRA1|ARF6      33956155             RPE1   ARF6  AMBRA1   
3           4   AMBRA1|ATF4      33956155             RPE1   ATF4  AMBRA1   
4           5  AMBRA1|ATG10      33956155             RPE1  ATG10  AMBRA1   

  SL_or_not  SL_score  statistical_score  SL_score_cutoff  \
0    Not SL -0.010982                0.0             -1.0   
1    Not SL  2.159344                0.0             -1.0   
2    Not SL -0.564699                0.0             -1.0   
3    Not SL  0.999030                0.0             -1.0   
4    Not SL  3.916281                0.0             -1.0   

   statistical_score_cutoff  gene_1_scg_idx  gene_2_scg_idx  
0                       0.0          1820.0          1942.0  
1                       0.0          1823.0   

In [12]:
import pandas as pd

# 计算覆盖率的函数
def calculate_coverage_stats(group):
    # 计算gene_1的覆盖率
    gene_1_total = group['gene_1'].nunique()
    gene_1_covered = group[group['gene_1_scg_idx'].notnull()]['gene_1'].nunique()
    gene_1_coverage = gene_1_covered / gene_1_total if gene_1_total > 0 else 0
    
    # 计算gene_2的覆盖率
    gene_2_total = group['gene_2'].nunique()
    gene_2_covered = group[group['gene_2_scg_idx'].notnull()]['gene_2'].nunique()
    gene_2_coverage = gene_2_covered / gene_2_total if gene_2_total > 0 else 0
    
    # 计算基因对覆盖率
    total_pairs = len(group)
    covered_pairs = group[group['gene_1_scg_idx'].notnull() & group['gene_2_scg_idx'].notnull()].shape[0]
    pair_coverage = covered_pairs / total_pairs if total_pairs > 0 else 0
    
    return pd.Series({
        'gene_1_unique_count': gene_1_total,
        'gene_1_covered_count': gene_1_covered,
        'gene_1_coverage': gene_1_coverage,
        'gene_2_unique_count': gene_2_total,
        'gene_2_covered_count': gene_2_covered,
        'gene_2_coverage': gene_2_coverage,
        'total_pairs': total_pairs,
        'covered_pairs': covered_pairs,
        'pair_coverage': pair_coverage
    })

# 按cell_line_origin分组计算覆盖率
coverage_by_cell_line = sl_data.groupby('cell_line_origin').apply(calculate_coverage_stats).reset_index()

# 格式化输出百分比
percentage_cols = ['gene_1_coverage', 'gene_2_coverage', 'pair_coverage']
for col in percentage_cols:
    coverage_by_cell_line[col] = coverage_by_cell_line[col].apply(lambda x: f"{x:.2%}")

# 显示结果
print("按cell_line_origin分组的覆盖率统计:")
print(coverage_by_cell_line)

# 可选：保存结果到CSV
# coverage_by_cell_line.to_csv('coverage_by_cell_line.csv', index=False)\
# 选择要显示的列
cols_to_show = ['cell_line_origin'] + percentage_cols
print(coverage_by_cell_line[cols_to_show])

按cell_line_origin分组的覆盖率统计:
   cell_line_origin  gene_1_unique_count  gene_1_covered_count  \
0             22RV1                 49.0                  49.0   
1              293T                 72.0                  72.0   
2              786O                 24.0                  24.0   
3              A375                923.0                 902.0   
4              A549               2160.0                2136.0   
5               GI1               2092.0                2068.0   
6              HELA               1140.0                1126.0   
7            HS936T               2092.0                2068.0   
8            HS944T               2092.0                2068.0   
9              HSC5               2092.0                2068.0   
10             HT29                 24.0                  24.0   
11           IPC298               2092.0                2068.0   
12           JURKAT                387.0                 358.0   
13             K562                644.0         

  coverage_by_cell_line = sl_data.groupby('cell_line_origin').apply(calculate_coverage_stats).reset_index()
