In [1]:
import pandas as pd

In [2]:
rna_featurecounts_df = pd.read_csv('./7_featurecounts_rna/gene_exon_counts.txt', sep='\t', comment='#')

In [3]:
rna_featurecounts_df

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,./4_hisat2/MAPQ20/BE2-shMETTL14-1-IP.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-1-input.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-2-IP.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-2-input.bam,./4_hisat2/MAPQ20/BE2-shNC-1-IP.bam,./4_hisat2/MAPQ20/BE2-shNC-1-input.bam,./4_hisat2/MAPQ20/BE2-shNC-2-IP.bam,./4_hisat2/MAPQ20/BE2-shNC-2-input.bam
0,ENSG00000279928,1;1;1;1;1,182696;183132;183494;183740;183981,182746;183216;183571;183901;184174,+;+;+;+;+,570,6,2,2,1,0,0,0,1
1,ENSG00000228037,1;1;1,2581560;2583370;2584125,2581650;2583495;2584533,+;+;+,626,2,0,0,0,0,0,1,2
2,ENSG00000142611,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;...,3069168;3069183;3069197;3069203;3069211;318612...,3069296;3069296;3069296;3069296;3069296;318647...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;...,9767,0,0,0,2,2,1,1,0
3,ENSG00000284616,1;1;1;1,5301928;5303328;5304401;5306942,5302004;5303393;5305029;5307394,-;-;-;-,1225,0,0,0,0,0,0,0,0
4,ENSG00000157911,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;...,2403964;2403974;2404061;2404613;2404838;240503...,2405834;2405834;2404371;2404891;2405828;240583...,-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;...,3782,120,253,108,264,141,246,99,225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63182,ENSG00000198695,MT,14149,14673,-,525,46645,22316,52088,23902,85246,20887,52840,21499
63183,ENSG00000210194,MT,14674,14742,-,69,284,229,334,209,364,170,312,133
63184,ENSG00000198727,MT,14747,15887,+,1141,8487,15906,9493,15818,11670,11431,8937,11647
63185,ENSG00000210195,MT,15888,15953,+,66,3,5,11,4,5,5,4,9


In [4]:
rna_featurecounts_df["Geneid"].nunique()

63187

In [5]:
# 提取counts数据和基因长度
gene_lengths = rna_featurecounts_df['Length']
counts = rna_featurecounts_df.iloc[:, 6:]  # 从第七列开始是样本的counts数据

# 筛选包含 "input.bam" 的列
input_counts = counts.loc[:, counts.columns.str.contains('input.bam')]

In [6]:
# 计算每个样本的总mapped reads
total_counts = input_counts.sum(axis=0)

# 计算FPKM
fpkm = input_counts.div(gene_lengths, axis=0)
fpkm = fpkm.div(total_counts / 1e6, axis=1)

In [7]:
# 将 FPKM 数据框的列名加上前缀 "FPKM_"
fpkm.columns = ['FPKM_' + col.split('/')[-1] for col in fpkm.columns]

In [8]:
fpkm

Unnamed: 0,FPKM_BE2-shMETTL14-1-input.bam,FPKM_BE2-shMETTL14-2-input.bam,FPKM_BE2-shNC-1-input.bam,FPKM_BE2-shNC-2-input.bam
0,0.002050,0.000993,0.000000,0.001170
1,0.000000,0.000000,0.000000,0.002131
2,0.000000,0.000116,0.000070,0.000000
3,0.000000,0.000000,0.000000,0.000000
4,0.039084,0.039496,0.044375,0.039680
...,...,...,...,...
63182,24.834405,25.760212,27.141790,27.312991
63183,1.939024,1.713846,1.680821,1.285621
63184,8.144643,7.844051,6.834713,6.808301
63185,0.044261,0.034292,0.051683,0.090951


In [9]:
# 合并 FPKM 数据到原始 DataFrame
rna_featurecounts_df = pd.concat([rna_featurecounts_df, fpkm], axis=1)

#### 读取hg38 gene dict

In [10]:
df_hg38 = pd.read_csv("./hg38_111_annotate.txt", sep=' ', header=None, names=["id", "name", "type"])
# filtered_df_hg38 = df_hg38[df_hg38["type"] == "protein_coding"]
filtered_df_hg38 = df_hg38

In [11]:
filtered_df_hg38

Unnamed: 0,id,name,type
0,ENSG00000279928,DDX11L17,unprocessed_pseudogene
1,ENSG00000228037,lncRNA,lncRNA
2,ENSG00000142611,PRDM16,protein_coding
3,ENSG00000284616,lncRNA,lncRNA
4,ENSG00000157911,PEX10,protein_coding
...,...,...,...
62347,ENSG00000198695,MT-ND6,protein_coding
62348,ENSG00000210194,MT-TE,Mt_tRNA
62349,ENSG00000198727,MT-CYB,protein_coding
62350,ENSG00000210195,MT-TT,Mt_tRNA


In [12]:
rna_featurecounts_df

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,./4_hisat2/MAPQ20/BE2-shMETTL14-1-IP.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-1-input.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-2-IP.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-2-input.bam,./4_hisat2/MAPQ20/BE2-shNC-1-IP.bam,./4_hisat2/MAPQ20/BE2-shNC-1-input.bam,./4_hisat2/MAPQ20/BE2-shNC-2-IP.bam,./4_hisat2/MAPQ20/BE2-shNC-2-input.bam,FPKM_BE2-shMETTL14-1-input.bam,FPKM_BE2-shMETTL14-2-input.bam,FPKM_BE2-shNC-1-input.bam,FPKM_BE2-shNC-2-input.bam
0,ENSG00000279928,1;1;1;1;1,182696;183132;183494;183740;183981,182746;183216;183571;183901;184174,+;+;+;+;+,570,6,2,2,1,0,0,0,1,0.002050,0.000993,0.000000,0.001170
1,ENSG00000228037,1;1;1,2581560;2583370;2584125,2581650;2583495;2584533,+;+;+,626,2,0,0,0,0,0,1,2,0.000000,0.000000,0.000000,0.002131
2,ENSG00000142611,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;...,3069168;3069183;3069197;3069203;3069211;318612...,3069296;3069296;3069296;3069296;3069296;318647...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;...,9767,0,0,0,2,2,1,1,0,0.000000,0.000116,0.000070,0.000000
3,ENSG00000284616,1;1;1;1,5301928;5303328;5304401;5306942,5302004;5303393;5305029;5307394,-;-;-;-,1225,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
4,ENSG00000157911,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;...,2403964;2403974;2404061;2404613;2404838;240503...,2405834;2405834;2404371;2404891;2405828;240583...,-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;...,3782,120,253,108,264,141,246,99,225,0.039084,0.039496,0.044375,0.039680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63182,ENSG00000198695,MT,14149,14673,-,525,46645,22316,52088,23902,85246,20887,52840,21499,24.834405,25.760212,27.141790,27.312991
63183,ENSG00000210194,MT,14674,14742,-,69,284,229,334,209,364,170,312,133,1.939024,1.713846,1.680821,1.285621
63184,ENSG00000198727,MT,14747,15887,+,1141,8487,15906,9493,15818,11670,11431,8937,11647,8.144643,7.844051,6.834713,6.808301
63185,ENSG00000210195,MT,15888,15953,+,66,3,5,11,4,5,5,4,9,0.044261,0.034292,0.051683,0.090951


In [13]:
# 使用merge函数根据Geneid和id进行左连接合并，这样可以保留rna_featurecounts_df中所有行，即使没有匹配的id
merged_df = pd.merge(rna_featurecounts_df, filtered_df_hg38, left_on='Geneid', right_on='id', how='left')

# 选择需要的列，并重命名name列为Gene
result_df = merged_df.rename(columns={'name': 'Gene'})

In [14]:
result_df

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,./4_hisat2/MAPQ20/BE2-shMETTL14-1-IP.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-1-input.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-2-IP.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-2-input.bam,...,./4_hisat2/MAPQ20/BE2-shNC-1-input.bam,./4_hisat2/MAPQ20/BE2-shNC-2-IP.bam,./4_hisat2/MAPQ20/BE2-shNC-2-input.bam,FPKM_BE2-shMETTL14-1-input.bam,FPKM_BE2-shMETTL14-2-input.bam,FPKM_BE2-shNC-1-input.bam,FPKM_BE2-shNC-2-input.bam,id,Gene,type
0,ENSG00000279928,1;1;1;1;1,182696;183132;183494;183740;183981,182746;183216;183571;183901;184174,+;+;+;+;+,570,6,2,2,1,...,0,0,1,0.002050,0.000993,0.000000,0.001170,ENSG00000279928,DDX11L17,unprocessed_pseudogene
1,ENSG00000228037,1;1;1,2581560;2583370;2584125,2581650;2583495;2584533,+;+;+,626,2,0,0,0,...,0,1,2,0.000000,0.000000,0.000000,0.002131,ENSG00000228037,lncRNA,lncRNA
2,ENSG00000142611,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;...,3069168;3069183;3069197;3069203;3069211;318612...,3069296;3069296;3069296;3069296;3069296;318647...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;...,9767,0,0,0,2,...,1,1,0,0.000000,0.000116,0.000070,0.000000,ENSG00000142611,PRDM16,protein_coding
3,ENSG00000284616,1;1;1;1,5301928;5303328;5304401;5306942,5302004;5303393;5305029;5307394,-;-;-;-,1225,0,0,0,0,...,0,0,0,0.000000,0.000000,0.000000,0.000000,ENSG00000284616,lncRNA,lncRNA
4,ENSG00000157911,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;...,2403964;2403974;2404061;2404613;2404838;240503...,2405834;2405834;2404371;2404891;2405828;240583...,-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;...,3782,120,253,108,264,...,246,99,225,0.039084,0.039496,0.044375,0.039680,ENSG00000157911,PEX10,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63182,ENSG00000198695,MT,14149,14673,-,525,46645,22316,52088,23902,...,20887,52840,21499,24.834405,25.760212,27.141790,27.312991,ENSG00000198695,MT-ND6,protein_coding
63183,ENSG00000210194,MT,14674,14742,-,69,284,229,334,209,...,170,312,133,1.939024,1.713846,1.680821,1.285621,ENSG00000210194,MT-TE,Mt_tRNA
63184,ENSG00000198727,MT,14747,15887,+,1141,8487,15906,9493,15818,...,11431,8937,11647,8.144643,7.844051,6.834713,6.808301,ENSG00000198727,MT-CYB,protein_coding
63185,ENSG00000210195,MT,15888,15953,+,66,3,5,11,4,...,5,4,9,0.044261,0.034292,0.051683,0.090951,ENSG00000210195,MT-TT,Mt_tRNA


In [15]:
result_df = result_df[result_df["type"] == "protein_coding"]

In [16]:
result_df

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,./4_hisat2/MAPQ20/BE2-shMETTL14-1-IP.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-1-input.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-2-IP.bam,./4_hisat2/MAPQ20/BE2-shMETTL14-2-input.bam,...,./4_hisat2/MAPQ20/BE2-shNC-1-input.bam,./4_hisat2/MAPQ20/BE2-shNC-2-IP.bam,./4_hisat2/MAPQ20/BE2-shNC-2-input.bam,FPKM_BE2-shMETTL14-1-input.bam,FPKM_BE2-shMETTL14-2-input.bam,FPKM_BE2-shNC-1-input.bam,FPKM_BE2-shNC-2-input.bam,id,Gene,type
2,ENSG00000142611,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;...,3069168;3069183;3069197;3069203;3069211;318612...,3069296;3069296;3069296;3069296;3069296;318647...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;...,9767,0,0,0,2,...,1,1,0,0.000000,0.000116,0.000070,0.000000,ENSG00000142611,PRDM16,protein_coding
4,ENSG00000157911,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;...,2403964;2403974;2404061;2404613;2404838;240503...,2405834;2405834;2404371;2404891;2405828;240583...,-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;...,3782,120,253,108,264,...,246,99,225,0.039084,0.039496,0.044375,0.039680,ENSG00000157911,PEX10,protein_coding
9,ENSG00000142655,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1,10472288;10474950;10474982;10495274;10495274;1...,10472580;10475002;10475002;10495321;10495321;1...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+,2932,129,284,103,275,...,232,113,294,0.056591,0.053069,0.053982,0.066880,ENSG00000142655,PEX14,protein_coding
13,ENSG00000149527,1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;1;...,2425980;2430338;2467459;2476289;2476315;247847...,2426037;2430629;2467662;2476712;2476712;247862...,+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;...,10809,15,20,18,17,...,18,9,16,0.001081,0.000890,0.001136,0.000987,ENSG00000149527,PLCH2,protein_coding
15,ENSG00000171621,1;1;1;1;1;1;1;1;1;1,9292894;9294528;9351431;9355707;9355743;935574...,9293071;9294594;9351610;9356585;9356585;935658...,+;+;+;+;+;+;+;+;+;+,3389,0,0,0,0,...,0,0,0,0.000000,0.000000,0.000000,0.000000,ENSG00000171621,SPSB1,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63176,ENSG00000212907,MT,10470,10766,+,297,1406,2147,1572,2016,...,1912,1660,1944,4.223498,3.840684,4.391907,4.365662,ENSG00000212907,MT-ND4L,protein_coding
63177,ENSG00000198886,MT,10760,12137,+,1378,15946,38400,16922,36398,...,30342,17845,31040,16.280911,14.945236,15.021618,15.023904,ENSG00000198886,MT-ND4,protein_coding
63181,ENSG00000198786,MT,12337,14148,+,1812,36020,65638,49267,80697,...,48787,37823,52094,21.163810,25.198442,18.368244,19.175196,ENSG00000198786,MT-ND5,protein_coding
63182,ENSG00000198695,MT,14149,14673,-,525,46645,22316,52088,23902,...,20887,52840,21499,24.834405,25.760212,27.141790,27.312991,ENSG00000198695,MT-ND6,protein_coding


In [17]:
# 处理'Chr'列，取第一个字符并添加'chr'前缀
result_df['Chr'] = 'chr' + result_df['Chr'].str.split(';').str[0]

# 处理'Strand'列，取第一个字符
result_df['Strand'] = result_df['Strand'].str.split(';').str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['Chr'] = 'chr' + result_df['Chr'].str.split(';').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['Strand'] = result_df['Strand'].str.split(';').str[0]


In [18]:
result_df = result_df[["Gene", "Length", "Chr", "Strand", "FPKM_BE2-shMETTL14-1-input.bam", "FPKM_BE2-shMETTL14-2-input.bam", "FPKM_BE2-shNC-1-input.bam", "FPKM_BE2-shNC-2-input.bam"]]

In [19]:
# 保存结果到 CSV 文件
output_file = './7_featurecounts_rna/gene_exon_counts_with_fpkm.csv'
result_df.to_csv(output_file, sep='\t', index=False)

print(f"FPKM calculation completed. Output written to {output_file}")

FPKM calculation completed. Output written to ./7_featurecounts_rna/gene_exon_counts_with_fpkm.csv
