添加文件

In [1]:
# Step 1: import pandas, the data-analysis library used here to read and
# inspect tabular data such as .tsv files.
import pandas as pd

# Step 2: locate the input files.
# This cell is meant to run from the 'scripts/preprocessing' folder, so
# '../../' climbs back to the project root before descending into
# 'data/raw_data/' where the report files live.
pg_matrix_path = '../../data/raw_data/report.pg_matrix.tsv'
pr_matrix_path = '../../data/raw_data/report.pr_matrix.tsv'

# Step 3: load both tab-separated (TSV) files into pandas DataFrames.
# pd.read_table() is pd.read_csv() with the tab character as the default
# separator, so no explicit `sep` argument is needed.
try:
    protein_df = pd.read_table(pg_matrix_path)
    peptide_df = pd.read_table(pr_matrix_path)

    # Step 4: preview the protein-group matrix.
    # head(5) returns the first five rows, enough for a quick look at the
    # table layout.
    # NOTE(review): display() is not imported in this cell; presumably it is
    # supplied by the Jupyter/IPython session.
    print("蛋白质组矩阵 (report.pg_matrix.tsv) 的前5行:")
    display(protein_df.head(5))

    # Step 5: preview the precursor (peptide) matrix the same way.
    print("\n肽段矩阵 (report.pr_matrix.tsv) 的前5行:")
    display(peptide_df.head(5))

except FileNotFoundError as err:
    # Report the missing file, then list the usual causes. A single print
    # with sep="\n" emits exactly one line per argument.
    print(
        f"文件未找到错误: {err}",
        "\n请确认以下几点：",
        "1. 您的 Jupyter Notebook 是否确实保存在 'scripts/preprocessing/' 文件夹中？",
        "2. 'data' 和 'scripts' 文件夹是否在同一个项目根目录下？",
        "3. 文件名 'report.pg_matrix.tsv' 和 'report.pr_matrix.tsv' 是否正确？",
        sep="\n",
    )

蛋白质组矩阵 (report.pg_matrix.tsv) 的前5行:


Unnamed: 0,Protein.Group,Protein.Names,Genes,First.Protein.Description,N.Sequences,N.Proteotypic.Sequences,D:\20250902_SP_pre_experiment_1_data\JDL_1_RC7_1_32932.d,D:\20250902_SP_pre_experiment_1_data\JDL_2_RC8_1_32933.d,D:\20250902_SP_pre_experiment_1_data\JDL_3_RD1_1_32934.d,D:\20250902_SP_pre_experiment_1_data\JDL_4_RD2_1_32935.d
0,A0A075B5N3;A0A0G2JE47;Q6KB05,A0A075B5N3_MOUSE;A0A0G2JE47_MOUSE;Q6KB05_MOUSE,Igkv8-28;scFv,,1,0,,9172.38,314348.0,14467.6
1,A0A075B5P5;A0A1Y7VJN6;A0A4U9FKB1;Q6KAM2,A0A075B5P5_MOUSE;A0A1Y7VJN6_MOUSE;A0A4U9FKB1_M...,Ighg3,,1,1,18983.3,,,22708.1
2,A0A087WNW3;A0A087WP14;A0A087WP48;A0A087WP85;A0...,A0A087WNW3_MOUSE;A0A087WP14_MOUSE;A0A087WP48_M...,Ktn1,,9,7,125996.0,143374.0,,169145.0
3,A0A087WP24;A0A087WPF8;A0A087WRJ2;A0A087WSR2;E9...,A0A087WP24_MOUSE;A0A087WPF8_MOUSE;A0A087WRJ2_M...,Abhd14b,,4,4,28182.6,32479.3,52088.3,39553.1
4,A0A087WPC9,A0A087WPC9_MOUSE,Dnah7b,,1,1,,16579.9,68077.7,17971.0



肽段矩阵 (report.pr_matrix.tsv) 的前5行:


Unnamed: 0,Protein.Group,Protein.Ids,Protein.Names,Genes,First.Protein.Description,Proteotypic,Stripped.Sequence,Modified.Sequence,Precursor.Charge,Precursor.Id,D:\20250902_SP_pre_experiment_1_data\JDL_1_RC7_1_32932.d,D:\20250902_SP_pre_experiment_1_data\JDL_2_RC8_1_32933.d,D:\20250902_SP_pre_experiment_1_data\JDL_3_RD1_1_32934.d,D:\20250902_SP_pre_experiment_1_data\JDL_4_RD2_1_32935.d
0,P55012,P55012;E9QM38,S12A2_MOUSE,Slc12a2,,1,AAAAAAAAAAAAAAAGAAGK,AAAAAAAAAAAAAAAGAAGK,3,AAAAAAAAAAAAAAAGAAGK3,9735.86,,,17339.5
1,Q3TW96,Q3TW96;Q8BVK3,UAP1L_MOUSE,Uap1l1,,1,AAAAGALAPGPLPDLAAR,AAAAGALAPGPLPDLAAR,2,AAAAGALAPGPLPDLAAR2,62955.7,45491.4,,67201.5
2,Q80X85,Q80X85,RT07_MOUSE,Mrps7,,1,AAAATETSSVFADPVISK,AAAATETSSVFADPVISK,2,AAAATETSSVFADPVISK2,23096.4,18996.0,,17385.5
3,E9PX29;Q8VIE5,E9PX29;E9PZC2;Q8VBX2;Q8VIE5;Q91ZE6,E9PX29_MOUSE;Q8VIE5_MOUSE,Sptbn4,,1,AAAAWEER,AAAAWEER,2,AAAAWEER2,,,,15692.2
4,E9Q6U4;Q3UWL8,E9Q6U4;Q3UWL8,E9Q6U4_MOUSE;Q3UWL8_MOUSE,Pfdn4,,1,AAAEDVNVTFEDQQK,AAAEDVNVTFEDQQK,2,AAAEDVNVTFEDQQK2,46920.4,51620.6,301061.0,47975.8
