In [1]:
import pandas as pd

# 指定 SOFT 文件路径
file_path = "GSE176265_family.soft"  # 请修改为你的文件路径

# 读取 SOFT 文件内容
with open(file_path, "r", encoding="utf-8") as file:
    soft_lines = file.readlines()

# 查找数据部分（通常以 "ID_REF" 开头）
data_start_idx = None
for i, line in enumerate(soft_lines):
    if line.startswith("ID_REF"):
        data_start_idx = i
        break

# 提取数据表并转换为 DataFrame
if data_start_idx is not None:
    # 分割数据部分
    data_content = [line.strip().split("\t") for line in soft_lines[data_start_idx:]]

    # 创建 DataFrame
    df = pd.DataFrame(data_content[1:], columns=data_content[0])

    # 转换 VALUE 列为数值类型（如果存在）
    if "VALUE" in df.columns:
        df["VALUE"] = pd.to_numeric(df["VALUE"], errors="coerce")

    # 显示 DataFrame 基本信息
    print("数据集信息：")
    print(df.info())

    # 显示前几行
    print("\n数据预览：")
    print(df.head())
else:
    print("未找到数据表部分")


数据集信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087757 entries, 0 to 1087756
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   ID_REF          1087757 non-null  object 
 1   VALUE           1086744 non-null  float64
 2   Detection Pval  1086767 non-null  object 
dtypes: float64(1), object(2)
memory usage: 24.9+ MB
None

数据预览：
         ID_REF     VALUE Detection Pval
0  ILMN_2735294  1.957591      0.1720085
1  ILMN_2417611  1.505660      0.7735043
2  ILMN_2545897  2.226385     0.04807692
3  ILMN_2762289  1.865122      0.2382479
4  ILMN_1248788  1.621418      0.5576923


In [2]:
df

Unnamed: 0,ID_REF,VALUE,Detection Pval
0,ILMN_2735294,1.957591,0.1720085
1,ILMN_2417611,1.505660,0.7735043
2,ILMN_2545897,2.226385,0.04807692
3,ILMN_2762289,1.865122,0.2382479
4,ILMN_1248788,1.621418,0.5576923
...,...,...,...
1087752,ILMN_2432039,2.604282,0.007478632
1087753,ILMN_2475617,2.069228,0.142094
1087754,ILMN_2432040,2.356303,0.03739316
1087755,ILMN_2424408,2.190243,0.08440171


In [3]:
import pandas as pd

# 文件路径
file_path = "GSE176265_family.soft"  # 请修改为你的文件路径

# 读取文件内容
with open(file_path, "r", encoding="utf-8") as file:
    soft_lines = file.readlines()

# 解析样本信息部分
sample_info = {}
sample_section = False

for line in soft_lines:
    line = line.strip()
    
    # 进入样本信息部分
    if line.startswith("^SAMPLE"):
        current_sample = line.split(" = ")[1]  # 记录当前样本编号
        sample_info[current_sample] = {}
        sample_section = True
        continue

    # 解析样本的具体信息
    if sample_section and line.startswith("!Sample_"):
        key, value = line.split(" = ")
        sample_info[current_sample][key] = value

# 转换样本信息为 DataFrame
sample_df = pd.DataFrame.from_dict(sample_info, orient="index")

# 查找基因表达数据部分（通常以 "ID_REF" 开头）
data_start_idx = None
for i, line in enumerate(soft_lines):
    if line.startswith("ID_REF"):
        data_start_idx = i
        break

# 提取基因表达数据
if data_start_idx is not None:
    data_content = [line.strip().split("\t") for line in soft_lines[data_start_idx:]]
    expression_df = pd.DataFrame(data_content[1:], columns=data_content[0])

    # 转换 VALUE 列为数值类型
    expression_df.iloc[:, 1:] = expression_df.iloc[:, 1:].apply(pd.to_numeric, errors="coerce")

    # 获取样本对应的 GEO ID
    expression_samples = expression_df.columns[1:]

    # 匹配样本信息（获取时间点、实验分组）
    matched_samples = sample_df.loc[sample_df["!Sample_geo_accession"].isin(expression_samples), ["!Sample_geo_accession", "!Sample_title"]]

    # 重新整理数据：按基因、时间点、实验组
    melted_df = expression_df.melt(id_vars=["ID_REF"], var_name="Sample", value_name="Expression")
    merged_df = melted_df.merge(matched_samples, left_on="Sample", right_on="!Sample_geo_accession", how="left")

    # 处理缺失值，防止 TypeError
    merged_df["!Sample_title"] = merged_df["!Sample_title"].fillna("Unknown")

    # 添加时间点信息
    merged_df["Time"] = merged_df["!Sample_title"].str.extract(r'(\d+h)')  

    # 添加实验条件信息
    merged_df["Condition"] = merged_df["!Sample_title"].apply(lambda x: "RANKL" if "RANKL" in x else "Control")

    # 显示最终整理后的数据
    print("数据集整理成功！\n")
    print(merged_df.head())

    # 保存到 CSV 文件
    ##print("\n已保存整理后的数据至 'Processed_GEO_Data.csv'")
else:
    print("未找到基因表达数据部分")


数据集整理成功！

         ID_REF Sample Expression !Sample_geo_accession !Sample_title Time  \
0  ILMN_2735294  VALUE   1.957591                   NaN       Unknown  NaN   
1  ILMN_2417611  VALUE    1.50566                   NaN       Unknown  NaN   
2  ILMN_2545897  VALUE   2.226385                   NaN       Unknown  NaN   
3  ILMN_2762289  VALUE   1.865122                   NaN       Unknown  NaN   
4  ILMN_1248788  VALUE   1.621418                   NaN       Unknown  NaN   

  Condition  
0   Control  
1   Control  
2   Control  
3   Control  
4   Control  


In [4]:
merged_df

Unnamed: 0,ID_REF,Sample,Expression,!Sample_geo_accession,!Sample_title,Time,Condition
0,ILMN_2735294,VALUE,1.957591,,Unknown,,Control
1,ILMN_2417611,VALUE,1.50566,,Unknown,,Control
2,ILMN_2545897,VALUE,2.226385,,Unknown,,Control
3,ILMN_2762289,VALUE,1.865122,,Unknown,,Control
4,ILMN_1248788,VALUE,1.621418,,Unknown,,Control
...,...,...,...,...,...,...,...
2175509,ILMN_2432039,Detection Pval,0.007479,,Unknown,,Control
2175510,ILMN_2475617,Detection Pval,0.142094,,Unknown,,Control
2175511,ILMN_2432040,Detection Pval,0.037393,,Unknown,,Control
2175512,ILMN_2424408,Detection Pval,0.084402,,Unknown,,Control


In [5]:
merged_df['Sample'].nunique()

2

In [6]:
merged_df['!Sample_geo_accession'].nunique()

0

In [7]:
merged_df['!Sample_title'].nunique()

1

In [8]:
set(merged_df['!Sample_title'].tolist())

{'Unknown'}

In [9]:
merged_df['Time'].nunique()

0

In [10]:
merged_df['Condition'].nunique()

1

In [15]:
import pandas as pd

# 文件路径
file_path = "GSE176265_family.soft"  # 请修改为你的文件路径

# 读取文件内容
with open(file_path, "r", encoding="utf-8") as file:
    soft_lines = file.readlines()

# 解析样本信息部分
sample_info = {}
sample_section = False

for line in soft_lines:
    line = line.strip()
    
    # 进入样本信息部分
    if line.startswith("^SAMPLE"):
        current_sample = line.split(" = ")[1]  # 记录当前样本编号
        sample_info[current_sample] = {}
        sample_section = True
        continue

    # 解析样本的具体信息
    if sample_section and line.startswith("!Sample_"):
        key, value = line.split(" = ")
        sample_info[current_sample][key] = value

# 转换样本信息为 DataFrame
sample_df = pd.DataFrame.from_dict(sample_info, orient="index")

# 查找基因表达数据部分（通常以 "ID_REF" 开头）
data_start_idx = None
for i, line in enumerate(soft_lines):
    if line.startswith("ID_REF"):
        data_start_idx = i
        break

# 提取基因表达数据
if data_start_idx is not None:
    data_content = [line.strip().split("\t") for line in soft_lines[data_start_idx:]]
    expression_df = pd.DataFrame(data_content[1:], columns=data_content[0])

    # 确保 VALUE 列为数值类型
    expression_df.iloc[:, 1:] = expression_df.iloc[:, 1:].apply(pd.to_numeric, errors="coerce")

    # 获取基因表达数据的样本列（GEO 样本 ID）
    expression_samples = expression_df.columns[1:]

    # 在样本信息表 `sample_df` 中找到匹配的 GEO 样本 ID
    matched_samples = sample_df[sample_df["!Sample_geo_accession"].isin(expression_samples)][["!Sample_geo_accession", "!Sample_title"]]

    # 检查是否成功匹配
    print("\n已找到的 GEO 样本 ID 数量:", len(matched_samples))
    print("\n匹配的样本信息（前5行）:\n", matched_samples.head())

    # 保存匹配的样本信息到 CSV
    matched_samples.to_csv("Matched_GEO_Samples.csv", index=False)
    print("\n匹配的样本信息已保存至 'Matched_GEO_Samples.csv'")

else:
    print("未找到基因表达数据部分")



已找到的 GEO 样本 ID 数量: 0

匹配的样本信息（前5行）:
 Empty DataFrame
Columns: [!Sample_geo_accession, !Sample_title]
Index: []

匹配的样本信息已保存至 'Matched_GEO_Samples.csv'


In [1]:
import pandas as pd

# 修改为你的本地 SOFT 文件路径
file_path = "GSE176265_family.soft"

with open(file_path, "r", encoding="utf-8") as file:
    soft_lines = file.readlines()

# 识别 Platform Table 的开始和结束位置
platform_start_idx = None
platform_end_idx = None

for i, line in enumerate(soft_lines):
    if line.startswith("!platform_table_begin"):
        platform_start_idx = i
    if line.startswith("!platform_table_end"):
        platform_end_idx = i
        break

# 提取 Platform Table 数据
if platform_start_idx is not None and platform_end_idx is not None:
    platform_data = [line.strip().split("\t") for line in soft_lines[platform_start_idx + 1: platform_end_idx]]
    platform_df = pd.DataFrame(platform_data[1:], columns=platform_data[0])

    # 显示数据基本信息
    print(platform_df.info())
    print(platform_df.head())

    # 保存数据到 CSV
    platform_df.to_csv("Platform_Table.csv", index=False)
    print("\nPlatform Table 数据已保存至 'Platform_Table.csv'")
else:
    print("未找到 Platform Table")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45281 entries, 0 to 45280
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ID                     45281 non-null  object
 1   Species                45281 non-null  object
 2   Source                 45281 non-null  object
 3   Search_Key             45281 non-null  object
 4   Transcript             45281 non-null  object
 5   ILMN_Gene              45281 non-null  object
 6   Source_Reference_ID    45281 non-null  object
 7   RefSeq_ID              45281 non-null  object
 8   Entrez_Gene_ID         45281 non-null  object
 9   GI                     45281 non-null  object
 10  Accession              45281 non-null  object
 11  Symbol                 45281 non-null  object
 12  Protein_Product        45281 non-null  object
 13  Probe_Id               45281 non-null  object
 14  Array_Address_Id       45281 non-null  object
 15  Probe_Type         

In [2]:
platform_df

Unnamed: 0,ID,Species,Source,Search_Key,Transcript,ILMN_Gene,Source_Reference_ID,RefSeq_ID,Entrez_Gene_ID,GI,...,Probe_Coordinates,Cytoband,Definition,Ontology_Component,Ontology_Process,Ontology_Function,Synonyms,Obsolete_Probe_Id,GB_ACC,ORF
0,ILMN_1243094,Mus musculus,Riken,ILMN_204164,ILMN_204164,THRSP,ri|C730035M01|PX00087M15|AK050300|1404,,,,...,,,,A membrane-bounded organelle of eukaryotic cel...,,,,,AK050300,THRSP
1,ILMN_1238674,Mus musculus,RefSeq,ILMN_188674,ILMN_245303,2700007P21RIK,NM_173750.2,NM_173750.2,212772,68299772,...,106804608-106804657,2qE3,Mus musculus RIKEN cDNA 2700007P21 gene (27000...,,,,4930448O08Rik; RP23-12N7.2,4930448O08Rik; RP23-12N7.2,NM_173750.2,2700007P21RIK
2,ILMN_2454720,Mus musculus,RefSeq,ILMN_188674,ILMN_245303,2700007P21RIK,NM_173750.2,NM_173750.2,212772,68299772,...,106802799-106802848,2qE3,Mus musculus RIKEN cDNA 2700007P21 gene (27000...,,,,4930448O08Rik; RP23-12N7.2,4930448O08Rik; RP23-12N7.2,NM_173750.2,2700007P21RIK
3,ILMN_3062534,Mus musculus,RefSeq,ILMN_245303,ILMN_245303,2700007P21RIK,NM_173750.2,NM_173750.2,212772,68299772,...,106810479-106810528,2qE3,Mus musculus RIKEN cDNA 2700007P21 gene (27000...,,,,4930448O08Rik; RP23-12N7.2,4930448O08Rik; RP23-12N7.2,NM_173750.2,2700007P21RIK
4,ILMN_3140158,Mus musculus,RefSeq,ILMN_245303,ILMN_245303,2700007P21RIK,NM_173750.2,NM_173750.2,212772,68299772,...,106803057-106803106,2qE3,Mus musculus RIKEN cDNA 2700007P21 gene (27000...,,,,4930448O08Rik; RP23-12N7.2,4930448O08Rik; RP23-12N7.2,NM_173750.2,2700007P21RIK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45276,ILMN_2592571,Mus musculus,RefSeq,ILMN_209103,ILMN_231998,DNAJC28,NM_138664.2,NM_138664.2,246738,153267501,...,,16qC3.3,"Mus musculus DnaJ (Hsp40) homolog, subfamily C...",,,Interacting selectvely with a heat shock prote...,BC020175; MGC27620,BC020175; MGC27620,NM_138664.2,DNAJC28
45277,ILMN_2659025,Mus musculus,MEEBO,ILMN_215400,ILMN_215400,MAPKAP1,scl00227743.1_265,NM_177345.2,,31340903,...,,,,"A vesicle formed of membrane or protein, found...",,Catalysis of the transfer of a phosphate group...,,,NM_177345.2,MAPKAP1
45278,ILMN_1221466,Mus musculus,Riken,ILMN_202341,ILMN_202341,1200003N10RIK,ri|1200003N10|R000008O13|AK004585|1268,,,,...,,,,,,,,,AK004585,1200003N10RIK
45279,ILMN_2607075,Mus musculus,RefSeq,ILMN_210581,ILMN_210581,OLFR692,NM_146355.1,NM_146355.1,258352,22129592,...,112517750-112517799,7qE3,"Mus musculus olfactory receptor 692 (Olfr692),...",Double layer of lipid molecules that encloses ...,The cascade of processes by which a signal int...,A receptor that binds an extracellular ligand ...,MOR36-1,MOR36-1,NM_146355.1,OLFR692
