In [5]:
import os
import pandas as pd

# Folder holding the CSV files that this cell rewrites in place.
folder_path = 'small_expecto'

csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

for file_name in csv_files:
    file_path = os.path.join(folder_path, file_name)

    df = pd.read_csv(file_path)

    # Each file is overwritten below *without* its 'variant_id' column, so a
    # second run of this cell would otherwise die with a KeyError. Files that
    # were already split are reported and left untouched.
    if 'variant_id' not in df.columns:
        print(f"Skipped (no 'variant_id' column): {file_path}")
        continue

    # variant_id is split on '_' into exactly five parts; the fifth part
    # (named 'b38' here) is discarded together with the original column.
    df[['CHROM', 'POS', 'REF', 'ALT', 'b38']] = df['variant_id'].str.split('_', expand=True)
    df = df.drop(columns=['b38', 'variant_id'])
    df.to_csv(file_path, index=False)

    print(f"Processed file saved: {file_path}")


Processed file saved: small_expecto/train_small_Breast_Mammary_Tissue.csv
Processed file saved: small_expecto/train_small_Artery_Tibial.csv
Processed file saved: small_expecto/train_small_Thyroid.csv
Processed file saved: small_expecto/train_small_Adipose_Subcutaneous.csv
Processed file saved: small_expecto/test_small_Artery_Tibial.csv
Processed file saved: small_expecto/test_small_Colon_Transverse.csv
Processed file saved: small_expecto/test_small_Adipose_Subcutaneous.csv
Processed file saved: small_expecto/test_small_Thyroid.csv
Processed file saved: small_expecto/train_small_Nerve_Tibial.csv
Processed file saved: small_expecto/test_small_Nerve_Tibial.csv
Processed file saved: small_expecto/train_small_Colon_Transverse.csv
Processed file saved: small_expecto/test_small_Breast_Mammary_Tissue.csv


In [6]:
from pyliftover import LiftOver

# Module-level LiftOver object built from a local chain file (the file name
# indicates an hg38 -> hg19 chain). process_csv_files reads this global.
lo = LiftOver('hg38ToHg19.over.chain.gz')

def build_gene_index(gtf_file_path="hg19.knownGene.gtf"):
    """Index the records of a GTF file by chromosome.

    Every non-blank, non-comment line contributes one ``(start, end, gene_id)``
    tuple: ``start``/``end`` are columns 4 and 5 parsed as ``int`` and
    ``gene_id`` is the quoted value following ``gene_id "`` in column 9.

    Args:
        gtf_file_path: Path to a tab-separated GTF file.

    Returns:
        dict mapping the chromosome name (column 1) to a list of
        ``(start, end, gene_id)`` tuples in file order.

    Raises:
        IndexError: if a record has fewer than nine columns or no ``gene_id``
            attribute.
        ValueError: if the start or end column is not an integer.
    """
    gene_index = {}

    with open(gtf_file_path, "r") as gtf_file:
        for raw_line in gtf_file:
            line = raw_line.strip()
            # Blank lines (commonly a trailing one at end of file) carry no
            # record; previously they crashed the parser on fields[3].
            if not line or line.startswith("#"):
                continue
            fields = line.split("\t")
            chrom = fields[0]
            start, end = int(fields[3]), int(fields[4])
            gene_id = fields[8].split('gene_id "')[1].split('"')[0]
            gene_index.setdefault(chrom, []).append((start, end, gene_id))

    return gene_index


def get_gene_id_from_index(chrom, position, gene_index):
    """Return the gene_id of the first interval on ``chrom`` containing ``position``.

    Intervals are scanned in stored order and both ends are inclusive.
    Returns ``None`` when ``chrom`` is not in the index or no interval matches.
    """
    intervals = gene_index.get(chrom, ())
    matches = (
        gene_id
        for first, last, gene_id in intervals
        if first <= position <= last
    )
    return next(matches, None)

# Parse the GTF once at module level; process_csv_files reads this global
# `gene_index` for every row it annotates.
gene_index = build_gene_index("hg19.knownGene.gtf")

def process_csv_files(directory):
    """Lift every ``*.csv`` in ``directory`` over with ``lo`` and annotate genes.

    For each file the ``CHROM``/``POS`` columns are converted with the
    module-level LiftOver object ``lo``; the lifted chromosome and position
    replace the originals and a ``phenotype_id`` column is filled from the
    module-level ``gene_index``. Rows that cannot be lifted keep their
    original ``CHROM`` and get a missing ``POS`` and ``phenotype_id``.

    WARNING: each file is overwritten in place. Running this function twice
    on the same folder lifts already-lifted coordinates a second time.

    Args:
        directory: Folder containing the CSV files; each needs ``CHROM`` and
            ``POS`` columns.
    """
    for filename in os.listdir(directory):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(directory, filename)
        print(f" {file_path}")

        df = pd.read_csv(file_path)

        new_chroms = []
        new_positions = []
        new_phenotype_ids = []

        for chrom, pos in zip(df['CHROM'], df['POS']):
            hits = None
            if not pd.isna(pos):
                # pyliftover works on 0-based coordinates, whereas POS is
                # treated as a 1-based position (it is compared directly
                # against GTF start/end below), hence the -1 / +1 round trip.
                hits = lo.convert_coordinate(chrom, int(pos) - 1)

            if hits:
                # Use the chromosome reported by the liftover, which may
                # differ from the source chromosome.
                lifted_chrom, lifted_pos0 = hits[0][0], hits[0][1]
                new_pos = lifted_pos0 + 1
                new_chroms.append(lifted_chrom)
                new_positions.append(new_pos)
                new_phenotype_ids.append(
                    get_gene_id_from_index(lifted_chrom, new_pos, gene_index)
                )
            else:
                # convert_coordinate returned None/[] or POS was missing.
                new_chroms.append(chrom)
                new_positions.append(None)
                new_phenotype_ids.append(None)

        df['CHROM'] = new_chroms
        df['POS'] = new_positions
        df['phenotype_id'] = new_phenotype_ids

        df.to_csv(file_path, index=False)
        print(f" {file_path}")


# Folder whose CSV files are passed to process_csv_files, which rewrites them
# in place.
directory = 'small_expecto'  

process_csv_files(directory)


处理文件: small_expecto/train_small_Breast_Mammary_Tissue.csv
文件已保存: small_expecto/train_small_Breast_Mammary_Tissue.csv
处理文件: small_expecto/train_small_Artery_Tibial.csv
文件已保存: small_expecto/train_small_Artery_Tibial.csv
处理文件: small_expecto/train_small_Thyroid.csv
文件已保存: small_expecto/train_small_Thyroid.csv
处理文件: small_expecto/train_small_Adipose_Subcutaneous.csv
文件已保存: small_expecto/train_small_Adipose_Subcutaneous.csv
处理文件: small_expecto/test_small_Artery_Tibial.csv
文件已保存: small_expecto/test_small_Artery_Tibial.csv
处理文件: small_expecto/test_small_Colon_Transverse.csv
文件已保存: small_expecto/test_small_Colon_Transverse.csv
处理文件: small_expecto/test_small_Adipose_Subcutaneous.csv
文件已保存: small_expecto/test_small_Adipose_Subcutaneous.csv
处理文件: small_expecto/test_small_Thyroid.csv
文件已保存: small_expecto/test_small_Thyroid.csv
处理文件: small_expecto/train_small_Nerve_Tibial.csv
文件已保存: small_expecto/train_small_Nerve_Tibial.csv
处理文件: small_expecto/test_small_Nerve_Tibial.csv
文件已保存: small_expecto/test_s

In [15]:
import os
import pandas as pd


def convert_csv_to_vcf(csv_file_path, output_dir):
    """Write a headerless, five-column, tab-separated variant file from a CSV.

    Each input row becomes ``CHROM<TAB>POS<TAB>-<TAB>REF<TAB>ALT``. The output
    is ``<output_dir>/<csv base name>.vcf`` and every record, including the
    last one, is newline-terminated.

    Rows whose ``POS`` is missing are kept and written with position 0 so the
    row count matches the input; a warning with their count is printed
    because such records do not describe a real coordinate.

    Args:
        csv_file_path: CSV with ``CHROM``, ``POS``, ``REF`` and ``ALT`` columns.
        output_dir: Existing directory that receives the ``.vcf`` file.
    """
    df = pd.read_csv(csv_file_path)

    missing_pos = int(df['POS'].isna().sum())
    if missing_pos:
        print(f"Warning: {missing_pos} row(s) in {csv_file_path} have no POS and are written with position 0")

    positions = df['POS'].fillna(0).astype(int)

    # Column-wise iteration avoids building a Series per row (iterrows).
    vcf_lines = [
        f"{chrom}\t{pos}\t-\t{ref}\t{alt}"
        for chrom, pos, ref, alt in zip(df['CHROM'], positions, df['REF'], df['ALT'])
    ]

    file_name = os.path.splitext(os.path.basename(csv_file_path))[0]
    output_vcf_path = os.path.join(output_dir, f"{file_name}.vcf")

    with open(output_vcf_path, 'w') as vcf_file:
        # Terminate every record so the file is a well-formed text file.
        vcf_file.writelines(f"{line}\n" for line in vcf_lines)

    print(f" {output_vcf_path}")


def process_folder(input_folder):
    """Run convert_csv_to_vcf on every ``*.csv`` directly inside ``input_folder``.

    The generated files are written back into ``input_folder``.

    Note: a later cell of this notebook defines another ``process_folder``
    with a different signature, which replaces this one once executed.
    """
    csv_names = [name for name in os.listdir(input_folder) if name.endswith('.csv')]

    for name in csv_names:
        csv_file_path = os.path.join(input_folder, name)
        print(f"{csv_file_path}")

        # Call the conversion function for each file.
        convert_csv_to_vcf(csv_file_path, input_folder)


# Folder whose CSV files are converted; the .vcf outputs are written into the
# same folder (process_folder passes it as the output directory).
folder_path = 'small_expecto'  


process_folder(folder_path)


正在处理文件: small_expecto/train_small_Breast_Mammary_Tissue.csv
VCF 文件已保存: small_expecto/train_small_Breast_Mammary_Tissue.vcf
正在处理文件: small_expecto/train_small_Artery_Tibial.csv
VCF 文件已保存: small_expecto/train_small_Artery_Tibial.vcf
正在处理文件: small_expecto/train_small_Thyroid.csv
VCF 文件已保存: small_expecto/train_small_Thyroid.vcf
正在处理文件: small_expecto/train_small_Adipose_Subcutaneous.csv
VCF 文件已保存: small_expecto/train_small_Adipose_Subcutaneous.vcf
正在处理文件: small_expecto/test_small_Artery_Tibial.csv
VCF 文件已保存: small_expecto/test_small_Artery_Tibial.vcf
正在处理文件: small_expecto/test_small_Colon_Transverse.csv
VCF 文件已保存: small_expecto/test_small_Colon_Transverse.vcf
正在处理文件: small_expecto/test_small_Adipose_Subcutaneous.csv
VCF 文件已保存: small_expecto/test_small_Adipose_Subcutaneous.vcf
正在处理文件: small_expecto/test_small_Thyroid.csv
VCF 文件已保存: small_expecto/test_small_Thyroid.vcf
正在处理文件: small_expecto/train_small_Nerve_Tibial.csv
VCF 文件已保存: small_expecto/train_small_Nerve_Tibial.vcf
正在处理文件: small_expecto

In [19]:
import os
import pandas as pd


def convert_csv_to_closestgene(csv_file_path, output_dir):
    """Write an 11-column, tab-separated ``.closestgene`` file from a CSV.

    For every input row the output record is::

        chrom, POS-1, POS, REF, ALT, chrom, POS+tss_distance-1,
        POS+tss_distance, sign, phenotype_id, tss_distance

    where ``chrom`` is ``CHROM`` with every ``"chr"`` removed and ``sign`` is
    ``'+'`` when ``tss_distance >= 0`` and ``'-'`` otherwise. The output is
    ``<output_dir>/<csv base name>.closestgene`` and every record, including
    the last one, is newline-terminated.

    Missing ``POS`` / ``tss_distance`` values are replaced by 0 so the row
    count matches the input; a warning with their counts is printed because
    such records do not describe real coordinates.

    Args:
        csv_file_path: CSV with ``CHROM``, ``POS``, ``REF``, ``ALT``,
            ``tss_distance`` and ``phenotype_id`` columns.
        output_dir: Existing directory that receives the output file.
    """
    df = pd.read_csv(csv_file_path)

    missing_pos = int(df['POS'].isna().sum())
    missing_tss = int(df['tss_distance'].isna().sum())
    if missing_pos or missing_tss:
        print(
            f"Warning: {csv_file_path} has {missing_pos} row(s) without POS and "
            f"{missing_tss} row(s) without tss_distance; missing values are written as 0"
        )

    positions = df['POS'].fillna(0).astype(int)
    tss_distances = df['tss_distance'].fillna(0).astype(int)
    # astype(str) keeps this working when CHROM was parsed as a number
    # (e.g. chromosomes stored as 1, 2, ... without a "chr" prefix).
    chroms = df['CHROM'].astype(str).str.replace("chr", "", regex=False)

    closestgene_lines = []
    for chrom, pos, ref, alt, tss_distance, phenotype_id in zip(
        chroms, positions, df['REF'], df['ALT'], tss_distances, df['phenotype_id']
    ):
        tss_pos = pos + tss_distance
        tss_sign = '+' if tss_distance >= 0 else '-'
        closestgene_lines.append(
            f"{chrom}\t{pos - 1}\t{pos}\t{ref}\t{alt}\t{chrom}\t{tss_pos - 1}\t{tss_pos}\t{tss_sign}\t{phenotype_id}\t{tss_distance}"
        )

    file_name = os.path.splitext(os.path.basename(csv_file_path))[0]
    output_closestgene_path = os.path.join(output_dir, f"{file_name}.closestgene")

    with open(output_closestgene_path, 'w') as f:
        # Terminate every record so the file is a well-formed text file.
        f.writelines(f"{line}\n" for line in closestgene_lines)

    print(f".closestgene : {output_closestgene_path}")


def process_folder(input_folder, output_folder):
    """Run convert_csv_to_closestgene on every ``*.csv`` directly inside ``input_folder``.

    The generated files are written into ``output_folder``.

    Note: this definition replaces the one-argument ``process_folder`` defined
    in an earlier cell of this notebook.
    """
    csv_names = [name for name in os.listdir(input_folder) if name.endswith('.csv')]

    for name in csv_names:
        csv_file_path = os.path.join(input_folder, name)
        print(f" {csv_file_path}")

        convert_csv_to_closestgene(csv_file_path, output_folder)


# CSV files are read from input_folder and the .closestgene files are written
# to output_folder; both point at the same folder here.
input_folder = 'small_expecto'  
output_folder = 'small_expecto'  

process_folder(input_folder, output_folder)


正在处理文件: small_expecto/train_small_Breast_Mammary_Tissue.csv
.closestgene 文件已保存: small_expecto/train_small_Breast_Mammary_Tissue.closestgene
正在处理文件: small_expecto/train_small_Artery_Tibial.csv
.closestgene 文件已保存: small_expecto/train_small_Artery_Tibial.closestgene
正在处理文件: small_expecto/train_small_Thyroid.csv
.closestgene 文件已保存: small_expecto/train_small_Thyroid.closestgene
正在处理文件: small_expecto/train_small_Adipose_Subcutaneous.csv
.closestgene 文件已保存: small_expecto/train_small_Adipose_Subcutaneous.closestgene
正在处理文件: small_expecto/test_small_Artery_Tibial.csv
.closestgene 文件已保存: small_expecto/test_small_Artery_Tibial.closestgene
正在处理文件: small_expecto/test_small_Colon_Transverse.csv
.closestgene 文件已保存: small_expecto/test_small_Colon_Transverse.closestgene
正在处理文件: small_expecto/test_small_Adipose_Subcutaneous.csv
.closestgene 文件已保存: small_expecto/test_small_Adipose_Subcutaneous.closestgene
正在处理文件: small_expecto/test_small_Thyroid.csv
.closestgene 文件已保存: small_expecto/test_small_Thyroid.cl