In [3]:
import pandas as pd 
import glob
import os 
import re

In [None]:
def list2xlsx(dict_of_dfs,xlsx_file_name,index=True):
    """Write each DataFrame of a mapping to its own sheet in one Excel workbook.

    Parameters
    ----------
    dict_of_dfs : dict
        Maps sheet name -> object providing ``to_excel`` (typically a DataFrame).
    xlsx_file_name : str
        Destination workbook path, handed to ``pd.ExcelWriter``.
    index : bool, default True
        Forwarded to ``to_excel``; whether the row index is written.
    """
    workbook = pd.ExcelWriter(xlsx_file_name, engine='xlsxwriter')
    # The context manager closes (and thereby saves) the workbook on exit.
    with workbook as writer:
        for sheet_name in dict_of_dfs:
            frame = dict_of_dfs[sheet_name]
            # One sheet per mapping entry, named after its key.
            frame.to_excel(writer, sheet_name=sheet_name, index=index)

def mergeExpressionFile(input_meta,outfile =None):
    """Merge per-sample count and abundance tables into gene-by-sample matrices.

    Parameters
    ----------
    input_meta : str or path-like
        CSV file whose first column holds the sample IDs (used as the index)
        and which has the columns ``gene_expression`` and ``counts``. Each of
        those columns holds, per sample, the path to a tab-delimited file:

        * ``gene_expression``: table with a header row containing at least
          ``Gene ID``, ``Gene Name``, ``FPKM`` and ``TPM``.
        * ``counts``: header-less table with exactly four columns; the first
          is the gene ID and the second is the count that is kept.
    outfile : str or path-like, optional
        When given, the three matrices are also written to this Excel
        workbook (one sheet per matrix) through ``list2xlsx``.

    Returns
    -------
    dict
        ``{'counts': DataFrame, 'FPKM': DataFrame, 'TPM': DataFrame}`` with
        one column per sample. ``counts`` is indexed by ``Gene ID``; ``FPKM``
        and ``TPM`` are indexed by ``(Gene ID, Gene Name)``.

    Raises
    ------
    ValueError
        If the metadata table lists no samples or lists a sample ID twice.
    """
    df = pd.read_csv(input_meta,index_col=0)

    # Fail early with a clear message instead of pandas' generic
    # "No objects to concatenate" raised further down.
    if len(df.index) == 0:
        raise ValueError(f"No samples found in metadata file: {input_meta}")
    # A repeated sample ID would make ``df.loc[sample, col]`` return a Series
    # rather than a single path, which fails obscurely inside read_table.
    if not df.index.is_unique:
        repeated = df.index[df.index.duplicated()].unique().tolist()
        raise ValueError(f"Duplicated sample IDs in metadata file: {repeated}")

    per_sample = {'counts': [], 'FPKM': [], 'TPM': []}
    for each_sample in df.index:
        expr_file = df.loc[each_sample,'gene_expression']
        count_file = df.loc[each_sample,'counts']

        df_expr = _load_sample_abundance(expr_file)
        per_sample['counts'].append(_load_sample_counts(count_file, each_sample))

        # One single-column frame per measure, named after the sample so the
        # column-wise concat below yields a gene x sample matrix.
        for measure in ('FPKM', 'TPM'):
            per_sample[measure].append(
                df_expr[[measure]].rename(columns={measure: each_sample})
            )

    res_dict = {
        measure: pd.concat(frames, axis=1)
        for measure, frames in per_sample.items()
    }

    # Accept pathlib.Path (any os.PathLike) as well as plain strings.
    if isinstance(outfile, (str, os.PathLike)):
        print(f"The xlsx file supplied: {outfile}!")
        list2xlsx(dict_of_dfs = res_dict,xlsx_file_name=outfile)
    return res_dict


def _load_sample_counts(count_file, sample):
    """Read one header-less count table; return its second column named ``sample``."""
    counts = pd.read_table(count_file, header=None)
    counts.columns = ['Gene ID', 'unstrand', 'First', 'Second']
    # Keep the first row per gene ID, then index by gene ID.
    counts = counts.drop_duplicates(subset='Gene ID', keep='first').set_index('Gene ID')
    # Drop rows whose ID starts with "N_" (presumably aligner summary rows
    # such as N_unmapped -- TODO confirm against the upstream pipeline).
    counts = counts[~counts.index.str.startswith('N_')]
    return counts[['unstrand']].rename(columns={'unstrand': sample})


def _load_sample_abundance(expr_file):
    """Read one abundance table, de-duplicated on gene ID, indexed by (Gene ID, Gene Name)."""
    expr = pd.read_table(expr_file)
    expr = expr.drop_duplicates(subset='Gene ID', keep='first')
    return expr.set_index(['Gene ID', 'Gene Name'])

In [20]:
def prepareSample(input_dir):
    """Build a sample sheet of FASTQ files found under ``input_dir``.

    Parameters
    ----------
    input_dir : str
        Directory searched recursively for files matching ``*fastq.gz``.

    Returns
    -------
    pandas.DataFrame
        Indexed by sample ID (the ``HRR<digits>`` token of the file name),
        with one column per read tag (``R1`` / ``R2``) holding the file path.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` when two files map to the same
        (sample, read tag) pair.
    """
    # recursive=True is required for "**" to span directory levels; without it
    # "**" behaves like "*" and only files exactly one level down are matched.
    all_files = glob.glob(input_dir + "/**/*fastq.gz", recursive=True)
    # Explicit object dtype keeps the .str accessor usable when nothing matched.
    df = pd.DataFrame({
        'file': pd.Series(all_files, dtype=object)
    })
    # Parse the file name only, so directory names cannot influence the result.
    file_names = df['file'].map(os.path.basename)
    # The read tag must be a standalone token (e.g. "_R1." or "_R2_001"):
    # an unanchored "(R1|R2)" also matches inside accessions such as
    # "HRR1234567", tagging both mates "R1" and making the pivot fail.
    df['reads'] = file_names.str.extract(
        r"(?<![A-Za-z0-9])(R[12])(?![A-Za-z0-9])", expand=False
    )
    df['sample'] = file_names.str.extract(r"(HRR\d+)", expand=False)
    df = df.pivot(index='sample',columns='reads',values='file')
    return df
    


In [None]:
# Build the sample sheet: one row per HRR sample, with R1/R2 FASTQ paths as columns.
df = prepareSample('/home/keke/base/raw/HRA003647')

# Save the sample sheet (sample ID as the index column) to the current working directory.
df.to_csv('sample.csv')

In [None]:
# Merge the per-sample tables listed in meta.csv and export them to an xlsx workbook.
# NOTE: despite its name, `expr_file` holds the returned dict of DataFrames
# (keys 'counts', 'FPKM', 'TPM'), not a file path.
expr_file = mergeExpressionFile('/home/sunqiangqiang/data/project/immune/PRJNA482620/results/mRNA/meta.csv','PRJNA482620_expr.xlsx')