In [1]:
import os
import pandas as pd

In [2]:
def get_stat(seqFilepath, key):
    """
    Look up one value in the ".stat" file that accompanies a FASTQ file.

    The stat file path is built from seqFilepath by cutting the *file name*
    at ".fastq" and appending ".stat"; the directory part is left untouched,
    so a directory whose name happens to contain ".fastq" does not truncate
    the path. The stat file is read as a tab-delimited table with a header
    row, and the value in the first data row of column `key` is returned.

    Parameters
    ----------
    seqFilepath : str
        Path to a FASTQ file (e.g. "x.fastq", "x.fastq.gz", "x.tmp.fastq").
    key : str
        Column name to read from the stat table.

    Returns
    -------
    The value found in the first data row of column `key`, or None (after
    printing an "ERROR: ..." message) when the stat file is missing, the
    column is absent, or the table has no data rows.
    """

    # Split off the directory by length so the original path text
    # (separators included) is reproduced exactly in the stat path.
    basename = os.path.basename(seqFilepath)
    direc = seqFilepath[:len(seqFilepath) - len(basename)]
    statFilepath = direc + basename.split('.fastq')[0] + ".stat"

    if not os.path.isfile(statFilepath):
        print("ERROR: stat file not exists, {}".format(statFilepath))
        return None

    stat_df = pd.read_csv(statFilepath, delimiter='\t')
    if key not in stat_df.columns:
        print("ERROR: key not exists, {}".format(key))
        return None
    if stat_df.empty:
        # Header-only stat file: there is no first row to read.
        print("ERROR: stat file has no rows, {}".format(statFilepath))
        return None

    # Positional access: take the first data row whatever the index labels are.
    return stat_df[key].iloc[0]

In [3]:
# Read the CSV table of FASTQ file locations and preview its first rows.
fastqFilepath = "filepath_dna.csv"
fastq_df = pd.read_csv(filepath_or_buffer=fastqFilepath)
fastq_df.head(5)

Unnamed: 0,fastq_id,sample_id,direction,raw,google,sqc
0,MFC1_06m_anode_felt_1_R1,MFC1_06m_anode_felt_1,R1,/work/GoryaninU/mitsuki/mizuho/dna/raw/MFC1_06...,/work/GoryaninU/mitsuki/mizuho/dna/google/6mon...,/work/sqc_data/GoryaninU/150225_M02343_0059_00...
1,MFC1_06m_anode_felt_1_R2,MFC1_06m_anode_felt_1,R2,/work/GoryaninU/mitsuki/mizuho/dna/raw/MFC1_06...,/work/GoryaninU/mitsuki/mizuho/dna/google/6mon...,/work/sqc_data/GoryaninU/150225_M02343_0059_00...
2,MFC1_06m_anode_felt_2_R1,MFC1_06m_anode_felt_2,R1,/work/GoryaninU/mitsuki/mizuho/dna/raw/MFC1_06...,/work/GoryaninU/mitsuki/mizuho/dna/google/6mon...,/work/sqc_data/GoryaninU/140627_M02137_0045_00...
3,MFC1_06m_anode_felt_2_R2,MFC1_06m_anode_felt_2,R2,/work/GoryaninU/mitsuki/mizuho/dna/raw/MFC1_06...,/work/GoryaninU/mitsuki/mizuho/dna/google/6mon...,/work/sqc_data/GoryaninU/140627_M02137_0045_00...
4,MFC1_06m_anode_granules_1_R1,MFC1_06m_anode_granules_1,R1,/work/GoryaninU/mitsuki/mizuho/dna/raw/MFC1_06...,/work/GoryaninU/mitsuki/mizuho/dna/google/6mon...,/work/sqc_data/GoryaninU/150225_M02343_0059_00...


In [11]:
# Gather the "fastq_id" column of both metadata tables into one flat list
# and report how many ids were collected.
# NOTE(review): the following cell reassigns fastqId_lst from scratch, so on
# a top-to-bottom run this list is discarded - confirm which one is intended.
fastqId_lst = []
for metaFilepath in ("mizuho_metadata_dna.csv", "re_mizuho_metadata_dna.csv"):
    meta_df = pd.read_csv(metaFilepath)
    fastqId_lst.extend(list(meta_df["fastq_id"]))
print(len(fastqId_lst))

118


In [5]:
# Expand each reference sample id into its two read-direction ids
# ("<sample>_R1" followed by "<sample>_R2"), keeping sample order.
sampleId_lst = ["MFC2_reference_1", "MFC2_reference_2", "MFC3_reference"]
fastqId_lst = []
for sampleId in sampleId_lst:
    fastqId_lst.extend(sampleId + direction for direction in ("_R1", "_R2"))

In [6]:
# Display the current contents of fastqId_lst as a visual check.
fastqId_lst

['MFC2_reference_1_R1',
 'MFC2_reference_1_R2',
 'MFC2_reference_2_R1',
 'MFC2_reference_2_R2',
 'MFC3_reference_R1',
 'MFC3_reference_R2']

In [7]:
# For every FASTQ id, look up "num_seqs" in the stat file of each processing
# step and collect one record (dict) per id in dct_lst.
# get_stat prints an error and returns None when a stat file or key is missing,
# so a record may contain None values.
# NOTE(review): the first step uses a "row" directory while the paths shown in
# fastq_df use "raw" - confirm which directory holds the stat files.
# NOTE(review): the "correct" step reads "<id>.tmp.fastq" under filter/ -
# confirm that this is where the corrected reads are written.
baseDirec = "/work/GoryaninU/mitsuki/mizuho/dna"
dct_lst = []
for fastqId in fastqId_lst:
    print("START: {}".format(fastqId))

    # Step names and the FASTQ path of each step, in matching order.
    step_lst = ["row", "trim", "correct", "filter"]
    fp_lst = [
        "{}/row/{}.fastq.gz".format(baseDirec, fastqId),
        "{}/trim/{}.fastq".format(baseDirec, fastqId),
        "{}/filter/{}.tmp.fastq".format(baseDirec, fastqId),
        "{}/filter/{}.fastq".format(baseDirec, fastqId),
    ]
    assert len(step_lst) == len(fp_lst)

    dct = {"fastq_id": fastqId}
    for step, fp in zip(step_lst, fp_lst):
        dct[step] = get_stat(fp, "num_seqs")
    dct_lst.append(dct)

START: MFC2_reference_1_R1
ERROR: stat file not exists, /work/GoryaninU/mitsuki/mizuho/dna/row/MFC2_reference_1_R1.stat
START: MFC2_reference_1_R2
ERROR: stat file not exists, /work/GoryaninU/mitsuki/mizuho/dna/row/MFC2_reference_1_R2.stat
START: MFC2_reference_2_R1
ERROR: stat file not exists, /work/GoryaninU/mitsuki/mizuho/dna/row/MFC2_reference_2_R1.stat
START: MFC2_reference_2_R2
ERROR: stat file not exists, /work/GoryaninU/mitsuki/mizuho/dna/row/MFC2_reference_2_R2.stat
START: MFC3_reference_R1
ERROR: stat file not exists, /work/GoryaninU/mitsuki/mizuho/dna/row/MFC3_reference_R1.stat
START: MFC3_reference_R2
ERROR: stat file not exists, /work/GoryaninU/mitsuki/mizuho/dna/row/MFC3_reference_R2.stat


In [9]:
# Turn the collected records into a table with a fixed column order and add
# "per": reads in the "filter" step as a percentage of reads in the "row" step.
column_order = ["fastq_id", "row", "trim", "correct", "filter"]
out_df = (
    pd.DataFrame(dct_lst)[column_order]
    .assign(per=lambda counts: counts["filter"] / counts["row"] * 100)
)
out_df

Unnamed: 0,fastq_id,row,trim,correct,filter,per
0,MFC2_reference_1_R1,,6094565,4850101,4688031,
1,MFC2_reference_1_R2,,6094565,5061507,4688031,
2,MFC2_reference_2_R1,,1915273,1215544,1145179,
3,MFC2_reference_2_R2,,1915273,1337857,1145179,
4,MFC3_reference_R1,,1283219,777481,703553,
5,MFC3_reference_R2,,1283219,811044,703553,


In [22]:
# Average of the "per" column (filter / row * 100) over all rows of out_df.
out_df["per"].mean()

55.886315613837759

In [6]:
# Save the summary table as CSV without the index column.
# The output directory is created first so that a fresh checkout (no ./out
# yet) does not make to_csv fail.
if not os.path.isdir("./out"):
    os.makedirs("./out")
out_df.to_csv("./out/num_seqs.csv", index=False)

In [None]:
# Spot check: read "num_seqs" for a single FASTQ file.
# The notebook defines no get_num_seqs; get_stat with the "num_seqs" key is
# the lookup used by the collection loop, so it is called here as well.
seqFilepath="/work/GoryaninU/mitsuki/mizuho/dna/row/MFC2_36m_channel8_R2.fastq.gz"
get_stat(seqFilepath, "num_seqs")