In [1]:
import sqlite3
import glob
import pandas as pd
import traceback

In [225]:
def extract_info_from_log(log_file, sample_name, run0420 = None):
    """
    Extracts information from a ShapeMapper log file.

    Parameters:
        log_file (str): Path to the ShapeMapper log file.
        sample_name (str): Name of the sample to check against the R1 file.

    Returns:
        tuple: Contains the following elements:
            - run_datetime (str): The datetime when the ShapeMapper run started.
            - version (str): The version of ShapeMapper used.
            - r1_file (str): The R1 file used in the run.
            - untreated (int): Indicates if the sample was untreated (1 if untreated, 0 otherwise).
            - denatured (int): Indicates if the sample was denatured (1 if denatured, 0 otherwise).
            - sample_check (bool): Indicates if the sample name matches the R1 file.
    """

    with open(log_file) as f:
        lines = f.readlines()

    # find all lines containing "Started ShapeMapper" and get index of most recent one
    detect_shapemapper_runs = [i for i, line in enumerate(lines) if 'Started ShapeMapper' in line]
    assert len(detect_shapemapper_runs) > 0, 'No ShapeMapper runs detected in log file'

    most_recent_run = detect_shapemapper_runs[-1]
    lines = lines[most_recent_run:]

    # check shapemapper success
    run_completed = [i for i, line in enumerate(lines) if ('ShapeMapper run completed' in line) or ('ShapeMapper run successfully completed' in line)]
    if len(run_completed) == 0:
        print(f'ShapeMapper run not completed successfully in log file: {log_file}')
        return None

    #print(log_file)
    # extract date and version from:  "Started ShapeMapper v2.2.0 at 2023-04-22 17:19:59"
    version_date_line = lines[0]
    run_datetime = version_date_line.split(' at ')[1].rstrip()
    version = version_date_line.split(' ')[2]
    run_args = lines[2]
    assert 'args: ' in run_args, 'args line not found in log file'
    
    # get index of 'modified'
    modified_index = run_args.split(' --').index('modified')
    assert modified_index > 0, 'modified not found in run_args'

    # extract R1 file
    r1_file = run_args.split(' --')[modified_index + 1].split(' ')[-1]
    assert (r1_file is not None) or (r1_file == ''), 'R1 file not found in run_args'

    untreated = 0
    denatured = 0
    # check if untreated sample provided
    if 'untreated' in run_args.split(' --'):
        untreated_index = run_args.split(' --').index('untreated')
        untreated_r1_file = run_args.split(' --')[untreated_index + 1].split(' ')[-1]
        assert (untreated_r1_file is not None) or (untreated_r1_file == ''), 'R1 file not found in run_args'
        untreated = untreated_r1_file
    elif 'denatured' in run_args.split(' --'):
        denatured_index = run_args.split(' --').index('denatured')
        den_r1_file = run_args.split(' --')[denatured_index + 1].split(' ')[-1]
        assert (den_r1_file is not None) or (den_r1_file == ''), 'R1 file not found in run_args'
        denatured = den_r1_file

    # confirm sample_name matches r1_file
    # remove .fastq.gz from both if they exist
    if len(r1_file.split('/')) > 2:
        r1_file = r1_file.split('/')[-1]
        r1_file_check = r1_file.replace('...', '')
        sample_name_check = sample_name[:len(r1_file_check)]
        sample_check = (sample_name_check == r1_file_check)
        return run_datetime, run_args, version, r1_file, untreated, denatured, sample_check
    if r1_file.endswith('.fastq.gz'):
        r1_file_check = r1_file[:-9]
    elif r1_file.endswith('.fastq'):
        r1_file_check = r1_file[:-6]
    else:
        r1_file_check = r1_file
    if sample_name.endswith('.fastq.gz'):
        sample_name_check = sample_name[:-9]
    elif sample_name.endswith('.fastq'):
        sample_name_check = sample_name[:-6]
    else:
        sample_name_check = sample_name
    if r1_file.startswith('./'):
        r1_file_check = r1_file_check[2:]
    if sample_name_check.startswith('YYYR'):
        r1_file_check = r1_file_check[5:]
        sample_name_check = sample_name_check[5:]
    elif sample_name_check.startswith('etOH'):
        #DMS-150-WTII_S8_L001_R1_001 etOH-150-WTII_S16_L001_R1_001
        r1_file_check = '-'.join(r1_file_check.split('_')[0].split('-')[1:])
        sample_name_check = '-'.join(sample_name_check.split('_')[0].split('-')[1:])

    sample_check = (sample_name_check == r1_file_check)

    # override mistaken r1 file name
    if 'WT-33c-b-6' in r1_file_check:
        sample_check = True

    if sample_check == False:
        # try removing all underscores and compare again
        r1_file_check = r1_file_check.replace('_', '')
        sample_name_check = sample_name_check.replace('_', '')
        sample_check = (sample_name_check == r1_file_check)
        if sample_check == False:
            print(r1_file_check, sample_name_check)

    if run0420:
        name_index = run_args.split(' --')[1]
        assert 'name' in name_index, 'name not found in run_args'
        name_check = name_index.split(' ')[-1]
        sample_check = name_check in sample_name
        if sample_check:
            print('Rechecking on name successful')
        else:
            print(r1_file_check, sample_name_check)

    return run_datetime, run_args, version, r1_file, untreated, denatured, sample_check

def fetch_s_id(db_file, sample_name):
    """
    Fetches the ID of a sample from the sequencing_samples table.

    Parameters:
        db_file (str): Path to the database file.
        sample_name (str): Name of the sample to fetch the ID for.

    Returns:
        int: The ID of the sample.
    """
    
    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    c.execute('SELECT id FROM sequencing_samples WHERE sample_name = ?', (sample_name,))
    result = c.fetchall()  # Fetch only one row
    conn.close()

    if result is None:
        raise ValueError(f"No sample found with name: {sample_name}")
    elif len(result) > 1:
        raise ValueError(f"Multiple samples found with name: {sample_name}")
    else:
        return result[0][0]  # Extract ID from tuple

def get_max_id(db_file, table, id_col):
    """
        Fetches the maximum ID from a specified table and column.

        Parameters:
            db_file (str): Path to the database file.
            table (str): Name of the table to query.
            id_col (str): Name of the ID column to find the maximum value.

        Returns:
            int: The maximum ID value plus one, or 1 if the table is empty.
        """
    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    c.execute(f"SELECT MAX({id_col}) FROM {table}")
    max_id = c.fetchone()[0]
    return max_id + 1 if max_id else 1

def fetch_construct_seq(db_file, s_id):
    """
    Fetches the construct sequence for a given sample ID.

    Parameters:
        db_file (str): Path to the database file.
        s_id (int): Sample ID to fetch the construct sequence for.

    Returns:
        str: The construct sequence with T's converted to U's.
    """

    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    c.execute('SELECT construct_id FROM probing_reactions WHERE s_id = ?', (s_id,))
    construct_id = c.fetchone()[0]
    c.execute('SELECT sequence FROM constructs WHERE id = ?', (construct_id,))
    construct_seq = c.fetchone()[0]
    dict_convertTU = {'T': 'U', 't': 'u'}
    construct_seq = ''.join([dict_convertTU.get(base, base) for base in construct_seq])
    conn.close()
    return construct_seq

def fetch_rxn_id(db_file, s_id):
    """
    Fetches the reaction ID for a given sample ID.

    Parameters:
        db_file (str): Path to the database file.
        s_id (int): Sample ID to fetch the reaction ID for.

    Returns:
        int: The reaction ID.
    """

    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    c.execute('SELECT id, treated FROM probing_reactions WHERE s_id = ?', (s_id,))
    result = c.fetchone()
    conn.close()
    
    rxn_id = result[0]
    treated = result[1]
    return rxn_id, treated

def fetch_nt_ids(db_file, s_id):
    """
    Fetches the nucleotide IDs and sequence for a given sample ID.

    Parameters:
        db_file (str): Path to the database file.
        s_id (int): Sample ID to fetch the nucleotide IDs and sequence for.

    Returns:
        tuple: A tuple containing a list of nucleotide IDs and the nucleotide sequence with T's converted to U's.
    """
    
    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    c.execute('SELECT construct_id FROM probing_reactions WHERE s_id = ?', (s_id,))
    construct_id = c.fetchone()[0]
    c.execute('SELECT id, base FROM nucleotides WHERE construct_id = ?', (construct_id,))
    selected_nts = sorted(c.fetchall())
    conn.close()

    nt_ids = [nt[0] for nt in selected_nts]
    nt_seq = ''.join([nt[1] for nt in selected_nts])
    dict_convertTU = {'T': 'U', 't': 'u'}
    nt_seq = ''.join([dict_convertTU.get(base, base) for base in nt_seq])
    return nt_ids, nt_seq


In [226]:
## Construct fmod_calc_run entry

def construct_fmod_calc_run(sample_name, fmod_dir, db_file, run0420 = None):

    run = glob.glob(f'/projects/b1044/Computational_Output/EKC/{fmod_dir}/*shapemapper_log*')[0]
    run_datetime, run_args, version, r1_file, untreated, denatured, sample_check = extract_info_from_log(run, sample_name, run0420)
    s_id = fetch_s_id(db_file, sample_name)

    # get potential fmod_calc id but do not add until fmod vals are good

    # Tentative fmod_calc id (pending fmod_vals check)
    fmod_calc_id = get_max_id(db_file, 'fmod_calc_runs', 'id')

    profile_txt = glob.glob(f'/projects/b1044/Computational_Output/EKC/{fmod_dir}/**/*_profile.txt', recursive=True)
    # exclude shapemapper_temp
    profile_txt = [x for x in profile_txt if 'shapemapper_temp' not in x]
    # choose profile with "reanalyzed"
    if len(profile_txt) > 1:
        profile_txt = [x for x in profile_txt if 'reanalyzed' in x]
        #print(fmod_dir, profile_txt)
    assert len(profile_txt) == 1, 'Multiple or no profile.txt files found'
    profile_txt = profile_txt[0]
    
    
    # process GAmodrate
    profile_txtga = glob.glob(f'/projects/b1044/Computational_Output/EKC/{fmod_dir}/**/*_profile.txtga', recursive=True)

    # exclude shapemapper_temp
    profile_txtga = [x for x in profile_txtga if 'shapemapper_temp' not in x]
    # choose profile with "reanalyzed"
    if len(profile_txtga) > 1:
        profile_txtga = [x for x in profile_txtga if 'reanalyzed' in x]
        #print(fmod_dir, profile_txt)
    elif len(profile_txtga) == 0:
        profile_txtga = None
    else:
        profile_txtga = profile_txtga[0]

    # handle untreated or denatured
    rxn_id, rxn_treated = fetch_rxn_id(db_file, s_id)

    use_untreated_calc = False

    if (untreated != 0) & (rxn_treated == 0):
        r1_file = untreated
        use_untreated_calc = True
    elif (denatured != 0) & (rxn_treated == 1):
        r1_file = denatured

    return run_datetime, run_args, version, use_untreated_calc, r1_file, sample_check, s_id, fmod_calc_id, profile_txt, profile_txtga

def construct_fmod_vals(profile_txt, db_file, s_id, fmod_calc_id, use_untreated_calc):
    # read the csv file
    df = pd.read_csv(profile_txt, sep='\t')
    seq_from_profile = ''.join(df['Sequence'].values)

    construct_seq = fetch_construct_seq(db_file, s_id)
    assert construct_seq.upper() == seq_from_profile.upper(), 'Construct sequence does not match profile.txt sequence'

    nt_ids, nt_seq = fetch_nt_ids(db_file, s_id)
    assert nt_seq.upper() == seq_from_profile.upper(), 'Nt sequence does not match profile.txt sequence'

    rxn_id, rxn_treated = fetch_rxn_id(db_file, s_id)

    if use_untreated_calc:
        #print('using untreated')
        fmod_vals = df['Untreated_rate'].values
        read_depths = df['Untreated_read_depth'].values
    else:
        fmod_vals = df['Modified_rate'].values
        read_depths = df['Modified_read_depth'].values

    fmod_vals_df = pd.DataFrame({'nt_id': nt_ids, 'fmod_calc_run_id': fmod_calc_id, 'fmod_val': fmod_vals, 'valtype': 'modrate', 'read_depth': read_depths, 'rxn_id': rxn_id})
    return fmod_vals_df

In [227]:
samples = pd.read_csv('/projects/b1044/Computational_Output/EKC/EKC.01_SHAPE_standardization/EKC.01.060.developing_DB_input/samples_import.csv')

skipped = []

i = 112
# check if sample_name column has not repeats
fmod_dir = samples[samples['RT'] == 'MRT']['fmod_runs'].values[i]
sample_name = samples[samples['RT'] == 'MRT']['sample_name'].values[i]

i = 5
# check if sample_name column has not repeats
fmod_dir = samples[samples['sequencing_run'] == 23]['fmod_runs'].values[i]
sample_name = samples[samples['sequencing_run'] == 23]['sample_name'].values[i]

i = 5
# check if sample_name column has not repeats
fmod_dir = samples[samples['RT'] == 'MRTpH9']['fmod_runs'].values[i]
sample_name = samples[samples['RT'] == 'MRTpH9']['sample_name'].values[i]


i = 5
# check if sample_name column has not repeats
fmod_dir = samples[samples['done_by'] == 'RB']['fmod_runs'].values[i]
sample_name = samples[samples['done_by'] == 'RB']['sample_name'].values[i]

# get text inside single quote '
print(fmod_dir)
if "'" in fmod_dir:
    fmod_dir = fmod_dir.split("'")[1]
print(fmod_dir)
print(sample_name)
db_file = '/projects/b1044/Computational_Output/EKC/EKC.01_SHAPE_standardization/EKC.01.060.developing_DB_input/new.db'

try:
    run_datetime, run_args, version, use_untreated_calc, r1_file, sample_check, s_id, fmod_calc_id, profile_txt, profile_txtga = construct_fmod_calc_run(sample_name, fmod_dir, db_file, True)
except:
    print('Error in log file, skipping...')
if sample_check:
    fmod_vals_df = construct_fmod_vals(profile_txt, db_file, s_id, fmod_calc_id, use_untreated_calc)
    
    if profile_txtga is not None:
        fmod_vals_df_ga = construct_fmod_vals(profile_txtga, db_file, s_id, fmod_calc_id, use_untreated_calc)
        fmod_vals_df_ga['valtype'] = 'GAmodrate'
        fmod_vals_df = pd.concat([fmod_vals_df, fmod_vals_df_ga])
else:
    print('Sample name does not match R1 file, skipping...')
    skipped.append(i)

# Append fmod_calc_run fmod_vals to db
# conn = sqlite3.connect(db_file)
# c = conn.cursor()
# c.execute('INSERT INTO fmod_calc_runs (id, s_id, run_datetime, version, r1_file) VALUES (?, ?, ?, ?, ?, ?, ?, ?)', (fmod_calc_id, s_id, run_datetime, version, r1_file))
# fmod_vals_df.to_sql('fmod_vals', conn, if_exists='append', index=False)
# conn.commit()
# conn.close()

fmod_vals_df

fmod_calc_runs_2/000000000260
fmod_calc_runs_2/000000000260
DMS-40-WTII_S4_L001_R1_001.fastq.gz
DMS-40-WTII_S4_L001_R1_001 DMS-40-WTII_S4_L001_R1_001
Sample name does not match R1 file, skipping...


Unnamed: 0,nt_id,fmod_calc_run_id,fmod_val,valtype,read_depth,rxn_id
0,531,786,,modrate,58138,1101
1,532,786,,modrate,58192,1101
2,533,786,,modrate,58219,1101
3,534,786,,modrate,58228,1101
4,535,786,,modrate,58232,1101
...,...,...,...,...,...,...
207,738,786,,GAmodrate,58639,1101
208,739,786,,GAmodrate,58635,1101
209,740,786,,GAmodrate,58625,1101
210,741,786,,GAmodrate,58607,1101


In [212]:
mrt_only = samples[(samples['RT'] == 'MRT') | (samples['RT'] == 'MRTpH9')]
# drop nan values 
mrt_only = mrt_only.dropna(subset=['fmod_runs'])

for i, row in mrt_only.iterrows():
    fmod_dir = row['fmod_runs']
    sample_name = row['sample_name']

    # get text inside single quote '
    if "'" in fmod_dir:
        fmod_dir = fmod_dir.split("'")[1]
    #print(fmod_dir)
    #print(sample_name)
    if row['sequencing_run'] == 23:
        run0420 = True
    else:
        run0420 = None
    try:
        run_datetime, run_args, version, use_untreated_calc, r1_file, sample_check, s_id, fmod_calc_id, profile_txt, profile_txtga = construct_fmod_calc_run(sample_name, fmod_dir, db_file, True)
    except:
        print('---------------------------------------------')
        print(sample_name)
        traceback.print_exc()
        print('---------------------------------------------')
        continue

    if sample_check:
        fmod_vals_df = construct_fmod_vals(profile_txt, db_file, s_id, fmod_calc_id, use_untreated_calc)
        if profile_txtga is not None:
            fmod_vals_df_ga = construct_fmod_vals(profile_txtga, db_file, s_id, fmod_calc_id, use_untreated_calc)
            fmod_vals_df_ga['valtype'] = 'GAmodrate'
            fmod_vals_df = pd.concat([fmod_vals_df, fmod_vals_df_ga])
        
        # Append fmod_calc_run fmod_vals to db
        #conn = sqlite3.connect(db_file)
        #c = conn.cursor()
        #c.execute('INSERT INTO fmod_calc_runs (id, s_id, software_name, software_version, run_args, output_dir) VALUES (?, ?, ?, ?, ?, ?)', (fmod_calc_id, s_id, 'shapemapper', version, run_args, r1_file))
        #fmod_vals_df.to_sql('fmod_vals', conn, if_exists='append', index=False)
        #conn.commit()
        #conn.close()
        
        # Processed done, drop row
        mrt_only = mrt_only.drop(index=i)
    else:
        print(fmod_dir, r1_file, sample_name)
        print('Sample name does not match R1 file, skipping...')
        continue
    
mrt_only

RRRY-017-EKC-fourUnew-WT-II-37C-dms-MaP-0-30_S17_L001_R1_001 RRRY-017-EKC-fourUnew-WT-II-37C-dms-MaP-0-30_S17_L001_R1_001
017-EKC-fourUnew-WT-II-37C-dms-MaP-0-30_S17_L001_R1_001 017-EKC-fourUnew-WT-II-37C-dms-MaP-0-30_S17_L001_R1_001
RRRY-018-EKC-fourUnew-WT-II-37C-dms-MaP-1_S18_L001_R1_001 RRRY-018-EKC-fourUnew-WT-II-37C-dms-MaP-1_S18_L001_R1_001
018-EKC-fourUnew-WT-II-37C-dms-MaP-1_S18_L001_R1_001 018-EKC-fourUnew-WT-II-37C-dms-MaP-1_S18_L001_R1_001
RRRY-019-EKC-fourUnew-WT-II-37C-dms-MaP-20_S19_L001_R1_001 RRRY-019-EKC-fourUnew-WT-II-37C-dms-MaP-20_S19_L001_R1_001
019-EKC-fourUnew-WT-II-37C-dms-MaP-20_S19_L001_R1_001 019-EKC-fourUnew-WT-II-37C-dms-MaP-20_S19_L001_R1_001
RRRY-020-EKC-fourUnew-WT-II-37C-dms-MaP-40_S20_L001_R1_001 RRRY-020-EKC-fourUnew-WT-II-37C-dms-MaP-40_S20_L001_R1_001
020-EKC-fourUnew-WT-II-37C-dms-MaP-40_S20_L001_R1_001 020-EKC-fourUnew-WT-II-37C-dms-MaP-40_S20_L001_R1_001
RRRY-021-EKC-fourUnew-WT-II-37C-dms-MaP-60_S21_L001_R1_001 RRRY-021-EKC-fourUnew-WT-II-37C-d

001-EKC-fourUnew-WT-25C-dms-MaP-6_S1_L001_R1_001 001-EKC-fourUnew-WT-25C-dms-MaP-6_S1_L001_R1_001
RRRY-002-EKC-fourUnew-WT-25C-dms-MaP-25_S2_L001_R1_001 RRRY-002-EKC-fourUnew-WT-25C-dms-MaP-25_S2_L001_R1_001
002-EKC-fourUnew-WT-25C-dms-MaP-25_S2_L001_R1_001 002-EKC-fourUnew-WT-25C-dms-MaP-25_S2_L001_R1_001
RRRY-003-EKC-fourUnew-WT-25C-dms-MaP-60_S3_L001_R1_001 RRRY-003-EKC-fourUnew-WT-25C-dms-MaP-60_S3_L001_R1_001
003-EKC-fourUnew-WT-25C-dms-MaP-60_S3_L001_R1_001 003-EKC-fourUnew-WT-25C-dms-MaP-60_S3_L001_R1_001
RRRY-004-EKC-fourUnew-WT-25C-dms-MaP-120_S4_L001_R1_001 RRRY-004-EKC-fourUnew-WT-25C-dms-MaP-120_S4_L001_R1_001
004-EKC-fourUnew-WT-25C-dms-MaP-120_S4_L001_R1_001 004-EKC-fourUnew-WT-25C-dms-MaP-120_S4_L001_R1_001
RRRY-005-EKC-fourUnew-WT-25C-dms-MaP-180_S5_L001_R1_001 RRRY-005-EKC-fourUnew-WT-25C-dms-MaP-180_S5_L001_R1_001
005-EKC-fourUnew-WT-25C-dms-MaP-180_S5_L001_R1_001 005-EKC-fourUnew-WT-25C-dms-MaP-180_S5_L001_R1_001
RRRY-006-EKC-fourUnew-WT-25C-dms-MaP-240_S6_L001_R1_00

RRRY-017-EKC-fourUnew-WT-80C-dms-MaP-5S17L001R1001 015-EKC-fourUnew-WT-80C-dms-MaP-3S15L001R1001
RRRY-017-EKC-fourUnew-WT-80C-dms-MaP-5S17L001R1001 015-EKC-fourUnew-WT-80C-dms-MaP-3S15L001R1001
RRRYYYYR-013-EKC-fourUnew-WT-80C-dms-MaP-1S13L001R1001 RRRY016-EKC-fourUnew-WT-80C-dms-MaP-4S16L001R1001
RRRYYYYR-013-EKC-fourUnew-WT-80C-dms-MaP-1S13L001R1001 RRRY016-EKC-fourUnew-WT-80C-dms-MaP-4S16L001R1001
YYYR-013-EKC-fourUnew-WT-80C-dms-MaP-1S13L001R1001 016-EKC-fourUnew-WT-80C-dms-MaP-4S16L001R1001
YYYR-013-EKC-fourUnew-WT-80C-dms-MaP-1S13L001R1001 016-EKC-fourUnew-WT-80C-dms-MaP-4S16L001R1001
RRRYYYYR-015-EKC-fourUnew-WT-80C-dms-MaP-3S15L001R1001 RRRY017-EKC-fourUnew-WT-80C-dms-MaP-5S17L001R1001
RRRYYYYR-015-EKC-fourUnew-WT-80C-dms-MaP-3S15L001R1001 RRRY017-EKC-fourUnew-WT-80C-dms-MaP-5S17L001R1001
YYYR-015-EKC-fourUnew-WT-80C-dms-MaP-3S15L001R1001 017-EKC-fourUnew-WT-80C-dms-MaP-5S17L001R1001
YYYR-015-EKC-fourUnew-WT-80C-dms-MaP-3S15L001R1001 017-EKC-fourUnew-WT-80C-dms-MaP-5S17L001R100

068-EKC-WT-37c-b-2-p_S68_L001_R1_001 068-EKC-WT-37c-b-2-p_S68_L001_R1_001
069-EKC-WT-37c-b-3-p_S69_L001_R1_001 069-EKC-WT-37c-b-3-p_S69_L001_R1_001
070-EKC-WT-37c-b-4-p_S70_L001_R1_001 070-EKC-WT-37c-b-4-p_S70_L001_R1_001
071-EKC-WT-37c-b-5-p_S71_L001_R1_001 071-EKC-WT-37c-b-5-p_S71_L001_R1_001
072-EKC-WT-37c-b-6-p_S72_L001_R1_001 072-EKC-WT-37c-b-6-p_S72_L001_R1_001
073-EKC-WT-45c-b-1-p_S73_L001_R1_001 073-EKC-WT-45c-b-1-p_S73_L001_R1_001
074-EKC-WT-45c-b-2-p_S74_L001_R1_001 074-EKC-WT-45c-b-2-p_S74_L001_R1_001
075-EKC-WT-45c-b-3-p_S75_L001_R1_001 075-EKC-WT-45c-b-3-p_S75_L001_R1_001
076-EKC-WT-45c-b-4-p_S76_L001_R1_001 076-EKC-WT-45c-b-4-p_S76_L001_R1_001
077-EKC-WT-45c-b-5-p_S77_L001_R1_001 077-EKC-WT-45c-b-5-p_S77_L001_R1_001
078-EKC-WT-45c-b-6-p_S78_L001_R1_001 078-EKC-WT-45c-b-6-p_S78_L001_R1_001
079-EKC-WT-50c-a-1-p_S79_L001_R1_001 079-EKC-WT-50c-a-1-p_S79_L001_R1_001
080-EKC-WT-50c-a-2-p_S80_L001_R1_001 080-EKC-WT-50c-a-2-p_S80_L001_R1_001
081-EKC-WT-50c-a-3-p_S81_L001_R1_001 0

092-EKC-A8C-37c-b-6-m_S92_L001_R1_001 092-EKC-A8C-37c-b-6-m_S92_L001_R1_001
093-EKC-A8C-45c-b-6-m_S93_L001_R1_001 093-EKC-A8C-45c-b-6-m_S93_L001_R1_001
094-EKC-A8C-50c-a-6-m_S94_L001_R1_001 094-EKC-A8C-50c-a-6-m_S94_L001_R1_001
095-EKC-A8C-50c-b-6-m_S95_L001_R1_001 095-EKC-A8C-50c-b-6-m_S95_L001_R1_001
096-EKC-A8C-60c-a-6-m_S96_L001_R1_001 096-EKC-A8C-60c-a-6-m_S96_L001_R1_001
001-fourU-WT-65c-a-1-p_S1_L001_R1_001 001-fourU-WT-65c-a-1-p_S1_L001_R1_001
fmod_calc_runs_2/000000000821 ./001-fourU-WT-65c-a-1-p_S1_L001_R1_001.fastq.gz 001-fourU-WT-65c-a-1-p_S1_L001_R1_001.fastq.gz
Sample name does not match R1 file, skipping...
002-fourU-WT-65c-a-2-p_S2_L001_R1_001 002-fourU-WT-65c-a-2-p_S2_L001_R1_001
fmod_calc_runs_2/000000000832 ./002-fourU-WT-65c-a-2-p_S2_L001_R1_001.fastq.gz 002-fourU-WT-65c-a-2-p_S2_L001_R1_001.fastq.gz
Sample name does not match R1 file, skipping...
003-fourU-WT-65c-a-3-p_S3_L001_R1_001 003-fourU-WT-65c-a-3-p_S3_L001_R1_001
fmod_calc_runs_2/000000000843 ./003-fourU-WT

fmod_calc_runs_2/000000000865 ./049-fourU-WT-10c-a-1-p_S49_L001_R1_001.fastq.gz 049-fourU-WT-10c-a-1-p_S49_L001_R1_001.fastq.gz
Sample name does not match R1 file, skipping...
050-fourU-WT-10c-a-2-p_S50_L001_R1_001 050-fourU-WT-10c-a-2-p_S50_L001_R1_001
fmod_calc_runs_2/000000000866 ./050-fourU-WT-10c-a-2-p_S50_L001_R1_001.fastq.gz 050-fourU-WT-10c-a-2-p_S50_L001_R1_001.fastq.gz
Sample name does not match R1 file, skipping...
051-fourU-WT-10c-a-3-p_S51_L001_R1_001 051-fourU-WT-10c-a-3-p_S51_L001_R1_001
fmod_calc_runs_2/000000000867 ./051-fourU-WT-10c-a-3-p_S51_L001_R1_001.fastq.gz 051-fourU-WT-10c-a-3-p_S51_L001_R1_001.fastq.gz
Sample name does not match R1 file, skipping...
052-fourU-WT-10c-a-4-p_S52_L001_R1_001 052-fourU-WT-10c-a-4-p_S52_L001_R1_001
fmod_calc_runs_2/000000000868 ./052-fourU-WT-10c-a-4-p_S52_L001_R1_001.fastq.gz 052-fourU-WT-10c-a-4-p_S52_L001_R1_001.fastq.gz
Sample name does not match R1 file, skipping...
053-fourU-WT-10c-a-5-p_S53_L001_R1_001 053-fourU-WT-10c-a-5-p_

008_fourU_A8C_40c_a_2_p_S8_R1_001 008_fourU_A8C_40c_a_2_p_S8_R1_001
009_fourU_A8C_40c_a_3_p_S9_R1_001 009_fourU_A8C_40c_a_3_p_S9_R1_001
010_fourU_A8C_40c_a_4_p_S10_R1_001 010_fourU_A8C_40c_a_4_p_S10_R1_001
011_fourU_A8C_40c_a_5_p_S11_R1_001 011_fourU_A8C_40c_a_5_p_S11_R1_001
012_fourU_A8C_40c_a_6_p_S12_R1_001 012_fourU_A8C_40c_a_6_p_S12_R1_001
013_fourU_A8C_15c_b_1_p_S13_R1_001 013_fourU_A8C_15c_b_1_p_S13_R1_001
014_fourU_A8C_15c_b_2_p_S14_R1_001 014_fourU_A8C_15c_b_2_p_S14_R1_001
015_fourU_A8C_15c_b_3_p_S15_R1_001 015_fourU_A8C_15c_b_3_p_S15_R1_001
016_fourU_A8C_15c_b_4_p_S16_R1_001 016_fourU_A8C_15c_b_4_p_S16_R1_001
017_fourU_A8C_15c_b_5_p_S17_R1_001 017_fourU_A8C_15c_b_5_p_S17_R1_001
018_fourU_A8C_15c_b_6_p_S18_R1_001 018_fourU_A8C_15c_b_6_p_S18_R1_001
019_fourU_A8C_40c_b_1_p_S19_R1_001 019_fourU_A8C_40c_b_1_p_S19_R1_001
020_fourU_A8C_40c_b_2_p_S20_R1_001 020_fourU_A8C_40c_b_2_p_S20_R1_001
021_fourU_A8C_40c_b_3_p_S21_R1_001 021_fourU_A8C_40c_b_3_p_S21_R1_001
022_fourU_A8C_40c_b_4_p_

059fourUWT10cb6mS155R1001 059fourUWT10cb6mS155R1001
fmod_calc_runs_2/000000001067 ./059_fourU_WT_10c_b_6_m_S155_R1_001.fastq.gz 059_fourU_WT10c_b_6_m_S155_R1_001.fastq.gz
Sample name does not match R1 file, skipping...
060fourUWT65cb6mS156R1001 060fourUWT65cb6mS156R1001
fmod_calc_runs_2/000000001068 ./060_fourU_WT_65c_b_6_m_S156_R1_001.fastq.gz 060_fourU_WT65c_b_6_m_S156_R1_001.fastq.gz
Sample name does not match R1 file, skipping...
061fourUA8C20ca2pS157R1001 061fourUA8C20ca2pS157R1001
062fourUWT20ca6mS158R1001 062fourUWT20ca6mS158R1001
fmod_calc_runs_2/000000001070 ./062_fourU_WT_20c_a_6_m_S158_R1_001.fastq.gz 062_fourU_WT20c_a_6_m_S158_R1_001.fastq.gz
Sample name does not match R1 file, skipping...
063fourUA8C42ca6mS159R1001 063fourUA8C42ca6mS159R1001
064fourUA8C20ca6mS160R1001 064fourUA8C20ca6mS160R1001
065fourUWT15ca6mS161R1001 065fourUWT15ca6mS161R1001
fmod_calc_runs_2/000000001073 ./065_fourU_WT_15c_a_6_m_S161_R1_001.fastq.gz 065_fourU_WT15c_a_6_m_S161_R1_001.fastq.gz
Sample nam

063-HIV-UUCG-GS-70c-a-3-p_S63_L001_R1_001 063-HIV-UUCG-GS-70c-a-3-p_S63_L001_R1_001
064-HIV-UUCG-GS-70c-a-4-p_S64_L001_R1_001 064-HIV-UUCG-GS-70c-a-4-p_S64_L001_R1_001
065-HIV-UUCG-GS-70c-a-5-p_S65_L001_R1_001 065-HIV-UUCG-GS-70c-a-5-p_S65_L001_R1_001
066-HIV-UUCG-GS-70c-a-6-p_S66_L001_R1_001 066-HIV-UUCG-GS-70c-a-6-p_S66_L001_R1_001
067-HIV-UUCG-GS-70c-b-1-p_S67_L001_R1_001 067-HIV-UUCG-GS-70c-b-1-p_S67_L001_R1_001
068-HIV-UUCG-GS-70c-b-2-p_S68_L001_R1_001 068-HIV-UUCG-GS-70c-b-2-p_S68_L001_R1_001
069-HIV-UUCG-GS-70c-b-3-p_S69_L001_R1_001 069-HIV-UUCG-GS-70c-b-3-p_S69_L001_R1_001
070-HIV-UUCG-GS-70c-b-4-p_S70_L001_R1_001 070-HIV-UUCG-GS-70c-b-4-p_S70_L001_R1_001
071-HIV-UUCG-GS-70c-b-5-p_S71_L001_R1_001 071-HIV-UUCG-GS-70c-b-5-p_S71_L001_R1_001
072-HIV-UUCG-GS-70c-b-6-p_S72_L001_R1_001 072-HIV-UUCG-GS-70c-b-6-p_S72_L001_R1_001
073-HIV-UUCG-GS-75c-a-1-p_S73_L001_R1_001 073-HIV-UUCG-GS-75c-a-1-p_S73_L001_R1_001
074-HIV-UUCG-GS-75c-a-2-p_S74_L001_R1_001 074-HIV-UUCG-GS-75c-a-2-p_S74_L001

087-HIV-A35G-80c-a-3-p_S87_L001_R1_001 087-HIV-A35G-80c-a-3-p_S87_L001_R1_001
088-HIV-A35G-80c-a-4-p_S88_L001_R1_001 088-HIV-A35G-80c-a-4-p_S88_L001_R1_001
089-HIV-A35G-80c-a-5-p_S89_L001_R1_001 089-HIV-A35G-80c-a-5-p_S89_L001_R1_001
090-HIV-A35G-80c-a-6-p_S90_L001_R1_001 090-HIV-A35G-80c-a-6-p_S90_L001_R1_001
091-HIV-A35G-80c-b-6-p_S91_L001_R1_001 091-HIV-A35G-80c-b-6-p_S91_L001_R1_001
092-HIV-A35G-80c-b-6-p_S92_L001_R1_001 092-HIV-A35G-80c-b-6-p_S92_L001_R1_001
093-HIV-A35G-80c-b-6-p_S93_L001_R1_001 093-HIV-A35G-80c-b-6-p_S93_L001_R1_001
094-HIV-A35G-80c-b-6-p_S94_L001_R1_001 094-HIV-A35G-80c-b-6-p_S94_L001_R1_001
095-HIV-A35G-80c-b-6-p_S95_L001_R1_001 095-HIV-A35G-80c-b-6-p_S95_L001_R1_001
096-HIV-A35G-80c-b-6-p_S96_L001_R1_001 096-HIV-A35G-80c-b-6-p_S96_L001_R1_001
001-HIV-C30U-25c-a-1-p_S1_L001_R1_001 001-HIV-C30U-25c-a-1-p_S1_L001_R1_001
002-HIV-C30U-25c-a-2-p_S2_L001_R1_001 002-HIV-C30U-25c-a-2-p_S2_L001_R1_001
003-HIV-C30U-25c-a-3-p_S3_L001_R1_001 003-HIV-C30U-25c-a-3-p_S3_L001

Unnamed: 0,rxn_group,temperature,replicate,reaction_time,probe_concentration,probe,buffer,construct,RT,done_by,treated,sequencing_run,sample_name,fq_dir,fmod_runs
105,37_2,37,2,60,0.015853,dms,6,25,MRT,RB,1,17,DMS-1-WTII_S2_L001_R1_001.fastq.gz,/projects/b1044/Sequencing/SHAPE-Seq/EKC/22120...,fmod_calc_runs_2/000000000258
106,37_2,37,2,7200,0.015853,dms,6,25,MRT,RB,1,17,DMS-120-WTII_S7_L001_R1_001.fastq.gz,/projects/b1044/Sequencing/SHAPE-Seq/EKC/22120...,fmod_calc_runs_2/000000000263
107,37_2,37,2,9000,0.015853,dms,6,25,MRT,RB,1,17,DMS-150-WTII_S8_L001_R1_001.fastq.gz,/projects/b1044/Sequencing/SHAPE-Seq/EKC/22120...,fmod_calc_runs_2/000000000264
108,37_2,37,2,1200,0.015853,dms,6,25,MRT,RB,1,17,DMS-20-WTII_S3_L001_R1_001.fastq.gz,/projects/b1044/Sequencing/SHAPE-Seq/EKC/22120...,fmod_calc_runs_2/000000000259
109,37_2,37,2,2400,0.015853,dms,6,25,MRT,RB,1,17,DMS-40-WTII_S4_L001_R1_001.fastq.gz,/projects/b1044/Sequencing/SHAPE-Seq/EKC/22120...,fmod_calc_runs_2/000000000260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,33_1,33,1,11100,0.000000,etoh,6,25,MRT,EKC,0,33,089_fourU_WT33c_a_6_m_S185_R1_001.fastq.gz,/projects/b1044/Sequencing/SHAPE-Seq/EKC/24042...,fmod_calc_runs_2/000000001097
772,80_3,80,3,80,0.000000,etoh,6,25,MRT,EKC,0,33,090_fourU_WT80c_a_6_m_S186_R1_001.fastq.gz,/projects/b1044/Sequencing/SHAPE-Seq/EKC/24042...,fmod_calc_runs_2/000000001098
774,37_5,37,5,7080,0.000000,etoh,6,25,MRT,EKC,0,33,092_fourU_WT37c_b_6_m_S188_R1_001.fastq.gz,/projects/b1044/Sequencing/SHAPE-Seq/EKC/24042...,fmod_calc_runs_2/000000001100
777,33_2,33,2,11100,0.000000,etoh,6,25,MRT,EKC,0,33,095_fourU_WT33c_b_6_m_S191_R1_001.fastq.gz,/projects/b1044/Sequencing/SHAPE-Seq/EKC/24042...,fmod_calc_runs_2/000000001103


In [202]:
def extract_info_from_spats(log_file, config_file, sample_name):
    """
    Extracts information from a spats directory.

    Parameters:
        log_file (str): Path to the ShapeMapper log file.
        sample_name (str): Name of the sample to check against the R1 file.

    Returns:
        tuple: Contains the following elements:
            - run_datetime (str): The datetime when the ShapeMapper run started.
            - version (str): The version of ShapeMapper used.
            - r1_file (str): The R1 file used in the run.
            - untreated (int): Indicates if the sample was untreated (1 if untreated, 0 otherwise).
            - denatured (int): Indicates if the sample was denatured (1 if denatured, 0 otherwise).
            - sample_check (bool): Indicates if the sample name matches the R1 file.
    """

    with open(log_file) as f:
        log_lines = f.readlines()
    
    with open(config_file) as f:
        config_lines = f.readlines()

    # find all lines containing " : run " and get index of most recent one
    detect_spats_runs = [i for i, line in enumerate(log_lines) if ' : run' in line]
    assert len(detect_spats_runs) > 0, 'No spats runs detected in log file'

    most_recent_run = detect_spats_runs[-1]
    log_lines = log_lines[most_recent_run:]

    # check spats success
    # TODO

    #print(log_file)
    # extract date from:  "2022/11/26 11:56 : run, 172.31s"
    run_datetime = log_lines[0].split(' : run')[0]
    version = 'v2.0.5'
    
    # extract run args
    run_args = [i for i, line in enumerate(log_lines) if not line.startswith('#')]
    run_args = '\n'.join(run_args)
    
    # extract R1 file
    r1_lines = [line for line in config_lines if 'r1' in line]
    assert (len(r1_lines) == 1), 'R1 line not found in config file'

    r1_file = r1_lines[0].split('=')[-1]
    r1_file = r1_file.replace(' ', '').rstrip()
    assert (r1_file is not None) or (r1_file == ''), 'R1 file not found in run_args'
    
    # confirm sample_name matches r1_file
    if r1_file.endswith('.fastq.gz'):
        r1_file_check = r1_file[:-9]
    elif r1_file.endswith('.fastq'):
        r1_file_check = r1_file[:-6]
    else:
        r1_file_check = r1_file
    if sample_name.endswith('.fastq.gz'):
        sample_name_check = sample_name[:-9]
    elif sample_name.endswith('.fastq'):
        sample_name_check = sample_name[:-6]
    else:
        sample_name_check = sample_name
    if r1_file.startswith('./'):
        r1_file_check = r1_file_check[2:]

    sample_check = (sample_name_check == r1_file_check)
    
    return run_datetime, version, r1_file, sample_check

In [203]:
def construct_fmod_calc_run_spats(sample_name, fmod_dir, db_file):

    
    log_file = glob.glob(f'/projects/b1044/Computational_Output/EKC/{fmod_dir}/**/spats.log', recursive = True)[0]
    config_file = glob.glob(f'/projects/b1044/Computational_Output/EKC/{fmod_dir}/**/spats.config', recursive = True)[0]
    run_datetime, version, r1_file, sample_check = extract_info_from_spats(log_file, config_file, sample_name)
    s_id = fetch_s_id(db_file, sample_name)

    # get potential fmod_calc id but do not add until fmod vals are good

    # Tentative fmod_calc id (pending fmod_vals check)
    fmod_calc_id = get_max_id(db_file, 'fmod_calc_runs', 'id')

    profile_txt = glob.glob(f'/projects/b1044/Computational_Output/EKC/{fmod_dir}/**/*.csv', recursive=True)
    # exclude reads.csv
    profile_txt = [x for x in profile_txt if 'reads' not in x]
    
    # choose profile with "reanalyzed"
    if len(profile_txt) > 1:
        profile_txt = [x for x in profile_txt if 'reanalyzed' in x]
        #print(fmod_dir, profile_txt)
    assert len(profile_txt) == 1, 'Multiple or no profile.txt files found'
    profile_txt = profile_txt[0]

    return run_datetime, version, r1_file, sample_check, s_id, fmod_calc_id, profile_txt


def construct_fmod_vals_spats(profile_txt, db_file, s_id, fmod_calc_id):
    # read the csv file
    df = pd.read_csv(profile_txt)
    read_depth = df.iloc[0, 3]#['f+']
    df = df.iloc[1:, :]

    seq_from_profile = ''.join(df['nt'].values)
    dict_convertTU = {'T': 'U', 't': 'u'}
    seq_from_profile = ''.join([dict_convertTU.get(base, base) for base in seq_from_profile])
    
    construct_seq = fetch_construct_seq(db_file, s_id)
    assert construct_seq.upper() == seq_from_profile.upper(), 'Construct sequence does not match spats out.csv sequence'

    nt_ids, nt_seq = fetch_nt_ids(db_file, s_id)
    assert nt_seq.upper() == seq_from_profile.upper(), 'Nt sequence does not match profile.txt sequence'

    rxn_id, rxn_treated = fetch_rxn_id(db_file, s_id)

    fmod_vals_df = pd.DataFrame({'nt_id': nt_ids, 'fmod_calc_run_id': fmod_calc_id, 'fmod_val': df['beta'], 'valtype': 'beta', 'read_depth': read_depth, 'rxn_id': rxn_id})
    return fmod_vals_df

In [204]:
samples = pd.read_csv('/projects/b1044/Computational_Output/EKC/EKC.01_SHAPE_standardization/EKC.01.060.developing_DB_input/samples_import.csv')

i = 5
# check if sample_name column has not repeats
fmod_dir = samples[samples['RT'] == 'SSIII']['fmod_runs'].values[i]
sample_name = samples[samples['RT'] == 'SSIII']['sample_name'].values[i]

# get text inside single quote '
if "'" in fmod_dir:
    fmod_dir = fmod_dir.split("'")[1]
print(fmod_dir)
print(sample_name)
db_file = '/projects/b1044/Computational_Output/EKC/EKC.01_SHAPE_standardization/EKC.01.060.developing_DB_input/new.db'
try:
    run_datetime, version, r1_file, sample_check, s_id, fmod_calc_id, profile_txt = construct_fmod_calc_run_spats(sample_name, fmod_dir, db_file)
except:
     print('Error in log file, skipping...')
if sample_check:
     fmod_vals_df = construct_fmod_vals_spats(profile_txt, db_file, s_id, fmod_calc_id)
else:
     print('Sample name does not match R1 file, skipping...')

fmod_calc_runs_2/000000000189
006-EKC-fourUnew-WT-II-37C-dms-stop-90_S6_L001_R1_001.fastq.gz
Error in log file, skipping...


IndexError: index 3 is out of bounds for axis 0 with size 1

In [118]:
ssiii_only = samples[~((samples['RT'] == 'MRT') | (samples['RT'] == 'MRTpH9'))]
# drop nan values 
ssiii_only = ssiii_only.dropna(subset=['fmod_runs'])

for i, row in ssiii_only.iterrows():
    fmod_dir = row['fmod_runs']
    sample_name = row['sample_name']

    # get text inside single quote '
    if "'" in fmod_dir:
        fmod_dir = fmod_dir.split("'")[1]
    #print(fmod_dir)
    #print(sample_name)
    
    try:
        run_datetime, version, r1_file, sample_check, s_id, fmod_calc_id, profile_txt = construct_fmod_calc_run_spats(sample_name, fmod_dir, db_file)
    except:
        print('---------------------------------------------')
        print(sample_name)
        traceback.print_exc()
        print('---------------------------------------------')
        continue

    if sample_check:
        fmod_vals_df = construct_fmod_vals_spats(profile_txt, db_file, s_id, fmod_calc_id)
    else:
        print(fmod_dir, r1_file, sample_name)
        print('Sample name does not match R1 file, skipping...')
        continue
    
    # simulate processed by dropping row
    ssiii_only = ssiii_only.drop(index=i)
    
ssiii_only

Unnamed: 0,rxn_group,temperature,replicate,reaction_time,probe_concentration,probe,buffer,construct,RT,done_by,treated,sequencing_run,sample_name,fq_dir,fmod_runs


In [None]:
import subprocess

# mv from src to dest

def move_directory_unix(src_dir, dest_dir):
    # mv automatically handles directories and their contents without the need for -r
    subprocess.run(['mv', src_dir, dest_dir], check=True)
    print(f"Moved {src_dir} to {dest_dir} using mv command")

for i, row in df.iterrows():
    move_directory_unix(row['src_dir'], row['dest_dir'])