In [180]:
import pandas as pd
from pathlib import Path
import sys
import os
import numpy as np
import re

sys.path.append(os.path.abspath('../src'))
project_dir = project_dir = Path.cwd().parent

In [181]:
num_serotypes = 4
denv_df = pd.DataFrame({
        "index": np.array([], dtype=int),
        'serotype': [],
        "EPI": []
    })


for s in range(1,num_serotypes + 1):
    denv_raw = pd.read_csv(project_dir / "data" /  "raw" / f"DENV-{s}.csv", delimiter=";")
    EPI = denv_raw['seqName'].str.split('|', expand=True) [1]

    new_rows = pd.DataFrame({
        "index": denv_raw['index'],
        'serotype': f"DENV-{s}",
        "EPI": EPI
    })
    denv_df = pd.concat([denv_df, new_rows])

def extract_serotype(s):
    match = re.search(r'DenV(\d)', s, re.IGNORECASE)
    if match:
        return f"DENV-{match.group(1)}"
    else:
        return None  # or some default


if os.path.exists(project_dir / "data" /  "raw" /"unclassified.tsv"):
    unclassified_raw = pd.read_csv(project_dir / "data" /  "raw" /"unclassified.tsv", sep='\t')
    EPI = unclassified_raw['seqName'].str.split('|', expand=True)[1]
    seq = unclassified_raw['seqName'].str.split('|', expand=True)[0]
    seq = seq.str.split('/', expand=True)[0]
    serotypes = np.array([extract_serotype(x) for x in seq])
    new_rows = pd.DataFrame({
        "index": unclassified_raw['index'],
        'serotype': serotypes,
        "EPI": EPI
    })
    denv_df = pd.concat([denv_df, new_rows])

assert len(set(list(range(0, len(denv_df)))) - set(denv_df['index'])) == 0, f"Unaccounted for samples"
denv_df = denv_df.set_index('index').sort_index()
denv_dates = pd.read_csv(project_dir / "data" /  "raw" /"DENV_dates.tsv", sep='\t')
denv_df = denv_df.merge(denv_dates, left_on='EPI', right_on='Accession ID', how='left')
denv_df.drop(columns='Accession ID')



Unnamed: 0,serotype,EPI,Collection date,Submission date,Location
0,DENV-2,EPI_ISL_11579708,2019-05-28,2022-03-31,South America / Brazil
1,DENV-2,EPI_ISL_11579724,2019-05-20,2022-03-31,South America / Brazil
2,DENV-2,EPI_ISL_11579725,2019-05-09,2022-03-31,South America / Brazil
3,DENV-2,EPI_ISL_11579726,2019-05-09,2022-03-31,South America / Brazil
4,DENV-2,EPI_ISL_11579754,2019-05-08,2022-03-31,South America / Brazil
...,...,...,...,...,...
1378,DENV-2,EPI_ISL_808255,2019-05-10,2020-08-26,South America / Brazil / Goias / Goiania
1379,DENV-2,EPI_ISL_808256,2019-04-29,2020-08-26,South America / Brazil / Goias / Goiania
1380,DENV-2,EPI_ISL_808257,2019-04-25,2020-08-26,South America / Brazil / Goias / Goiania
1381,DENV-1,EPI_ISL_808258,2019-04-09,2020-08-26,South America / Brazil / Goias / Goiania


In [190]:
# Note some dates just contain years
def fix_date(val):
    if isinstance(val, str):
        val = val.strip()

        # Year only: "2018"
        if re.fullmatch(r"\d{4}", val):
            return f"{val}-01-01"
        
        # Year-Month: "2018-07" or "2018/07"
        if re.fullmatch(r"\d{4}[-/]\d{1,2}", val):
            parts = re.split("[-/]", val)
            year, month = parts
            return f"{year}-{int(month):02d}-01"
        
        # Month-Year: "07/2018" or "07-2018"
        if re.fullmatch(r"\d{1,2}[-/]\d{4}", val):
            parts = re.split("[-/]", val)
            month, year = parts
            return f"{year}-{int(month):02d}-01"
        
    return val  # Leave as-is if not matched



denv_df['Collection date'] = denv_df['Collection date'].apply(fix_date)
denv_df['Submission date'] = denv_df['Submission date'].apply(fix_date)

denv_df['Collection date'] = pd.to_datetime(denv_df['Collection date'])
denv_df['Submission date'] = pd.to_datetime(denv_df['Submission date'])
denv_df['Delay (months)'] = ((denv_df['Submission date'] - denv_df['Collection date']).dt.days / 30.44).round().astype('Int64')

denv_df



Unnamed: 0,serotype,EPI,Accession ID,Collection date,Submission date,Location,Delay (months)
0,DENV-2,EPI_ISL_11579708,EPI_ISL_11579708,2019-05-28,2022-03-31,South America / Brazil,34
1,DENV-2,EPI_ISL_11579724,EPI_ISL_11579724,2019-05-20,2022-03-31,South America / Brazil,34
2,DENV-2,EPI_ISL_11579725,EPI_ISL_11579725,2019-05-09,2022-03-31,South America / Brazil,35
3,DENV-2,EPI_ISL_11579726,EPI_ISL_11579726,2019-05-09,2022-03-31,South America / Brazil,35
4,DENV-2,EPI_ISL_11579754,EPI_ISL_11579754,2019-05-08,2022-03-31,South America / Brazil,35
...,...,...,...,...,...,...,...
1378,DENV-2,EPI_ISL_808255,EPI_ISL_808255,2019-05-10,2020-08-26,South America / Brazil / Goias / Goiania,16
1379,DENV-2,EPI_ISL_808256,EPI_ISL_808256,2019-04-29,2020-08-26,South America / Brazil / Goias / Goiania,16
1380,DENV-2,EPI_ISL_808257,EPI_ISL_808257,2019-04-25,2020-08-26,South America / Brazil / Goias / Goiania,16
1381,DENV-1,EPI_ISL_808258,EPI_ISL_808258,2019-04-09,2020-08-26,South America / Brazil / Goias / Goiania,17
