In [1]:
import pandas as pd
import numpy as np

In [2]:
def random_undersample(data_df, max_samples_per_class=100, random_state=42, class_column='Lineage'):
    """Cap every class at ``max_samples_per_class`` rows by random sampling.

    Classes with more rows than the cap are randomly down-sampled (without
    replacement) to exactly ``max_samples_per_class`` rows; smaller classes
    are kept in full. The combined result is then shuffled and given a fresh
    0..n-1 index.

    Parameters
    ----------
    data_df : pandas.DataFrame or anything accepted by ``pd.DataFrame``
        Rows to undersample. Must contain ``class_column``.
    max_samples_per_class : int, default 100
        Maximum number of rows retained for any single class.
    random_state : int, default 42
        Seed used both for the per-class sampling and for the final shuffle.
    class_column : str, default 'Lineage'
        Name of the column holding the class label.

    Returns
    -------
    pandas.DataFrame
        Undersampled, shuffled rows with a reset index. Rows whose class label
        is missing are not part of any group and are therefore dropped.

    Raises
    ------
    KeyError
        If ``class_column`` is not a column of ``data_df``.
    """
    # Use the frame that was passed in. This line previously read the
    # notebook-level variable `data`, which silently ignored the argument and
    # resampled whatever that global happened to hold at call time.
    data_df = pd.DataFrame(data_df)

    capped_groups = [
        group.sample(n=max_samples_per_class, random_state=random_state)
        if len(group) > max_samples_per_class
        else group
        for _, group in data_df.groupby(class_column)
    ]

    # pd.concat raises on an empty list, so return an empty frame with the
    # same columns when there is nothing to sample from.
    if not capped_groups:
        return data_df.iloc[0:0].reset_index(drop=True)

    undersampled_data_df = pd.concat(capped_groups)
    # Shuffle so the rows are no longer ordered by class, then renumber.
    undersampled_data_df = undersampled_data_df.sample(
        frac=1, random_state=random_state
    ).reset_index(drop=True)

    return undersampled_data_df

In [3]:
# Load the tab-separated results file (its first line is skipped) and clean it
# in a single chain: keep the first row per 'Name', drop the listed columns,
# shorten the column names and upper-case the sequences.
df = (
    pd.read_csv('../../data/processed/results.txt', delimiter='\t', skiprows=1)
    .drop_duplicates(subset=['Name'], keep='first')
    .drop(columns=['#', 'Patient Code', 'PAT id(SSAM)', 'Organism', 'Name'])
    .rename(
        columns={
            'HXB2/MAC239 start': 'Start',
            'HXB2/MAC239 stop': 'Stop',
            'Sampling Year': 'Year',
            'Subtype': 'Lineage',
        }
    )
    .assign(Sequence=lambda frame: frame['Sequence'].str.upper())
)

# Show the cleaned DataFrame
display(df)

Unnamed: 0,Se ID,Accession,Lineage,Country,Year,Start,Stop,Sequence Length,Sequence
0,149746,A04321,B,FRANCE,1983.0,455,9636,9193,GGTCTCTCNNGTTAGACCAGATTTGAGCCTGGNAGCTCTCTGGCTA...
1,249722,A07867,B,FRANCE,1983.0,455,9636,9193,GGTCTCTCTGGTTAGACCAGATTTGAGCCTGGGAGCTCTCTGGCTA...
2,229443,AB023804,C,INDIA,1993.0,1,9702,9680,TGGAAGGGTTAATTTACTCCAAGAAAAGGCAAGAAATCCTTGATTT...
3,114323,AB032740,01_AE,THAILAND,1995.0,49,9473,9427,GGGTCTATAATACACAAGGCTTTTTCCCTGATTGGCAAAACTACAC...
4,114322,AB032741,01_AE,THAILAND,1995.0,49,9464,9430,GGGTCTATAATACACAAGGCTTCTTTCCTGATTGGCAAAACTACAC...
...,...,...,...,...,...,...,...,...,...
20434,233870,U88825,02G,NIGERIA,1992.0,630,9635,8966,TTGAAAGCGAAAGTTAACAGGGACTCGAAAGCGAAAGTTCCAGAGA...
20435,240467,U88826,G,NIGERIA,1992.0,630,9635,8987,ATGAAAGCGAAAGTTAATAGGGACTCGAAAACGAAAGTTCCAGAGA...
20436,2413,X01762,B,FRANCE,1983.0,1,9719,9748,TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCT...
20437,147318,X04415,A1DK,DEM REP OF CONGO,1985.0,455,9637,9229,GGTCTCTCTTGTTAGACCAGGTCGAGCCCGGGAGCTCTCTGGCTAG...


In [4]:
# Keep only the rows whose lineage appears at least 10 times in `df`.
data = df.loc[
    lambda frame: frame['Lineage'].map(frame['Lineage'].value_counts()).ge(10)
]

# Show the filtered DataFrame
display(data)

Unnamed: 0,Se ID,Accession,Lineage,Country,Year,Start,Stop,Sequence Length,Sequence
0,149746,A04321,B,FRANCE,1983.0,455,9636,9193,GGTCTCTCNNGTTAGACCAGATTTGAGCCTGGNAGCTCTCTGGCTA...
1,249722,A07867,B,FRANCE,1983.0,455,9636,9193,GGTCTCTCTGGTTAGACCAGATTTGAGCCTGGGAGCTCTCTGGCTA...
2,229443,AB023804,C,INDIA,1993.0,1,9702,9680,TGGAAGGGTTAATTTACTCCAAGAAAAGGCAAGAAATCCTTGATTT...
3,114323,AB032740,01_AE,THAILAND,1995.0,49,9473,9427,GGGTCTATAATACACAAGGCTTTTTCCCTGATTGGCAAAACTACAC...
4,114322,AB032741,01_AE,THAILAND,1995.0,49,9464,9430,GGGTCTATAATACACAAGGCTTCTTTCCTGATTGGCAAAACTACAC...
...,...,...,...,...,...,...,...,...,...
20433,233871,U88824,D,UGANDA,1994.0,654,9635,8952,CTGAAAGCGAAAGTAGAACCAGAGGAGATCTCTCGACGCAGGACTC...
20434,233870,U88825,02G,NIGERIA,1992.0,630,9635,8966,TTGAAAGCGAAAGTTAACAGGGACTCGAAAGCGAAAGTTCCAGAGA...
20435,240467,U88826,G,NIGERIA,1992.0,630,9635,8987,ATGAAAGCGAAAGTTAATAGGGACTCGAAAACGAAAGTTCCAGAGA...
20436,2413,X01762,B,FRANCE,1983.0,1,9719,9748,TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCT...


In [5]:
from sklearn.model_selection import train_test_split

# `data` is the filtered DataFrame; 'Lineage' is the class label to stratify on.
X = data.drop('Lineage', axis=1)  # everything except the label
y = data['Lineage']  # the label

# Stage 1: hold out 30% of the rows as the test partition.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Stage 2: split the remaining 70% into train and validation.
# 0.2857 of 70% is roughly 20% of all rows, leaving roughly 50% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2857, stratify=y_temp, random_state=42
)

# Re-attach the label and tag each partition in the 'Train' column:
# 0 = train, 1 = validation, 2 = test.
# Only the training partition is passed through random_undersample.
X_train = random_undersample(X_train.assign(Lineage=y_train)).assign(Train=0)
X_val = X_val.assign(Lineage=y_val, Train=1)
X_test = X_test.assign(Lineage=y_test, Train=2)

# Stack the three partitions back into one frame.
frames = [X_train, X_val, X_test]
data = pd.concat(frames)

In [6]:
# Write the combined train/validation/test frame to a Parquet file using the pyarrow engine.
data.to_parquet('../../data/processed/HIV.parquet', engine='pyarrow')