In [10]:
import pandas as pd
import numpy as np

In [11]:
# Load the sequence table from a parquet file; the path is relative to the notebook's
# working directory.
# NOTE(review): the last cell of this notebook writes back to this same path, so a
# re-run reads the already-filtered file rather than the original input.
data = pd.read_parquet('../../../data/sequences/HIV_full.parquet', engine='pyarrow')  # You can use 'fastparquet' as the engine

In [12]:
# Drop every row in which any field, rendered as text, contains a literal "(".
# regex=False makes "(" a plain substring match. The previous pattern '\(' depended on
# an invalid string escape (DeprecationWarning/SyntaxWarning on current Python versions)
# surviving into the regex engine; the rows selected are the same.
data = data[
    ~data.apply(lambda col: col.astype(str).str.contains('(', regex=False)).any(axis=1)
]

# Remove entries with "Unassigned" in the Lineage field
data = data[data["Lineage"] != "Unassigned"]

data

Unnamed: 0,Se ID,Accession,Lineage,Country,Year,Start,Stop,Sequence Length,Sequence,Train
0,100908,AY371156,D,CAMEROON,2001.0,796,9190,8355,GCGAGAGCGTCAGTATTAAGCGGGGGAAAATTGGATCAATGGGAAA...,0
1,499283,JX447680,01_AE,THAILAND,2006.0,582,9605,9020,TGGTAACTAGAGATCCCTCAGACCACTCTAGACTGAGTAAAAATCT...,0
2,964060,MN791674,A1CD,KENYA,2011.0,790,9417,8669,ATGGGCGCGAGAGCGTCAGTATTAAGCGGCGGAAAATTAGATGCAT...,0
3,577949,KF835506,08_BC,CHINA,2007.0,635,9601,8959,GTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAGACCAGAGAAG...,0
4,549910,KC898984,BC,CHINA,2009.0,790,9595,8751,ATGGGTGCGAGAGCGTCCGTATTAAGCGGGGGAGAATTAGATAGGT...,0
...,...,...,...,...,...,...,...,...,...,...
19081,1202866,OQ513581,07BC,CHINA,2014.0,772,9601,8706,GGGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGAG...,2
10128,951685,MK867569,01_AE,THAILAND,,789,9496,8662,GATGGGTGCGAGAGCGTCAGTATTAAGTGGGGGAAAATTAGATGCA...,2
1888,283171,FJ388920,B,CYPRUS,2005.0,790,8930,8194,ATGGGTGCGAGAGCGTCAATATTAAGCGGCGGACAATTAGATCAAT...,2
8035,902688,MH843885,B,UNITED STATES,2013.0,684,9648,8956,TCTCGACGCAGGACTCGGCTTGCTGAAAGCGCGCACAGCAAGAGGC...,2


In [13]:
# IUPAC nucleotide ambiguity codes: any base call that is not a plain A/C/G/T.
chars_to_count = ['W', 'S', 'M', 'K', 'R', 'Y', 'B', 'D', 'H', 'V', 'N']


def calculate_percentage(sequence):
    """Return the percentage of `sequence` made up of characters in `chars_to_count`.

    The numerator is the number of occurrences of the listed (upper-case) characters.
    The denominator is the length of the sequence after every '.' has been removed.
    Returns 0 when nothing is left after removing '.' characters.
    """
    flagged = 0
    for symbol in chars_to_count:
        flagged += sequence.count(symbol)

    # '.' characters do not count towards the length used as the denominator.
    effective_length = len(sequence.replace('.', ''))
    if effective_length <= 0:
        return 0
    return (flagged / effective_length) * 100

# Sequences whose percentage of ambiguous characters reaches this value are discarded.
MAX_AMBIGUOUS_PERCENT = 5

# Calculate the percentage for each sequence and add it as a new field.
# assign() builds a new frame instead of writing a column into `data`, which at this
# point is the result of boolean-mask filtering; assigning a column onto such a frame
# is what triggers pandas' SettingWithCopyWarning.
data = data.assign(Percentage=data['Sequence'].apply(calculate_percentage))

# Keep only sequences below the threshold. The explicit copy makes the filtered result
# an independent frame, so the column assignment below is unambiguous.
data = data[data["Percentage"] < MAX_AMBIGUOUS_PERCENT].copy()

# Coverage is the complement of the ambiguous-character percentage.
data['Coverage'] = 100 - data['Percentage']

In [14]:
# Show the frame after the ambiguity filter; Percentage and Coverage are the new columns.
data

Unnamed: 0,Se ID,Accession,Lineage,Country,Year,Start,Stop,Sequence Length,Sequence,Train,Percentage,Coverage
0,100908,AY371156,D,CAMEROON,2001.0,796,9190,8355,GCGAGAGCGTCAGTATTAAGCGGGGGAAAATTGGATCAATGGGAAA...,0,0.000000,100.000000
1,499283,JX447680,01_AE,THAILAND,2006.0,582,9605,9020,TGGTAACTAGAGATCCCTCAGACCACTCTAGACTGAGTAAAAATCT...,0,0.033259,99.966741
2,964060,MN791674,A1CD,KENYA,2011.0,790,9417,8669,ATGGGCGCGAGAGCGTCAGTATTAAGCGGCGGAAAATTAGATGCAT...,0,0.011535,99.988465
3,577949,KF835506,08_BC,CHINA,2007.0,635,9601,8959,GTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAGACCAGAGAAG...,0,0.000000,100.000000
4,549910,KC898984,BC,CHINA,2009.0,790,9595,8751,ATGGGTGCGAGAGCGTCCGTATTAAGCGGGGGAGAATTAGATAGGT...,0,0.057136,99.942864
...,...,...,...,...,...,...,...,...,...,...,...,...
19081,1202866,OQ513581,07BC,CHINA,2014.0,772,9601,8706,GGGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGAG...,2,0.000000,100.000000
10128,951685,MK867569,01_AE,THAILAND,,789,9496,8662,GATGGGTGCGAGAGCGTCAGTATTAAGTGGGGGAAAATTAGATGCA...,2,0.000000,100.000000
1888,283171,FJ388920,B,CYPRUS,2005.0,790,8930,8194,ATGGGTGCGAGAGCGTCAATATTAAGCGGCGGACAATTAGATCAAT...,2,0.000000,100.000000
8035,902688,MH843885,B,UNITED STATES,2013.0,684,9648,8956,TCTCGACGCAGGACTCGGCTTGCTGAAAGCGCGCACAGCAAGAGGC...,2,0.000000,100.000000


In [15]:
# Persist the filtered frame, including the added Percentage and Coverage columns.
# NOTE(review): this is the same path the notebook loads from, so the input file is
# overwritten in place — confirm that is intended, or write to a separate file.
data.to_parquet('../../../data/sequences/HIV_full.parquet', engine='pyarrow')