In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('filtered_metadata.csv')

# Display the first few rows to verify the data
print(df.head())

       Accession ID Collection date                          Location  \
0  EPI_ISL_14729181      2022-06-20  Europe / Spain / Castilla y Leon   
1  EPI_ISL_14729175      2022-06-20  Europe / Spain / Castilla y Leon   
2  EPI_ISL_14729171      2022-06-17  Europe / Spain / Castilla y Leon   
3  EPI_ISL_14729125      2022-06-19  Europe / Spain / Castilla y Leon   
4  EPI_ISL_14729180      2022-06-20  Europe / Spain / Castilla y Leon   

   Sequence length Pango lineage  
0            29782        BA.5.1  
1            29746        BA.5.1  
2            29746      BA.5.1.3  
3            29746        BA.5.1  
4            29784        BA.5.1  


In [2]:
df = df[~df.apply(lambda x: x.astype(str).str.contains('\(')).any(axis=1)]

# Remove entries with "Unassigned" in the Lineage field
df = df[df["Pango lineage"] != "Unassigned"]

df

Unnamed: 0,Accession ID,Collection date,Location,Sequence length,Pango lineage
0,EPI_ISL_14729181,2022-06-20,Europe / Spain / Castilla y Leon,29782,BA.5.1
1,EPI_ISL_14729175,2022-06-20,Europe / Spain / Castilla y Leon,29746,BA.5.1
2,EPI_ISL_14729171,2022-06-17,Europe / Spain / Castilla y Leon,29746,BA.5.1.3
3,EPI_ISL_14729125,2022-06-19,Europe / Spain / Castilla y Leon,29746,BA.5.1
4,EPI_ISL_14729180,2022-06-20,Europe / Spain / Castilla y Leon,29784,BA.5.1
...,...,...,...,...,...
15100917,EPI_ISL_14729131,2022-06-14,Europe / Spain / Castilla y Leon,29788,BA.5.3
15100918,EPI_ISL_14729213,2022-06-28,Europe / Spain / Castilla y Leon,29746,BA.5.1.30
15100919,EPI_ISL_14729158,2022-06-21,Europe / Spain / Castilla y Leon,29773,BA.5.1
15100920,EPI_ISL_14729153,2022-06-17,Europe / Spain / Castilla y Leon,29784,BA.5.1


In [4]:
def limit_min(df):
    lineage_counts = df['Pango lineage'].value_counts()

    # Find lineages with 10 or more occurrences
    valid_lineages = lineage_counts[lineage_counts >= 10].index

    # Filter the DataFrame to keep only rows with Pango Lineages having 10 or more observations
    filtered_df = df[df['Pango lineage'].isin(valid_lineages)]
    return filtered_df

def sample_or_keep(group):
    if len(group) > 100:
        return group.sample(n=100)  # Randomly sample 100 entries if more than 100
    return group  # Return the group unchanged if 100 or fewer

def limit_max(df):
    # Group by 'Pango lineage' and apply the sampling function
    sampled_df = df.groupby('Pango lineage', group_keys=False).apply(sample_or_keep).reset_index(drop=True)

    return sampled_df

df = limit_min(df)
df = limit_max(df)

  sampled_df = df.groupby('Pango lineage', group_keys=False).apply(sample_or_keep).reset_index(drop=True)


In [14]:
df

Unnamed: 0,Accession ID,Collection date,Location,Sequence length,Pango lineage
0,EPI_ISL_434103,2020-04-02,North America / USA / Washington / Yakima County,29869,A
1,EPI_ISL_479951,2020-03,Asia / Japan / Kansai,29827,A
2,EPI_ISL_698280,2020-05-18,Asia / United Arab Emirates / Abu Dhabi,29891,A
3,EPI_ISL_454387,2020-04-04,North America / USA / Pennsylvania / Allegheny...,29840,A
4,EPI_ISL_1713437,2020-01,Asia / Qatar / Doha,29782,A
...,...,...,...,...,...
296170,EPI_ISL_814903,2020-12-14,Europe / United Kingdom / Wales,29865,Z.1
296171,EPI_ISL_703445,2020-11-23,Europe / United Kingdom / England,29782,Z.1
296172,EPI_ISL_703858,2020-11-17,Europe / United Kingdom / Wales,29865,Z.1
296173,EPI_ISL_742513,2020-12-07,Europe / United Kingdom / Wales,29865,Z.1


In [5]:
df.to_csv('mock_meta.csv')