In [1]:
import pandas as pd
import os
import sys
import glob

In [2]:
# Root directory that is used as the first component of every cluster CSV path
dbs_path = "/home/ubuntu/poppunk/poppunk_db"
# Name of the existing database; its GPSC file supplies the previous merge history
prev_db = "GPS_v9"
# Name of the updated database whose GPSC assignment is generated by this notebook
new_db = "GPS_v9_1"

In [3]:
def get_cluster_path(dbs_path, db_name, in_db, external):
    """Build the path of a cluster CSV file belonging to a database.

    Args:
        dbs_path: Directory used as the first path component.
        db_name: Database name; used as the file name prefix and, when
            ``in_db`` is truthy, also as a sub-directory of ``dbs_path``.
        in_db: Truthy to place the file inside the ``db_name`` sub-directory,
            falsy to place it directly under ``dbs_path``.
        external: Truthy for ``<db_name>_external_clusters.csv``, falsy for
            ``<db_name>_clusters.csv``.

    Returns:
        str: The joined path.
    """
    suffix = "external_clusters" if external else "clusters"
    file_name = f"{db_name}_{suffix}.csv"

    parent_parts = (dbs_path, db_name) if in_db else (dbs_path,)

    return os.path.join(*parent_parts, file_name)


def get_df(csv_path):
    """Load a CSV file with every column kept as text.

    Args:
        csv_path: Path (or any object accepted by ``pandas.read_csv``) of the CSV.

    Returns:
        pandas.DataFrame: The parsed table.
    """
    read_options = {
        # Read every column as string so identifiers are never parsed as numbers
        "dtype": str,
        # Keep literal markers such as "NA" as text instead of converting them to NaN
        "keep_default_na": False,
    }
    return pd.read_csv(csv_path, **read_options)


def split_merged_clusters(clusters_series, external):
    """Return every individual cluster number that appears in a column.

    Merged clusters are stored as several numbers joined by a delimiter:
    ``;`` when ``external`` is truthy and ``_`` otherwise.

    Args:
        clusters_series: pandas Series of cluster name strings.
        external: Selects the delimiter used to split merged names.

    Returns:
        set[int]: All distinct cluster numbers found after splitting.
    """
    delimiter = ";" if external else "_"

    cluster_numbers = set()
    for cluster_name in clusters_series.unique():
        cluster_numbers.update(int(part) for part in cluster_name.split(delimiter))

    return cluster_numbers


def get_merged_gpscs(df):
    """Collect every group of GPSCs that have been merged together.

    Each row contributes the union of its ``GPSC`` and ``merge_history``
    values (both ``;``-delimited). Groups that share at least one GPSC are
    then joined, so the returned groups are pairwise disjoint.

    Rows whose ``GPSC`` is exactly ``"235;9"`` are skipped, as that value is
    treated as an exceptional case.

    Args:
        df: DataFrame with string columns ``GPSC`` and ``merge_history``; the
            ``sample`` column is only read to build the error message.

    Returns:
        list[set[str]]: One set per merged group containing more than one
        GPSC. The order of the list is unspecified.

    Raises:
        SystemExit: If the numerically smallest GPSC of a row is not part of
            that row's merge history.
        ValueError: If a GPSC value of a processed row is not an integer string.
    """
    row_groups = set()

    # Iterate the two columns directly; iterrows() would build a Series per row
    for position, (gpsc, merge_history) in enumerate(zip(df["GPSC"], df["merge_history"])):
        # Skip 235;9 in merge cluster joint as it is an exceptional case
        if gpsc == "235;9":
            continue

        gpsc_splitted = set(gpsc.split(";"))
        merge_history_splitted = set(merge_history.split(";"))

        base_gpsc = min(gpsc_splitted, key=int)
        if base_gpsc not in merge_history_splitted:
            # Resolve the sample name first: nesting the same quote type inside an
            # f-string is a syntax error before Python 3.12
            sample = df["sample"].iloc[position]
            sys.exit(f"Error:{base_gpsc} not found in merge_history {merge_history_splitted} of {sample}")

        clustered_gpscs = gpsc_splitted | merge_history_splitted

        if len(clustered_gpscs) > 1:
            # frozenset de-duplicates identical groups coming from different samples
            row_groups.add(frozenset(clustered_gpscs))

    # Join overlapping groups. Invariant: the sets in merged_gpscs are pairwise
    # disjoint, so folding every set that overlaps the incoming group into it
    # keeps the invariant and needs only one pass over the groups.
    merged_gpscs = []
    for row_group in row_groups:
        joined_group = set(row_group)
        untouched_groups = []
        for existing_group in merged_gpscs:
            if existing_group & joined_group:
                joined_group |= existing_group
            else:
                untouched_groups.append(existing_group)
        untouched_groups.append(joined_group)
        merged_gpscs = untouched_groups

    return merged_gpscs


def get_merged_gpscs_info(merged_gpscs):
    """Describe the merged group that each GPSC belongs to.

    Args:
        merged_gpscs: Iterable of collections of GPSC strings, one collection
            per merged group.

    Returns:
        dict: Maps every GPSC to a dict with two keys: ``merge_history`` (all
        members of its group joined by ``;`` in ascending numeric order) and
        ``base_gpsc`` (the numerically smallest member of the group).
    """
    info_by_gpsc = {}

    for group in merged_gpscs:
        joined_history = ";".join(sorted(group, key=int))
        smallest_member = min(group, key=int)
        for member in group:
            # A separate dict per member so entries never alias each other
            info_by_gpsc[member] = {"merge_history": joined_history, "base_gpsc": smallest_member}

    return info_by_gpsc


def base_gpsc_merge_history_correction(df, merged_gpscs_info_dict):
    """Return a copy of ``df`` with normalised ``GPSC`` and ``merge_history``.

    Every row first gets the numerically smallest value of its ``;``-delimited
    ``GPSC`` in both columns. Rows whose smallest GPSC is a key of
    ``merged_gpscs_info_dict`` then take the ``base_gpsc`` and
    ``merge_history`` stored for it. Rows whose original ``GPSC`` was
    ``"235;9"`` end up with ``"235;9"`` in both columns.

    Args:
        df: DataFrame with a string ``GPSC`` column. It is not modified.
        merged_gpscs_info_dict: Mapping of GPSC to a dict holding
            ``base_gpsc`` and ``merge_history``.

    Returns:
        pandas.DataFrame: The corrected copy.
    """
    corrected = df.copy()

    # Remember the exceptional rows before the GPSC column is overwritten
    is_exception = corrected["GPSC"] == "235;9"

    lowest_gpsc = corrected["GPSC"].str.split(";").apply(lambda parts: min(parts, key=int))

    corrected["GPSC"] = lowest_gpsc
    corrected["merge_history"] = lowest_gpsc

    in_merged_group = lowest_gpsc.isin(list(merged_gpscs_info_dict))

    def lookup(field):
        # Fetch one field of the merged-group info for the affected rows only
        return lowest_gpsc[in_merged_group].map(lambda gpsc: merged_gpscs_info_dict[gpsc][field])

    corrected.loc[in_merged_group, "GPSC"] = lookup("base_gpsc")
    corrected.loc[in_merged_group, "merge_history"] = lookup("merge_history")

    # Restore the exceptional value in both columns
    corrected.loc[is_exception, "GPSC"] = "235;9"
    corrected.loc[is_exception, "merge_history"] = "235;9"

    return corrected

In [4]:
# Get cluster file paths and load them into dataframes

# Previous database: PopPUNK clusters live inside the database directory,
# the GPSC file sits next to the database directories
prev_db_poppunk_cluster_path = get_cluster_path(dbs_path, prev_db, in_db=True, external=False)
prev_db_gpsc_path = get_cluster_path(dbs_path, prev_db, in_db=False, external=True)

# New database: both input files live inside the database directory
new_db_poppunk_cluster_path = get_cluster_path(dbs_path, new_db, in_db=True, external=False)
new_db_gpsc_path = get_cluster_path(dbs_path, new_db, in_db=True, external=True)

# The updated GPSC file is written next to the database directories
new_db_gpsc_output_path = get_cluster_path(dbs_path, new_db, in_db=False, external=True)

df_prev_db_poppunk_cluster = get_df(prev_db_poppunk_cluster_path)
df_prev_db_gpsc = get_df(prev_db_gpsc_path)
df_new_db_poppunk_cluster = get_df(new_db_poppunk_cluster_path)
df_new_db_gpsc = get_df(new_db_gpsc_path)

In [5]:
# Split all merged clusters in PopPUNK clusters and GPSC, assign GPSCs to new PopPUNK clusters, print information

prev_gpscs_history_splitted = split_merged_clusters(df_prev_db_gpsc["merge_history"], external=True)
prev_poppunk_clusters_splitted = split_merged_clusters(df_prev_db_poppunk_cluster["Cluster"], external=False)
new_poppunk_clusters_splitted = split_merged_clusters(df_new_db_poppunk_cluster["Cluster"], external=False)

prev_poppunk_clusters_min, prev_poppunk_clusters_max = min(prev_poppunk_clusters_splitted), max(prev_poppunk_clusters_splitted)
prev_gpsc_history_min, prev_gpsc_history_max = min(prev_gpscs_history_splitted), max(prev_gpscs_history_splitted)

# Offset between the PopPUNK Cluster numbering and the GPSC numbering, taken at
# the largest value of each in the previous database
diff_poppunk_clusters_and_gpsc_max = prev_poppunk_clusters_max - prev_gpsc_history_max

# PopPUNK clusters that exist in the new database only
new_poppunk_new_clusters = new_poppunk_clusters_splitted - prev_poppunk_clusters_splitted

# min()/max() raise ValueError on an empty set, so only compute the ranges when
# the new database actually introduced clusters
if new_poppunk_new_clusters:
    new_poppunk_new_clusters_min, new_poppunk_new_clusters_max = min(new_poppunk_new_clusters), max(new_poppunk_new_clusters)
    new_gpsc_min = new_poppunk_new_clusters_min - diff_poppunk_clusters_and_gpsc_max
    new_gpsc_max = new_poppunk_new_clusters_max - diff_poppunk_clusters_and_gpsc_max
else:
    new_poppunk_new_clusters_min = new_poppunk_new_clusters_max = None
    new_gpsc_min = new_gpsc_max = None

# Attach the PopPUNK Cluster of every sample. A "Cluster" column left over from
# an earlier run of this cell is dropped first so the cell can be re-run.
df_new_db_gpsc = df_new_db_gpsc.drop(columns="Cluster", errors="ignore").merge(
    df_new_db_poppunk_cluster.rename(columns={"Taxon": "sample"}), on="sample", how="left"
)

# Samples without a GPSC get one derived from their PopPUNK Cluster number.
# The result is cast back to str so the GPSC column stays uniformly text
# instead of mixing int and str values.
gpsc_unassigned_mask = df_new_db_gpsc["GPSC"] == "NA"
df_new_db_gpsc.loc[gpsc_unassigned_mask, "GPSC"] = (
    df_new_db_gpsc.loc[gpsc_unassigned_mask, "Cluster"].astype(int) - diff_poppunk_clusters_and_gpsc_max
).astype(str)

print(f"Previous database {prev_db}:")
print(f"  PopPUNK Cluster range used : {prev_poppunk_clusters_min} - {prev_poppunk_clusters_max}")
print(f"  GPSC range used: {prev_gpsc_history_min} - {prev_gpsc_history_max}")
print(f"  The value difference between largest PopPUNK Cluster and GPSC: {diff_poppunk_clusters_and_gpsc_max}")
print()
print(f"New database {new_db}:")
print(f"  Number of samples with unassigned GPSC: {gpsc_unassigned_mask.sum()}")
if new_poppunk_new_clusters:
    print(f"  New PopPUNK Cluster range used: {new_poppunk_new_clusters_min} - {new_poppunk_new_clusters_max} (Length: {new_poppunk_new_clusters_max - new_poppunk_new_clusters_min + 1})")
    print(f"  New GPSC range assigned: {new_gpsc_min} - {new_gpsc_max} (Length: {new_gpsc_max - new_gpsc_min + 1})")
else:
    print("  No new PopPUNK Cluster found; no new GPSC range assigned")

Previous database GPS_v9:
  PopPUNK Cluster range used : 1 - 1231
  GPSC range used: 1 - 1122
  The value difference between largest PopPUNK Cluster and GPSC: 109

New database GPS_v9_1:
  Number of samples with unassigned GPSC: 4
  New PopPUNK Cluster range used: 1232 - 1234 (Length: 3)
  New GPSC range assigned: 1123 - 1125 (Length: 3)


In [6]:
# Carry over the merge history recorded in the previous database; samples that
# have no previous history use their current GPSC as their history
df_new_db_gpsc = (
    df_new_db_gpsc[["sample", "GPSC"]]
    .merge(df_prev_db_gpsc[["sample", "merge_history"]], on="sample", how="left")
    .assign(merge_history=lambda frame: frame["merge_history"].fillna(frame["GPSC"]))
    .astype(str)
)

# Collect all merged GPSCs, with every overlapping group fully joined
merged_gpscs = get_merged_gpscs(df_new_db_gpsc)

# Look-up of base GPSC and complete merge history for every merged GPSC
merged_gpscs_info_dict = get_merged_gpscs_info(merged_gpscs)

# Apply the base GPSC and the fully joined merge history (235;9 is an exception)
df_new_db_gpsc = base_gpsc_merge_history_correction(df_new_db_gpsc, merged_gpscs_info_dict)

In [7]:
# Write the updated GPSC assignment
df_new_db_gpsc.to_csv(new_db_gpsc_output_path, index=False)

# Delete the outdated GPSC designation file and the reference-only database
# files (<new_db>.refs*) inside the new database directory
refs_pattern = os.path.join(dbs_path, new_db, f"{new_db}.refs*")
for outdated_path in [new_db_gpsc_path, *glob.glob(refs_pattern)]:
    os.remove(outdated_path)