In [None]:
# Read each run's final_fragments_summary.csv into its own dataframe

import os

import pandas as pd

# All runs live under the same project folder and share the same file name;
# only the run sub-directory differs.
RAAV60_DIR = "/lunarc/nobackup/projects/lu2024-17-19/J_rAAven/BRAVE/raav-60"
SUMMARY_FILE = "final_fragments_summary.csv"


def _read_summary(run_dir):
    """Read the fragment summary CSV stored in the given run sub-directory."""
    return pd.read_csv(f"{RAAV60_DIR}/{run_dir}/{SUMMARY_FILE}")


# Runs whose directory name ends in "_no_starcode"
df_p005_no_starcode = _read_summary("p005_no_starcode")
df_p006_no_starcode = _read_summary("p006_no_starcode")
df_p007_no_starcode = _read_summary("p007_no_starcode")

# Runs stored directly under the library name
df_p005_starcode = _read_summary("p005")
df_p006_starcode = _read_summary("p006")
df_p007_starcode = _read_summary("p007")

# Runs whose directory name ends in "_starcode_no_chimeric"
df_p005_starcode_no_chimeric = _read_summary("p005_starcode_no_chimeric")
df_p006_starcode_no_chimeric = _read_summary("p006_starcode_no_chimeric")
df_p007_starcode_no_chimeric = _read_summary("p007_starcode_no_chimeric")

In [None]:
def plasmid_library(dataframe_dict):
    """Compare peptide-set sizes and overlaps across several dataframes.

    Parameters
    ----------
    dataframe_dict : dict
        Maps a label to a dataframe that has "Group" and "Peptide" columns.

    Returns
    -------
    pandas.DataFrame
        One row per metric and one column per label. Count rows hold the
        number of distinct peptides in a group; overlap rows hold a string
        of the form "<count> (<percent>%)", where the percentage is relative
        to the first-named group (0 when that group is empty).
    """
    def peptide_set(frame, group_name):
        # Distinct peptides of the rows belonging to one group
        return set(frame.loc[frame["Group"] == group_name, "Peptide"])

    def overlap_entry(subset, reference):
        # "<shared> (<share of subset>%)"; an empty subset yields 0 percent
        shared = len(subset & reference)
        share = (shared / len(subset) * 100) if subset else 0
        return f"{shared} ({share:.2f}%)"

    def compute_metrics(df):
        plasmid = peptide_set(df, "Plasmid_Library")
        dnase = peptide_set(df, "DNAse_resistant_AAVs")
        infective = peptide_set(df, "Infective_AAVs")
        # Insertion order of this dict defines the row order of the table
        return {
            "Plasmid_Library Count": len(plasmid),
            "DNAse_resistant_AAVs Count": len(dnase),
            "Infective_AAVs Count": len(infective),
            "DNAse_resistant_AAVs in Plasmid_Library": overlap_entry(dnase, plasmid),
            "Infective_AAVs in Plasmid_Library": overlap_entry(infective, plasmid),
            "Infective_AAVs in DNAse_resistant_AAVs": overlap_entry(infective, dnase),
        }

    # Metrics per label, in the order the labels were supplied
    per_label = {label: compute_metrics(frame) for label, frame in dataframe_dict.items()}

    # Every label produces the same metric names, so take them from the first one
    metric_names = list(next(iter(per_label.values()))) if per_label else []

    # Metric names become the rows, labels become the columns
    columns = {
        label: [values[metric] for metric in metric_names]
        for label, values in per_label.items()
    }
    return pd.DataFrame(columns, index=metric_names)


In [None]:
# Label every loaded dataframe; the labels become the column names of the
# comparison table, so they all follow the "<library>_<variant>" pattern.
dataframes = {
    "p005_no_starcode": df_p005_no_starcode,
    "p005_starcode": df_p005_starcode,
    "p005_starcode_no_chimeric": df_p005_starcode_no_chimeric,
    "p006_no_starcode": df_p006_no_starcode,
    "p006_starcode": df_p006_starcode,
    # Was "df_p006_starcode_no_chimeric": the stray "df_" prefix made this one
    # column label inconsistent with the other eight.
    "p006_starcode_no_chimeric": df_p006_starcode_no_chimeric,
    "p007_no_starcode": df_p007_no_starcode,
    "p007_starcode": df_p007_starcode,
    "p007_starcode_no_chimeric": df_p007_starcode_no_chimeric
}
# Build the metric-by-dataframe comparison table
comparison_table = plasmid_library(dataframes)
# Save the comparison table to a CSV file (file name left unchanged so that
# anything reading this path keeps working)
comparison_table.to_csv("Starcode_comparision_table.csv")
print(comparison_table)

In [None]:
def intersection_report(df1, df2, lib1_name, lib2_name):
    """Build a plain-text report comparing the peptide sets of two libraries.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Dataframes with "Group" and "Peptide" columns.
    lib1_name, lib2_name : str
        Labels used for df1 and df2 in the report text.

    Returns
    -------
    str
        First section: per group, the distinct-peptide count of each library
        and the size of their intersection. Second section: for the peptides
        present in both Plasmid_Library sets, how many (and what percentage)
        also occur in each library's DNAse_resistant_AAVs and Infective_AAVs
        sets, or a notice when the two Plasmid_Library sets share nothing.
    """
    groups = ("Plasmid_Library", "DNAse_resistant_AAVs", "Infective_AAVs")

    def peptides_by_group(frame):
        # One set of distinct peptides per group of interest
        return {grp: set(frame.loc[frame["Group"] == grp, "Peptide"]) for grp in groups}

    sets_lib1 = peptides_by_group(df1)
    sets_lib2 = peptides_by_group(df2)

    # Section 1: sizes and overlap for every group
    lines = ["=== Group Metrics Comparison ==="]
    for grp in groups:
        first, second = sets_lib1[grp], sets_lib2[grp]
        lines += [
            f"\nGroup: {grp}",
            f"  {lib1_name} Count: {len(first)}",
            f"  {lib2_name} Count: {len(second)}",
            f"  Intersection Count: {len(first & second)}",
        ]

    # Section 2: follow the shared Plasmid_Library peptides into the other groups
    shared_plasmid = sets_lib1["Plasmid_Library"] & sets_lib2["Plasmid_Library"]
    lines.append("\n=== Common Plasmid_Library Analysis ===")
    lines.append(f"Common Plasmid_Library sequences: {len(shared_plasmid)}")

    if not shared_plasmid:
        # Nothing shared, so there is no denominator for the percentages
        lines.append("No common Plasmid_Library sequences found between the two libraries.")
        return "\n".join(lines)

    n_shared = len(shared_plasmid)
    for lib_name, lib_sets in ((lib1_name, sets_lib1), (lib2_name, sets_lib2)):
        n_dnase = len(shared_plasmid & lib_sets["DNAse_resistant_AAVs"])
        n_infective = len(shared_plasmid & lib_sets["Infective_AAVs"])
        pct_dnase = (n_dnase / n_shared) * 100
        pct_infective = (n_infective / n_shared) * 100
        lines.append(f"\n{lib_name} (Common Plasmid_Library sequences):")
        lines.append(f"  Found in DNAse_resistant_AAVs: {n_dnase} ({pct_dnase:.2f}%)")
        lines.append(f"  Found in Infective_AAVs: {n_infective} ({pct_infective:.2f}%)")

    return "\n".join(lines)

In [None]:
# Pairwise library comparisons, one per processing variant.
# p005 vs p006
report_p005_p006_no_starcode = intersection_report(df_p005_no_starcode, df_p006_no_starcode, "p005_no_starcode", "p006_no_starcode")
report_p005_p006_starcode = intersection_report(df_p005_starcode, df_p006_starcode, "p005_starcode", "p006_starcode")
report_p005_p006_starcode_no_chimeric = intersection_report(df_p005_starcode_no_chimeric, df_p006_starcode_no_chimeric, "p005_starcode_no_chimeric", "p006_starcode_no_chimeric")
# p005 vs p007
report_p005_p007_no_starcode = intersection_report(df_p005_no_starcode, df_p007_no_starcode, "p005_no_starcode", "p007_no_starcode")
report_p005_p007_starcode = intersection_report(df_p005_starcode, df_p007_starcode, "p005_starcode", "p007_starcode")
report_p005_p007_starcode_no_chimeric = intersection_report(df_p005_starcode_no_chimeric, df_p007_starcode_no_chimeric, "p005_starcode_no_chimeric", "p007_starcode_no_chimeric")
# p006 vs p007
report_p006_p007_no_starcode = intersection_report(df_p006_no_starcode, df_p007_no_starcode, "p006_no_starcode", "p007_no_starcode")
report_p006_p007_starcode = intersection_report(df_p006_starcode, df_p007_starcode, "p006_starcode", "p007_starcode")
report_p006_p007_starcode_no_chimeric = intersection_report(df_p006_starcode_no_chimeric, df_p007_starcode_no_chimeric, "p006_starcode_no_chimeric", "p007_starcode_no_chimeric")

# Output path -> report text, so every file is written by the same code path.
reports_by_path = {
    "reports/p005_p006_no_starcode_report.txt": report_p005_p006_no_starcode,
    "reports/p005_p006_starcode_report.txt": report_p005_p006_starcode,
    "reports/p005_p006_starcode_no_chimeric_report.txt": report_p005_p006_starcode_no_chimeric,
    "reports/p005_p007_no_starcode_report.txt": report_p005_p007_no_starcode,
    "reports/p005_p007_starcode_report.txt": report_p005_p007_starcode,
    "reports/p005_p007_starcode_no_chimeric_report.txt": report_p005_p007_starcode_no_chimeric,
    "reports/p006_p007_no_starcode_report.txt": report_p006_p007_no_starcode,
    "reports/p006_p007_starcode_report.txt": report_p006_p007_starcode,
    "reports/p006_p007_starcode_no_chimeric_report.txt": report_p006_p007_starcode_no_chimeric,
}

# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists; on a fresh checkout the first write would otherwise
# fail with FileNotFoundError.
os.makedirs("reports", exist_ok=True)

# write the reports to text files
for report_path, report_text in reports_by_path.items():
    with open(report_path, "w") as f:
        f.write(report_text)

In [None]:
# Combine two libraries: per-group overlap plus the rows of each library whose
# peptide is absent from the other library's Plasmid_Library
def optimized_dataset(df1, df2):
    """Combine two library dataframes into a single dataframe.

    Both inputs need "Group" and "Peptide" columns. Only rows whose Group is
    Plasmid_Library, DNAse_resistant_AAVs or Infective_AAVs are considered.
    The result is the concatenation, in this order, of:

    1. For each group: the df1 rows whose peptide also occurs in the same
       group of df2.
    2. For Plasmid_Library, Infective_AAVs and DNAse_resistant_AAVs (in that
       order): the df2 rows whose peptide does not occur in df1's
       Plasmid_Library.
    3. The same three selections with the roles swapped: df1 rows whose
       peptide does not occur in df2's Plasmid_Library.

    NOTE(review): earlier inline comments on the Infective_AAVs and
    DNAse_resistant_AAVs steps said peptides were excluded when present in
    the other library's matching group, but the code has always compared
    against the other library's Plasmid_Library. The behaviour is kept
    exactly as coded; confirm which of the two was intended.

    NOTE(review): steps 1 and 3 are independent selections, so a df1 row
    whose peptide is in the same group of df2 but not in df2's
    Plasmid_Library is selected by both and appears twice in the result.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The two libraries to combine.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a fresh 0..n-1 index.
    """
    # Keep only the groups of interest
    groups_of_interest = ["Plasmid_Library", "DNAse_resistant_AAVs", "Infective_AAVs"]
    df1_groups = {grp: df1.loc[df1["Group"] == grp] for grp in groups_of_interest}
    df2_groups = {grp: df2.loc[df2["Group"] == grp] for grp in groups_of_interest}

    def select_rows(rows, reference_rows, keep_present):
        # Rows whose peptide is (keep_present=True) or is not (False) among
        # the peptides of reference_rows
        present = rows["Peptide"].isin(reference_rows["Peptide"])
        return rows.loc[present if keep_present else ~present]

    # Step 1: df1 rows shared with the same group of df2
    pieces = [
        select_rows(df1_groups[grp], df2_groups[grp], keep_present=True)
        for grp in groups_of_interest
    ]

    # Steps 2 and 3: rows of one library whose peptide is missing from the
    # other library's Plasmid_Library (df2 rows first, then df1 rows)
    for own_groups, other_groups in ((df2_groups, df1_groups), (df1_groups, df2_groups)):
        other_plasmid = other_groups["Plasmid_Library"]
        for grp in ("Plasmid_Library", "Infective_AAVs", "DNAse_resistant_AAVs"):
            pieces.append(select_rows(own_groups[grp], other_plasmid, keep_present=False))

    # Concatenate once instead of growing a frame piece by piece (every
    # pd.concat call copies everything accumulated so far), then renumber rows
    return pd.concat(pieces).reset_index(drop=True)

In [None]:
# Combine the p005 and p007 no-starcode runs
optimized_dataset_p005_p007_no_starcode = optimized_dataset(df_p005_no_starcode, df_p007_no_starcode)

# Print how many rows each group contributes
print("=== Optimized Dataset p005 and p007 no starcode ===")
for group_label in ("Plasmid_Library", "DNAse_resistant_AAVs", "Infective_AAVs"):
    group_rows = optimized_dataset_p005_p007_no_starcode.loc[
        optimized_dataset_p005_p007_no_starcode["Group"] == group_label
    ]
    print(f"{group_label}: ", len(group_rows))

# Save the optimized dataset to a csv file (without the row index)
optimized_dataset_p005_p007_no_starcode.to_csv("optimized_dataset_p005_p007_no_starcode.csv", index=False)