In [7]:
import pandas as pd
import re
from scipy.stats import ttest_ind

# Location of the input CSV; edit this if the file is stored somewhere else.
file_path = 'gRNA_Ranks.csv'

# Load the CSV into a DataFrame. The code that follows reads its "strain"
# and "value" columns.
df = pd.read_csv(filepath_or_buffer=file_path)

# Compiled once so the pattern is not looked up again for every row.
_PP_TAG_PATTERN = re.compile(r"PP_\d{4}")

def extract_pp_tags(entry):
    """Return every ``PP_####`` tag found in *entry*.

    A tag is the literal prefix ``PP_`` followed by exactly four digits.

    Args:
        entry: The text to scan. Non-string values (for example the float
            NaN that pandas uses for an empty CSV cell, or ``None``) are
            treated as containing no tags instead of raising ``TypeError``.

    Returns:
        A list of the matched tags in order of appearance; duplicates are
        kept. The list is empty when nothing matches.
    """
    if not isinstance(entry, str):
        return []
    return _PP_TAG_PATTERN.findall(entry)

# Annotate every row with the list of PP_#### tags found in its strain name.
df["tags"] = df["strain"].apply(extract_pp_tags)

# Distinct tags across all rows.
unique_tags = set(tag for tags in df["tags"] for tag in tags)

# One summary record per tag.
summary_results = []

# Column order of the summary table. Passing it explicitly means the table
# still has its columns (and ranking below still works) when no tag is found.
summary_columns = [
    "target_tag",
    "avg_with_tag",
    "avg_without_tag",
    "t_stat",
    "p_value",
    "rank",
    "unique_with_tag",
]

# Iterate in sorted order: a set of strings has no stable iteration order
# between interpreter runs, which would make the output row order vary.
for tag in sorted(unique_tags):
    # Reuse the already-extracted tag lists instead of re-scanning the strain
    # strings with str.contains (which interprets the tag as a regex and
    # yields NaN, not False, for missing strain names).
    has_tag = pd.Series(
        [tag in tags for tags in df["tags"]], index=df.index, dtype=bool
    )
    with_tag = df[has_tag]
    without_tag = df[~has_tag]

    # Drop missing measurements once so the means and the t-test are computed
    # on the same samples (mean() skips NaN, ttest_ind would propagate it).
    values_with = with_tag["value"].dropna()
    values_without = without_tag["value"].dropna()

    avg_with_tag = values_with.mean()
    avg_without_tag = values_without.mean()

    # Welch's t-test (unequal variances); needs at least two observations in
    # each group.
    # NOTE(review): when a group's values are (nearly) identical SciPy may
    # warn about precision loss and report an extreme t/p pair -- treat such
    # p-values with caution.
    if len(values_with) > 1 and len(values_without) > 1:
        t_stat, p_value = ttest_ind(values_with, values_without, equal_var=False)
    else:
        # Not enough data for a t-test; NaN keeps the columns numeric.
        t_stat, p_value = float("nan"), float("nan")

    summary_results.append({
        "target_tag": tag,
        "avg_with_tag": avg_with_tag,
        "avg_without_tag": avg_without_tag,
        "t_stat": t_stat,
        "p_value": p_value,
        "rank": None,  # Filled in after all tags are summarized
        "unique_with_tag": with_tag["strain"].nunique(),  # Distinct strains with the tag
    })

# Convert results to DataFrame
summary_df = pd.DataFrame(summary_results, columns=summary_columns)

# Dense rank by the mean value of strains carrying the tag (1 = highest mean;
# equal means share a rank).
summary_df["rank"] = summary_df["avg_with_tag"].rank(method="dense", ascending=False)

# Sort by rank, breaking ties by tag name so the row order is deterministic.
summary_df = summary_df.sort_values(by=["rank", "target_tag"])

# Save the results to a CSV file
summary_df.to_csv('gRNA_ranked_isoprenol.csv', index=False)

# Display the results
print(summary_df)


    target_tag  avg_with_tag  avg_without_tag     t_stat       p_value   rank  \
54     PP_0528    394.401420       259.879227  12.927764  1.980227e-28    1.0   
12     PP_0812    392.545770       257.695592  12.597798  4.024003e-28    2.0   
32     PP_5419    391.454377       273.847493   9.296007  6.335998e-03    3.0   
107    PP_0815    377.914343       239.654523  13.969505  3.323359e-37    4.0   
89     PP_0813    373.979969       258.695150  10.614463  4.706535e-22    5.0   
..         ...           ...              ...        ...           ...    ...   
62     PP_4012     34.489480       274.559997 -40.110640  1.610807e-07  117.0   
50     PP_4119     18.390887       274.592130 -25.895967  2.731139e-04  118.0   
95     PP_5064     10.573200       274.607734 -23.630578  5.230762e-04  119.0   
86     PP_1777      0.010000       274.628819 -75.423661  0.000000e+00  120.0   
120    PP_3394      0.010000       274.628819 -75.423661  0.000000e+00  120.0   

     unique_with_tag  
54  

  res = hypotest_fun_out(*samples, **kwds)
