In [8]:
%load_ext autoreload
%autoreload 2

import logging

import numpy as np
import pandas as pd

logging.basicConfig(level=logging.INFO)

report_path = "../data/report.tsv"

# Fixed seed: re-running this cell (Restart & Run All) regenerates exactly the
# same random metadata instead of silently producing different files each time.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Read the report that is going to be masked.
rdf = pd.read_csv(report_path, sep="\t")

# Get rid of critical columns
rdf = rdf.drop(columns=["File.Name"])

# Original run identifiers, in order of first appearance; needed once to build the mapping.
unique_runs = rdf["Run"].unique()

# Replace the "Run" column with anonymous labels "sample_0", "sample_1", ...
# (numbered in order of first appearance of each run in the report).
replacement_samples = {original: f"sample_{i}" for i, original in enumerate(unique_runs)}
# Every value in "Run" is a key of the mapping, so a plain lookup via `map` is
# sufficient; it avoids the slow and potentially ambiguous dict-based `replace`.
rdf["Run"] = rdf["Run"].map(replacement_samples)

# Now the runs are masked
unique_runs = rdf["Run"].unique()

# Sample metadata: one row per masked run, annotated with randomly drawn labels.
sample_metadata = pd.DataFrame({"Run": unique_runs})
n_samples = len(sample_metadata)
sample_metadata["treatment"] = rng.choice(["treatment", "control"], size=n_samples)
sample_metadata["celltype"] = rng.choice(["Round", "Oblique", "Diffuse", "Elongated"], size=n_samples)
sample_metadata["batch"] = rng.choice(["A", "B"], size=n_samples)

# Feature metadata: one row per protein group, in order of first appearance.
feature_metadata = pd.DataFrame({"Protein.Group": rdf["Protein.Group"].unique()})
pg_genes = rdf[["Protein.Group", "Genes"]].drop_duplicates()
# If a protein group occurs with more than one "Genes" value, the pair that
# appears last in the report wins (later dict entries overwrite earlier ones).
pg2gene = dict(zip(pg_genes["Protein.Group"], pg_genes["Genes"], strict=False))
feature_metadata["Genes"] = feature_metadata["Protein.Group"].map(pg2gene)

# Save the masked report, feature metadata, and sample metadata next to the input.
# Only a trailing ".tsv" is stripped, and the prefix is always extended, so the
# output paths can never coincide with `report_path` itself.
output_prefix = report_path.removesuffix(".tsv") + "_random_scrambled"
rdf.to_csv(f"{output_prefix}_data.tsv", sep="\t", index=False)
sample_metadata.to_csv(f"{output_prefix}_sample_metadata.tsv", sep="\t", index=False)
feature_metadata.to_csv(f"{output_prefix}_feature_metadata.tsv", sep="\t", index=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
