# Import Required Libraries
Import pandas and other necessary libraries for data analysis.

In [None]:
# Import Required Libraries
import pandas as pd

# Load the TSV File
Use pandas to load the tab-separated (TSV) file into a DataFrame.

In [None]:
# Load the TSV File
# 'data_2.tsv' is a relative path; change it here to load a different TSV file.
# sep='\t' makes read_csv split fields on tab characters.
df = pd.read_csv('data_2.tsv', sep='\t')

# Preview the Data
Display the first 5 rows of the DataFrame using the head() method.

In [None]:
# Preview the Data
# A notebook cell only renders the value of its final expression, so the
# head() preview is printed explicitly; as a bare statement in the middle of
# the cell its result would be discarded.
print(df.head())
df.columns

# Understand the Data Structure
Use methods like info(), describe(), and columns to get an overview of the data types, column names, and summary statistics.

In [None]:
# Understand the Data Structure
# info() prints its own report. describe() only returns a DataFrame, so it is
# printed explicitly: a notebook cell renders just its final expression.
df.info()
print(df.describe())
df.columns

In [None]:
# Summary statistics for every column, not only the numeric ones.
df.describe(include='all')

In [None]:
# Show the first 50 column labels.
df.columns[:50]

# Explore Dataset Size and Structure
Let's analyze the dataset to answer:
- How many unique drugs are present?
- How many unique peptides are there?
- How many different concentrations were tested?
- Additional summary statistics relevant to the experiment design.

In [None]:
# Drop every column whose name contains '_unmod' and report what changed.
# NOTE(review): the printed messages call these "Unmodified sequence" columns,
# while the filter matches the '_unmod' substring -- confirm the two agree.
cols_before = df.shape[1]
unmod_cols = [col for col in df.columns if '_unmod' in col]
num_unmod = len(unmod_cols)

# Positions (in the original column order) of the first and last matching
# column; both stay None when nothing matched.
if unmod_cols:
    start_idx = df.columns.get_loc(unmod_cols[0])
    end_idx = df.columns.get_loc(unmod_cols[-1])
else:
    start_idx = end_idx = None

df = df.drop(columns=unmod_cols)
cols_after = df.shape[1]

# Number of column names listed from each end. The labels below are built from
# this value so they always match the slice size (they used to say 30 while
# the slices took 50).
n_preview = 50

print(f'1) Number of columns before: {cols_before}')
if start_idx is not None and end_idx is not None:
    print(f'2) Columns with \"Unmodified sequence\" start at index {start_idx} and end at index {end_idx}')
else:
    print('2) No columns with \"Unmodified sequence\" found.')
print(f'3) Number of columns with \"Unmodified sequence\": {num_unmod}')
print(f'4) Number of columns after removing them: {cols_after}')
print(f'5) First {n_preview} columns after removal: {list(df.columns[:n_preview])}')
print(f'   Last {n_preview} columns after removal: {list(df.columns[-n_preview:])}')

In [None]:
# List the columns whose name contains 'CAKI'.
[col for col in df.columns if 'CAKI' in col]

In [None]:
# Column names that are filtered out whenever the experiment columns
# (exp_cols) are collected from df.columns.
columns_to_not_consider = ['rowid', 'ccms_row_id', 'Variant', 'Variant ID', 'Unmod variant', 'Total', 'Proteins', 'Mass', 'Charge', 'Num Mods', 'All Mods', 'Is Decoy', 'Lorikeet input', 'Orig cluster FDR', 'Pep Prefix', 'Variant FDR', 'Peptidoform', 'Canonical proteins', 'Top protein', 'Top canonical protein', 'Top protein FDR', 'Top canonical protein FDR', 'Rep cluster task', 'Rep cluster user', 'Rep cluster index', 'Num tasks', 'Outlier groups', 'Outlier group ratio', 'Outlier groups- unmod', 'Outlier group ratio- unmod', 'Unmod_Variant',]

In [None]:
# Helper functions to parse column names for drugs, concentrations, and cell lines
import re

def extract_drug_conc_cell(col):
    """Parse an experiment column name into ``(drug, concentration, cell_line)``.

    Only names starting with ``_dyn_`` are parsed; any other name yields
    ``(None, None, None)``. The ``_dyn_#`` prefix and everything from
    ``.Tech`` onwards are removed before parsing.

    The cell line is recognised in these forms:

    * ``-inCELL`` / ``-withCELL`` / ``_inCELL`` / ``_withCELL`` (separator
      style). The underscore form occurs in this data set, e.g.
      ``_dyn_#AEE-788_inBT474 PDPD.Tech replicate 1 of 1``; accepting only a
      dash left ``_inBT474`` glued to the drug name with no cell line.
    * ``withCELL`` with no separator, used as a fallback.

    Whatever remains is split at the last space (or, if it contains no space,
    at the last dash) into drug and concentration.

    Args:
        col: Column name.

    Returns:
        Tuple ``(drug, concentration, cell_line)``. ``concentration`` is
        ``None`` when the remainder cannot be split or its last part is empty;
        ``cell_line`` is ``None`` when no cell-line marker is found.
    """
    if not col.startswith('_dyn_'):
        return None, None, None

    # Remove prefix and suffix
    base = col.replace('_dyn_#', '').split('.Tech')[0].strip()

    cell_line = None

    # Separator style: '-' or '_' followed by in/with and the cell-line name.
    m = re.search(r'[-_](?:in|with)([A-Za-z0-9]+)', base)
    if m:
        cell_line = m.group(1)
        base = re.sub(r'[-_](?:in|with)[A-Za-z0-9]+', '', base)
    else:
        # Fallback: withCELL directly attached (no separator)
        m2 = re.search(r'with([A-Za-z0-9]+)', base)
        if m2:
            cell_line = m2.group(1)
            base = re.sub(r'with[A-Za-z0-9]+', '', base)

    # Split by last space or dash to separate drug and concentration
    if ' ' in base:
        parts = base.rsplit(' ', 1)
    else:
        parts = base.rsplit('-', 1)

    if len(parts) == 2:
        drug, conc = parts
    else:
        drug, conc = parts[0], None

    return drug.strip(), conc.strip() if conc else None, cell_line

# Get all experiment columns (exclude metadata and summary columns)
exp_cols = [col for col in df.columns if col not in columns_to_not_consider]

print(f'Number of experiment columns: {len(exp_cols)}')

# Parse every column name once and reuse the result for all statistics below.
parsed_cols = [(col, extract_drug_conc_cell(col)) for col in exp_cols]

# Columns for which drug, concentration and cell line were all recognised,
# mapped to their (drug, conc, cell_line) tuple.
drugs_concs_with_cell_lines = {
    col: parsed for col, parsed in parsed_cols if all(parsed)
}

# Count distinct tuples, not columns: if several columns map to the same
# combination, it must be counted only once to be "unique".
unique_combinations = set(drugs_concs_with_cell_lines.values())
print(f'Number of unique drug-concentration-cell line combinations: {len(unique_combinations)}')

caki_col = [col for col in exp_cols if 'CAKI' in col]
print(f'Number of Caki columns: {len(caki_col)}')

bt474_col = [col for col in exp_cols if 'BT474' in col]
print(f'Number of BT474 columns: {len(bt474_col)}')

# Extract unique drugs, concentrations, and cell lines
# NOTE(review): `concs` collects every parsed concentration label, so control
# labels such as 'DMSO' / 'PDPD' are counted too if columns carry them. The
# notes further down say they are excluded -- confirm which is intended.
drugs = set()
concs = set()
cell_lines = set()

drugs_concs = set()

for col, (drug, conc, cell_line) in parsed_cols:
    if drug:
        drugs.add(drug)
    if conc:
        concs.add(conc)
    if cell_line:
        cell_lines.add(cell_line)

    if drug and conc:
        # Report every (drug, concentration) pair that was already seen on an
        # earlier column.
        if (drug, conc) in drugs_concs:
            print(f'Observed drug-concentration pair again: {drug}, {conc}')
            if cell_line:
                print(f'It is observed with Cell line: {cell_line}')
            else:
                print(f'It is observed without Cell line: {cell_line}')
        drugs_concs.add((drug, conc))


print(f'Number of unique drugs: {len(drugs)}')
print(f'Number of unique concentrations: {len(concs)}')
print(f'Number of unique cell lines: {len(cell_lines)}')
# Sorted so the examples are the same on every run (set order is arbitrary).
print(f'Example drugs: {sorted(drugs)[:5]}')
print(f'Example concentrations: {sorted(concs)}')
print(f'Example cell lines: {sorted(cell_lines)}')

In [None]:
# Show the 'Variants- Unmodified sequence' column.
df["Variants- Unmodified sequence"]

In [None]:
# Show the experiment columns (df columns not listed in columns_to_not_consider).
exp_cols

In [None]:
# Number of unique peptides (using the Unmod variant column)
# nunique() counts distinct values and leaves out missing ones by default.
num_unique_peptides = df['Unmod variant'].nunique()
print(f'Number of unique peptides: {num_unique_peptides}')

In [None]:
# Additional: Number of variants, and quick NA analysis
num_variants = df['Variant'].nunique()
# Missing-value count per column, largest first, limited to the top 10 columns.
na_counts = df.isna().sum().sort_values(ascending=False).head(10)
print(f'Number of unique variants: {num_variants}')
print('Columns with most NAs:')
print(na_counts)

## Notes on Experimental Design
- DMSO columns are controls and are excluded from drug/concentration counts.
- PDPD columns are unimportant and also excluded.
- Each drug is tested at multiple concentrations (typically 8 + 1 control).
- Peptide fragmentation may vary, so some columns may have N/A values.

In [None]:
# Collect the columns whose name contains 'AMG-208', print how many there are,
# then display the list itself.
x = [col for col in df.columns if 'AMG-208' in col]
print(len(x))

x

# Plot Concentration Response for a Random Peptide Variant of a Selected Drug (Excluding CAKI/BT474)
This section selects a drug that does not have CAKI or BT474 cell lines, picks a random peptide variant, and plots the response across increasing concentrations using Plotly.

In [None]:
# Columns whose name contains 'AEE-788' (not used by the expression below).
columns_to_check = [col for col in df.columns if 'AEE-788' in col]

# Mean of one named column over the rows whose 'Variant' is '.ESESTAGSFSLSVR.'.
df[(df['Variant'] == '.ESESTAGSFSLSVR.')]["_dyn_#AEE-788_inBT474 PDPD.Tech replicate 1 of 1"].mean()

In [None]:
import random
import plotly.graph_objs as go

# 1. Select drugs that do NOT have CAKI or BT474 cell lines
def extract_drug_conc_cell(col):
    """Parse an experiment column name into ``(drug, concentration, cell_line)``.

    Only names starting with ``_dyn_`` are parsed; any other name yields
    ``(None, None, None)``. The ``_dyn_#`` prefix and everything from
    ``.Tech`` onwards are removed before parsing.

    The cell line is recognised in these forms:

    * ``-inCELL`` / ``-withCELL`` / ``_inCELL`` / ``_withCELL`` (separator
      style). The underscore form occurs in this data set, e.g.
      ``_dyn_#AEE-788_inBT474 PDPD.Tech replicate 1 of 1``; accepting only a
      dash left ``_inBT474`` glued to the drug name with no cell line, so
      such columns were never recognised as CAKI/BT474 columns.
    * ``withCELL`` with no separator, used as a fallback.

    Whatever remains is split at the last space (or, if it contains no space,
    at the last dash) into drug and concentration.

    Args:
        col: Column name.

    Returns:
        Tuple ``(drug, concentration, cell_line)``. ``concentration`` is
        ``None`` when the remainder cannot be split or its last part is empty;
        ``cell_line`` is ``None`` when no cell-line marker is found.
    """
    import re
    if not col.startswith('_dyn_'):
        return None, None, None

    # Remove prefix and suffix
    base = col.replace('_dyn_#', '').split('.Tech')[0].strip()

    cell_line = None

    # Separator style: '-' or '_' followed by in/with and the cell-line name.
    m = re.search(r'[-_](?:in|with)([A-Za-z0-9]+)', base)
    if m:
        cell_line = m.group(1)
        base = re.sub(r'[-_](?:in|with)[A-Za-z0-9]+', '', base)
    else:
        # Fallback: withCELL directly attached (no separator)
        m2 = re.search(r'with([A-Za-z0-9]+)', base)
        if m2:
            cell_line = m2.group(1)
            base = re.sub(r'with[A-Za-z0-9]+', '', base)

    # Split by last space or dash to separate drug and concentration
    if ' ' in base:
        parts = base.rsplit(' ', 1)
    else:
        parts = base.rsplit('-', 1)

    if len(parts) == 2:
        drug, conc = parts
    else:
        drug, conc = parts[0], None

    return drug.strip(), conc.strip() if conc else None, cell_line

# Experiment columns: every df column that is not in the exclusion list.
exp_cols = [name for name in df.columns if name not in columns_to_not_consider]

# Group (column, concentration) pairs by drug. Columns whose parsed cell line
# is CAKI or BT474 are left out, as are columns without both a drug and a
# concentration.
drug_to_cols = {}
for col in exp_cols:
    drug, conc, cell_line = extract_drug_conc_cell(col)
    if cell_line in ('CAKI', 'BT474') or not (drug and conc):
        continue
    if drug not in drug_to_cols:
        drug_to_cols[drug] = []
    drug_to_cols[drug].append((col, conc))

# A drug needs at least two collected columns to be worth plotting.
filtered_drugs = [name for name, entries in drug_to_cols.items() if len(entries) >= 2]
if not filtered_drugs:
    raise ValueError('No drugs found without CAKI/BT474 cell lines and with >1 concentration.')

In [None]:
# Show the drugs that passed the filter.
filtered_drugs

In [None]:
# Select a random drug

selected_drug = random.choice(filtered_drugs)
# Uncomment to pin a specific drug instead of the random pick:
# selected_drug = 'AZD-5363'
# (column name, concentration label) pairs collected for the selected drug.
drug_cols = drug_to_cols[selected_drug]

# Sort columns by concentration numerically (include DMSO, exclude PDPD)
def conc_sort_key(x):
    """Return the sort key for an entry whose second item is a concentration label.

    'DMSO' sorts first (key -2). A label made only of digits, optionally
    followed by 'nM', sorts by its numeric value. Every other label,
    including 'PDPD', sorts last (key +inf).
    """
    label = x[1]
    if label == 'DMSO':
        return -2
    # Strip a trailing 'nM' unit, if present; the remainder must be all digits.
    digits = label[:-2] if label.endswith('nM') else label
    return float(digits) if digits.isdigit() else float('inf')

# Include DMSO, exclude PDPD
drug_cols = [x for x in drug_cols if x[1] != 'PDPD']
drug_cols_sorted = sorted(drug_cols, key=conc_sort_key)
# Parallel lists in sorted order: column names and their concentration labels.
col_names_sorted = [x[0] for x in drug_cols_sorted]
concs_sorted = [x[1] for x in drug_cols_sorted]

# 2. Select a random peptide variant

# Candidates are the row labels of df (its index), not 'Variant' values.
peptide_variants = df.index

print(peptide_variants)
print(f'Number of peptide variants: {len(peptide_variants)}')

import numpy as np
def safe_float(x):
    """Convert *x* to ``float``, returning NaN when it cannot be converted.

    Commas are removed from string inputs before conversion (so '1,234.5'
    becomes 1234.5). Only conversion failures are turned into NaN; a bare
    ``except`` would also swallow KeyboardInterrupt and SystemExit.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Positions of the real concentrations (everything except the DMSO control).
# They do not depend on the peptide, so they are computed once, before the loop.
non_dmso_indices = [i for i, c in enumerate(concs_sorted) if c != 'DMSO']

# Try up to 20 random peptides to find one with at least one non-NaN value
# among the non-DMSO concentrations.
for _ in range(20):
    selected_peptide = random.choice(peptide_variants)
    row = df.loc[[selected_peptide]]
    yvals_raw = row[col_names_sorted].values.flatten()
    yvals = np.array([safe_float(x) for x in yvals_raw])
    # With no non-DMSO column at all, fall back to checking every value.
    yvals_non_dmso = yvals[non_dmso_indices] if non_dmso_indices else yvals
    if not np.all(np.isnan(yvals_non_dmso)):
        break
else:
    # for/else: reached only when none of the 20 attempts hit `break`.
    raise ValueError('No peptide found with at least one non-nan value for the 9 concentrations.')

print(yvals)

# 'Variant' string of the selected row, used in the title and the final print.
selected_variant = row["Variant"].values[0]

# Find DMSO response value
try:
    dmso_idx = concs_sorted.index('DMSO')
    dmso_response = yvals[dmso_idx]
except ValueError:
    # No DMSO column for this drug: no reference line is drawn.
    dmso_response = None

# 4. Plot using plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=concs_sorted, y=yvals, mode='lines+markers', name=f'{selected_drug}'))

# Add horizontal line at DMSO response
if dmso_response is not None:
    fig.add_hline(y=dmso_response, line_color='red', line_dash='dash',
                  annotation_text='DMSO level', annotation_position='top left')

# Category axis: concentration labels are shown evenly spaced in sorted order.
fig.update_layout(title=f'Concentration Response for {selected_drug} (Peptide: {selected_variant})',
                  xaxis_title='Concentration', yaxis_title='Response',
                  xaxis_type='category')
fig.show()

print(f'Selected drug: {selected_drug}')
# Print the variant string itself; formatting the Series would also dump its
# index, name and dtype.
print(f'Selected peptide: {selected_variant}')

In [None]:
# Show the drugs that passed the filter.
filtered_drugs

In [None]:
import numpy as np
from tqdm import tqdm

# Only the first two filtered drugs are processed by the loop below.
x = filtered_drugs[:2]

# Compute and store variation for the drug-peptide combinations of the drugs in x (excluding PDPD)
# Assumes: filtered_drugs, drug_to_cols, df are already defined
# Result: variations dict, zero_variation_combinations list
variations = {}  # (drug, peptide) -> variation value
zero_variation_combinations = []

# Sort by concentration (DMSO first, then numerically)
def conc_sort_key(x):
    """Return the sort key for an entry whose second item is a concentration label.

    'DMSO' sorts first (key -2). A label made only of digits, optionally
    followed by 'nM', sorts by its numeric value. Anything else sorts last
    (key +inf).
    """
    conc = x[1]
    if conc == 'DMSO':
        return -2
    if conc.endswith('nM'):
        number = conc[:-2]
    else:
        number = conc
    if not number.isdigit():
        return float('inf')
    return float(number)

for drug in x:
    # Exclude PDPD; keep DMSO and the remaining concentrations, ordered by
    # conc_sort_key. No minimum number of columns is enforced here.
    drug_cols = [entry for entry in drug_to_cols[drug] if entry[1] != 'PDPD']
    drug_cols_sorted = sorted(drug_cols, key=conc_sort_key)
    col_names_sorted = [entry[0] for entry in drug_cols_sorted]

    # Pull all rows for this drug in one go (rows = peptides, columns = kept
    # concentrations) instead of one df.loc lookup per peptide.
    yvals = df[col_names_sorted].values.astype(float)
    # Per-row standard deviation that ignores NaNs; an all-NaN row gives NaN.
    row_variations = np.nanstd(yvals, axis=1)

    for peptide, variation in zip(df.index, row_variations):
        variations[(drug, peptide)] = variation
        # NOTE: rows with undefined (NaN) variation are recorded here too,
        # alongside rows whose variation is exactly zero.
        if variation == 0 or np.isnan(variation):
            zero_variation_combinations.append((drug, peptide))

# Mean over all stored variations, ignoring NaN entries.
average_variation = np.nanmean(list(variations.values()))

print(f'Total drug-peptide combinations: {len(variations)}')
print(f'Number of combinations with zero variation: {len(zero_variation_combinations)}')

In [None]:
# Show the currently selected drug.
selected_drug

In [None]:
# Use a fixed drug for this plot (not a random pick)
selected_drug = 'AMG-900'
# (column name, concentration label) pairs collected for that drug.
drug_cols = drug_to_cols[selected_drug]

# Sort columns by concentration numerically (include DMSO, exclude PDPD)
def conc_sort_key(x):
    """Return the sort key for an entry whose second item is a concentration label.

    'DMSO' sorts first (key -2). A label made only of digits, optionally
    followed by 'nM', sorts by its numeric value. Every other label,
    including 'PDPD', sorts last (key +inf).
    """
    value = x[1]
    if value == 'DMSO':
        return -2
    # Numeric candidate: the label without a trailing 'nM', or the label itself.
    numeric_part = value[:-2] if value.endswith('nM') else value
    if numeric_part.isdigit():
        return float(numeric_part)
    return float('inf')

# Include DMSO, exclude PDPD
drug_cols = [x for x in drug_cols if x[1] != 'PDPD']
drug_cols_sorted = sorted(drug_cols, key=conc_sort_key)
# Parallel lists in sorted order: column names and their concentration labels.
col_names_sorted = [x[0] for x in drug_cols_sorted]
concs_sorted = [x[1] for x in drug_cols_sorted]

# 2. Select the peptide variant to plot

peptide_variants = df.index

print(peptide_variants)
print(f'Number of peptide variants: {len(peptide_variants)}')

# The peptide is pinned to one variant. The earlier random pick was dead code
# (it was overwritten on the next statement) and has been removed.
target_variant = '.AGNILLNTEGHAK.'
matching_index = df.loc[df['Variant'] == target_variant].index
if len(matching_index) == 0:
    # Fail with a clear message instead of an IndexError from `.index[0]`.
    raise ValueError(f'Variant {target_variant!r} not found in the data.')
# If several rows share this variant, the first one is used.
selected_peptide = matching_index[0]

print(selected_peptide)

# 3. Extract values for this peptide across concentrations
row = df.loc[[selected_peptide]]
yvals = row[col_names_sorted].values.flatten()
# 'Variant' string of the selected row, used in the title and the final print.
selected_variant = row["Variant"].values[0]

# Find DMSO response value
try:
    dmso_idx = concs_sorted.index('DMSO')
    dmso_response = yvals[dmso_idx]
except ValueError:
    # No DMSO column for this drug: no reference line is drawn.
    dmso_response = None

# 4. Plot using plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=concs_sorted, y=yvals, mode='lines+markers', name=f'{selected_drug}'))

# Add horizontal line at DMSO response
if dmso_response is not None:
    fig.add_hline(y=dmso_response, line_color='red', line_dash='dash',
                  annotation_text='DMSO level', annotation_position='top left')

# Category axis: concentration labels are shown evenly spaced in sorted order.
fig.update_layout(title=f'Concentration Response for {selected_drug} (Peptide: {selected_variant})',
                  xaxis_title='Concentration', yaxis_title='Response',
                  xaxis_type='category')
fig.show()

print(f'Selected drug: {selected_drug}')
# Print the variant string itself; formatting the Series would also dump its
# index, name and dtype.
print(f'Selected peptide: {selected_variant}')