In [1]:
import os

# ========== STEP 0: Set working directory (for running locally on laptop) =========
os.getcwd()
os.chdir("/Users/adeslatt/Scitechcon Dropbox/Anne DeslattesMays/projects/oadr-autoantibody")
os.getcwd()

'/Users/adeslatt/Scitechcon Dropbox/Anne DeslattesMays/projects/oadr-autoantibody'

In [2]:
# We'll extract:
# Baseline 4 Hour AUC or a variation like 4 Hour Baseline AUC (depending on the dataset),
# At visit = -1, or equivalent for baseline (you might need to verify that per file).
#
# Note on SDY569
# SDY569 (ITN007AI) doesn’t have an explicit Baseline 4 Hour AUC, but it has:
# BASELINE TOTAL C-PEP AUC which is equivalent (but units may differ).
# So for SDY569, we’ll use:
# BASELINE TOTAL C-PEP AUC

In [3]:
import pandas as pd

# Load the file
df = pd.read_csv("data/raw/SDY569_ITN007AI_LABDATA_2015-03-04_11-26-17.csv")

# Inspect raw 'visitnum' column
print("Raw 'visitnum' unique values:")
print(df["visitnum"].unique())

# Try converting to numeric
df["visitnum_numeric"] = pd.to_numeric(df["visitnum"], errors="coerce")

# Show value counts after conversion
print("\nValue counts for numeric 'visitnum':")
print(df["visitnum_numeric"].value_counts(dropna=False).sort_index())

# Check how many are considered baseline (0 or -1)
is_baseline = df["visitnum_numeric"].isin([0, -1])
print(f"\nNumber of baseline visits (visitnum = 0 or -1): {is_baseline.sum()}")


Raw 'visitnum' unique values:
[19 21 23 25 26 28 29  0 27 -1]

Value counts for numeric 'visitnum':
visitnum_numeric
-1      22
 0     198
 19    220
 21    220
 23    198
 25    198
 26    110
 27     88
 28     66
 29     66
Name: count, dtype: int64

Number of baseline visits (visitnum = 0 or -1): 220


In [4]:
import pandas as pd

# Load file
df = pd.read_csv("data/raw/SDY569_ITN007AI_LABDATA_2015-03-04_11-26-17.csv")

# Clean Sequence Num: extract decimal part and check for 0 or missing
def is_baseline_sequence(seq_val):
    try:
        parts = str(seq_val).split('.')
        if len(parts) == 1:
            return True  # no decimal → treat as baseline
        decimal_part = parts[1].strip()
        if decimal_part == '':
            return True  # blank after decimal → treat as baseline
        return int(decimal_part) == 0
    except:
        return False  # if parsing fails, not baseline

df["Is_Baseline_Sequence"] = df["Sequence Num"].apply(is_baseline_sequence)

# Show value counts
print("Baseline flag based on Sequence Num:")
print(df["Is_Baseline_Sequence"].value_counts())


Baseline flag based on Sequence Num:
Is_Baseline_Sequence
False    1323
True       63
Name: count, dtype: int64


In [5]:
import pandas as pd
import os

# Load file
df = pd.read_csv("data/raw/SDY569_ITN007AI_LABDATA_2015-03-04_11-26-17.csv")

# --- Clean Sequence Num ---
def is_baseline_sequence(seq_val):
    try:
        parts = str(seq_val).split('.')
        if len(parts) == 1 or parts[1].strip() == '':
            return True
        return int(parts[1]) == 0
    except:
        return False

df["Is_Baseline_Sequence"] = df["Sequence Num"].apply(is_baseline_sequence)

# --- Clean Visit Num ---
def is_baseline_visit(v):
    try:
        v = str(v).strip()
        return v in ["-1", "-01", "0", "00"]
    except:
        return False

df["Is_Baseline_Visit"] = df["visitnum"].apply(is_baseline_visit)

# --- Apply both filters ---
df_baseline = df[df["Is_Baseline_Sequence"] & df["Is_Baseline_Visit"]]

# --- Extract columns ---
auc_col = "Total C-PEPTIDE AUC 0-240 MINUTES"
unit_col = "UNITS"

if auc_col in df_baseline.columns and unit_col in df_baseline.columns:
    display_df = df_baseline[["ImmPort Accession", "Participant ID", auc_col, unit_col]].copy()

    # Clean AUC: ensure numeric
    display_df[auc_col] = pd.to_numeric(display_df[auc_col], errors="coerce")

    # Drop rows with missing AUC values
    display_df = display_df.dropna(subset=[auc_col])

    print(f"Final AUC rows: {len(display_df)}")
    print(display_df)

    print("\nUnit counts:")
    print(display_df[unit_col].value_counts(dropna=False))

    # Rename and standardize columns
    display_df = display_df.rename(columns={
        "ImmPort Accession": "Subject_ID",
        "Participant ID": "Participant_ID",
        auc_col: "C_Peptide_AUC_4Hrs",
        unit_col: "Units"
    })
    display_df.insert(0, "Study", "SDY569")
    display_df.insert(3, "Visit_Label", "Baseline")

    # Final output order
    display_df = display_df[[
        "Study",
        "Subject_ID",
        "Participant_ID",
        "Visit_Label",
        "C_Peptide_AUC_4Hrs",
        "Units"
    ]]

    output_path = "data/SDY569_cpeptide_auc_tidy.csv"
    display_df.reset_index(drop=True).to_csv(output_path, index=False)
    print(f"Saved to: {output_path}")
else:
    print(f"Missing column: {auc_col} or {unit_col}")


Final AUC rows: 10
     ImmPort Accession   Participant ID  Total C-PEPTIDE AUC 0-240 MINUTES  \
154          SUB151307  ITN007AI_195962                             0.5625   
352          SUB151308  ITN007AI_260733                             0.7156   
462          SUB151309  ITN007AI_303504                             0.2875   
572          SUB151310  ITN007AI_354273                             0.4052   
770          SUB151312  ITN007AI_415808                             1.5563   
880          SUB151313  ITN007AI_417544                             0.7344   
990          SUB151314  ITN007AI_552163                             0.6979   
1144         SUB151315  ITN007AI_709370                             1.2865   
1210         SUB151316  ITN007AI_902009                             0.1521   
1364         SUB151317  ITN007AI_960474                             0.4854   

      UNITS  
154   NG/ML  
352   NG/ML  
462   NG/ML  
572   NG/ML  
770   NG/ML  
880   NG/ML  
990   NG/ML  
1144  NG/M

In [6]:
# Okay now looking at data for SDY524 -- the measurements are different -- and we will convert them to ng/mL  
# Human C-peptide consists of 31 amino acids: EAEDLQVGQVELGGGPGAGSLQPLALEGSLQ
#
# The mass of this peptide is 3020 Daltons
#
# 1 Dalton = 1/12 mass of a carbon-12 atom≈1.660539×10−24grams
#
# It is equivalent to 1 atomic mass unit (amu).
# 1 Da = 1 g/mol when discussing molecular weights.
#
# The C-peptide \is a synthetic peptide with UniProt accession UPI000002FIBF
#
#
# Conversion formula is:
# 
#      ng/mL = nmol/L × molecular weight (g/mol) ÷ 1000
#            = nmol/L * 3020 g/mol ÷ 1000
#            = 3.02 ng / mL
#
# -- 2026 jan 7 -- have to pause on this because i dont have the subject connection yet


In [7]:
# SDY1737
import os

# ========== STEP 0: Set working directory (for running locally on laptop) =========
os.getcwd()
os.chdir("/Users/adeslatt/Scitechcon Dropbox/Anne DeslattesMays/projects/oadr-autoantibody")
os.getcwd()

import pandas as pd

# --- Load file ---
file_path = "data/raw/SDY1737_ADCPEP2m_2021-02-16_13-39-29_ITN041AI.csv"
df = pd.read_csv(file_path)

# --- Columns of interest ---
subject_col = "User Defined ID"
secondary_id_col = "Accession"
visit_col = "Visit"
auc_col = "Baseline 4 Hour AUC"

# --- Print first few rows for inspection ---
print("HEAD:")
print(df[[subject_col, secondary_id_col, visit_col, auc_col]].head(10))

# --- Check unique visit numbers ---
print("\nUnique Visit Numbers:")
print(df[visit_col].dropna().unique())

# --- Filter step 1: baseline visits ---
baseline_visits = ["Visit -1 (Week -1)"]
df["Visit Number Str"] = df[visit_col].astype(str).str.strip()
df["Is_Baseline_Visit"] = df["Visit Number Str"].isin([str(v) for v in baseline_visits])

print("\nBaseline visit filtering result:")
print(df["Is_Baseline_Visit"].value_counts())

# --- Show rows flagged as baseline ---
print("\nRows flagged as baseline:")
print(df[df["Is_Baseline_Visit"]][[subject_col, secondary_id_col, visit_col, auc_col]].head(20))


HEAD:
  User Defined ID  Accession                Visit  Baseline 4 Hour AUC
0   RETAIN_135342  SUB228868   Visit -1 (Week -1)             0.942366
1   RETAIN_135342  SUB228868    Visit 14 (Week 8)             0.942366
2   RETAIN_135342  SUB228868   Visit 15 (Week 19)             0.942366
3   RETAIN_135342  SUB228868   Visit 17 (Week 52)             0.942366
4   RETAIN_135342  SUB228868  Visit 21 (Week 104)             0.942366
5   RETAIN_143437  SUB228869   Visit -1 (Week -1)             0.734571
6   RETAIN_143437  SUB228869    Visit 14 (Week 8)             0.734571
7   RETAIN_143437  SUB228869   Visit 15 (Week 19)             0.734571
8   RETAIN_143437  SUB228869   Visit 17 (Week 52)             0.734571
9   RETAIN_143437  SUB228869  Visit 21 (Week 104)             0.734571

Unique Visit Numbers:
['Visit -1 (Week -1)' 'Visit 14 (Week 8)' 'Visit 15 (Week 19)'
 'Visit 17 (Week 52)' 'Visit 21 (Week 104)']

Baseline visit filtering result:
Is_Baseline_Visit
False    59
True     16
Name: 

In [10]:
import pandas as pd
import os

# Load file
file_path = "data/raw/SDY797_ADCPEP2_2019-03-01_10-25-10_ITN045AI.csv"
df = pd.read_csv(file_path)

# --- Define columns ---
subject_col = "ImmPort Accession"
participant_col = "participantId"
visit_col = "Visit"
auc_col = "4 Hour AUC"

# --- Filter for baseline visit ---
df["Visit Clean"] = df[visit_col].astype(str).str.strip()
df["Is_Baseline_Visit"] = df["Visit Clean"] == "-1"

# --- Extract relevant rows ---
output_df = df[df["Is_Baseline_Visit"]][[subject_col, participant_col, visit_col, auc_col]].copy()

# Clean AUC values
output_df[auc_col] = pd.to_numeric(output_df[auc_col], errors="coerce")
output_df = output_df.dropna(subset=[auc_col])

# Rename columns to standard
output_df = output_df.rename(columns={
    subject_col: "Subject_ID",
    participant_col: "Participant_ID",
    visit_col: "Visit_Label",
    auc_col: "C_Peptide_AUC_4Hrs"
})

# Insert standard columns
output_df.insert(0, "Study", "SDY797")
output_df["Units"] = "NG/ML"  # Assumed consistent with SDY569

# Reorder
output_df = output_df[[
    "Study",
    "Subject_ID",
    "Participant_ID",
    "Visit_Label",
    "C_Peptide_AUC_4Hrs",
    "Units"
]]

# Save
output_path = "data/SDY797_cpeptide_auc_tidy.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output_df.reset_index(drop=True).to_csv(output_path, index=False)

print(f"[SDY797] Saved tidy baseline AUC values to: {output_path}")
print(f"[SDY797] Final N: {len(output_df)}")
print(output_df.head())


[SDY797] Saved tidy baseline AUC values to: data/SDY797_cpeptide_auc_tidy.csv
[SDY797] Final N: 49
     Study Subject_ID Participant_ID Visit_Label  C_Peptide_AUC_4Hrs  Units
0   SDY797  SUB168890   T1DAL_137962          -1            0.860031  NG/ML
9   SDY797  SUB168891   T1DAL_142037          -1            0.771851  NG/ML
10  SDY797  SUB168892   T1DAL_161919          -1            0.523187  NG/ML
19  SDY797  SUB168893   T1DAL_185333          -1            0.713202  NG/ML
20  SDY797  SUB168894   T1DAL_243767          -1            0.304106  NG/ML


In [11]:
# SDY797
import pandas as pd

# Load the CSV file
file_path = "data/raw/SDY797_ADCPEP2_2019-03-01_10-25-10_ITN045AI.csv"
df = pd.read_csv(file_path)

# --- Define key columns ---
participant_col = "participantId"
subject_col = "ImmPort Accession"
visit_col = "Visit"
AUC_4hr = "4 Hour AUC"
baseline_AUC_4hr = "4 Hour Baseline AUC"
peak_2hr_cpeptide = "2 Hour Peak C-peptide For the Visit"
baseline_peak_2hr_cpeptide= "2 Hour Baseline Peak C-peptide"
peak_4hr_cpeptide = "4 Hour Peak C-peptide For the Visit"
baseline_peak_4hr_cpeptide="4 Hour Baseline Peak C-peptide"
number_timepoints_2hr_auc="Number of Time Points Used in 2 Hr AUC"

# --- Inspect top rows ---
print("HEAD:")
print(df[[subject_col, participant_col, visit_col]].head(10))

# --- Check unique visit values ---
print("\nUnique Visit values:")
print(df[visit_col].dropna().unique())

# --- Filter for baseline visit ---
baseline_visit_value = "-1"
df["Visit Clean"] = df[visit_col].astype(str).str.strip()
df["Is_Baseline_Visit"] = df["Visit Clean"] == baseline_visit_value

print("\nBaseline visit filtering result:")
print(df["Is_Baseline_Visit"].value_counts())

# --- Show flagged baseline rows ---
print("\nBaseline rows:")
print(df[df["Is_Baseline_Visit"]][[
    participant_col, 
    subject_col, 
    visit_col, 
    participant_col,
    AUC_4hr,
    baseline_AUC_4hr,
    peak_2hr_cpeptide,
    baseline_peak_2hr_cpeptide,
    peak_4hr_cpeptide,
    baseline_peak_4hr_cpeptide,
    number_timepoints_2hr_auc
]])


HEAD:
  ImmPort Accession participantId     Visit
0         SUB168890  T1DAL_137962        -1
1         SUB168890  T1DAL_137962   Week 24
2         SUB168890  T1DAL_137962   Week 52
3         SUB168890  T1DAL_137962   Week 78
4         SUB168890  T1DAL_137962  Week 104
5         SUB168891  T1DAL_142037  Week 104
6         SUB168891  T1DAL_142037   Week 78
7         SUB168891  T1DAL_142037   Week 52
8         SUB168891  T1DAL_142037   Week 24
9         SUB168891  T1DAL_142037        -1

Unique Visit values:
['-1' 'Week 24' 'Week 52' 'Week 78' 'Week 104']

Baseline visit filtering result:
Is_Baseline_Visit
False    193
True      49
Name: count, dtype: int64

Baseline rows:
    participantId ImmPort Accession Visit participantId  4 Hour AUC  \
0    T1DAL_137962         SUB168890    -1  T1DAL_137962    0.860031   
9    T1DAL_142037         SUB168891    -1  T1DAL_142037    0.771851   
10   T1DAL_161919         SUB168892    -1  T1DAL_161919    0.523187   
19   T1DAL_185333         SUB168893 

In [12]:
import pandas as pd
import os

# --- Load file ---
file_path = "data/raw/SDY797_ADCPEP2_2019-03-01_10-25-10_ITN045AI.csv"
df = pd.read_csv(file_path)

# --- Columns of interest ---
participant_col = "participantId"
subject_col = "ImmPort Accession"
visit_col = "Visit"
auc_col = "4 Hour AUC"

# --- Filter baseline visits (Visit == -1) ---
df["Visit_Clean"] = df[visit_col].astype(str).str.strip()
df["Is_Baseline"] = df["Visit_Clean"] == "-1"
df_baseline = df[df["Is_Baseline"]].copy()

# --- Keep only needed columns ---
keep_cols = [subject_col, participant_col, visit_col, auc_col]
df_baseline = df_baseline[keep_cols].copy()

# --- Rename columns to tidy format ---
df_baseline = df_baseline.rename(columns={
    subject_col: "Subject_ID",
    participant_col: "Participant_ID",
    visit_col: "Visit_Label",
    auc_col: "C_Peptide_AUC_4Hrs"
})
df_baseline.insert(0, "Study", "SDY797")
df_baseline["Units"] = "NG/ML"  # Assumed consistent with SDY569

# --- Convert AUC to numeric ---
df_baseline["C_Peptide_AUC_4Hrs"] = pd.to_numeric(df_baseline["C_Peptide_AUC_4Hrs"], errors="coerce")

# --- Drop missing values ---
df_baseline = df_baseline.dropna(subset=["C_Peptide_AUC_4Hrs"])

# --- Save output ---
output_path = "data/SDY797_cpeptide_auc_tidy.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_baseline.to_csv(output_path, index=False)

# --- Report ---
print(f"[SDY797] Saved {len(df_baseline)} baseline C-peptide AUC rows.")
print(f"Saved to: {output_path}")


[SDY797] Saved 49 baseline C-peptide AUC rows.
Saved to: data/SDY797_cpeptide_auc_tidy.csv
