# Antibody processing with updated datasets in March

In [None]:
import pandas as pd

# Load the filtered antibody table from disk.
antibody_data = pd.read_csv('C:/Per/LaiJiang/Project/ADDAM/dat/antibody_filtered.csv')

# Show the column layout as a quick sanity check on the input.
print("Antibody data format:")
print(antibody_data.columns)

# Normalise the Subject identifiers in one chained pass:
#   1. drop ordinary spaces,
#   2. drop non-breaking spaces (U+00A0),
#   3. trim whatever whitespace is still left at either end.
antibody_data['Subject'] = (
    antibody_data['Subject']
    .str.replace(' ', '')
    .str.replace('\xa0', '')
    .str.strip()
)


#now we match the genotype data with the clinical data, antibody data
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Bind a second name to the loaded table. This is the same DataFrame object,
# not a copy, so the in-place date conversion below is visible through
# antibody_data as well.
cluster_antibody_data = antibody_data

# Columns that get_most_recent_non_missing summarises for each subject.
columns_antibody_list = [
    '96GAD Qual', '96GAD Index',
    'Draw Date',
    'IA2 Qual', 'IA2 Index',
    'ZnT8 Qual', 'ZnT8 Index',
]

# Parse 'Draw Date' into datetimes so it sorts chronologically. Entries that
# cannot be parsed (e.g. "No date") are coerced to NaT instead of raising.
cluster_antibody_data['Draw Date'] = pd.to_datetime(cluster_antibody_data['Draw Date'], errors='coerce')

# Order rows by subject (ascending) and, within a subject, by draw date
# descending so that the most recent draw is the first row of each subject.
cluster_antibody_data = cluster_antibody_data.sort_values(
    ['Subject', 'Draw Date'],
    ascending=[True, False],
)

# Report how many distinct subjects the table contains.
print("Unique Subject in the combined antibody data:", len(cluster_antibody_data['Subject'].unique()))

# Function to get the most recent non-missing value for each antibody column
def get_most_recent_non_missing(group, columns=None):
    """Collapse one subject's antibody rows into a single summary row.

    Rows are scanned from top to bottom, so ``group`` should already be
    sorted with the most recent draw first; this function does not sort.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows belonging to one subject. Must contain 'Subject',
        'Draw Date' and every column named in ``columns``.
    columns : iterable of str, optional
        Columns to summarise. When omitted, the module-level
        ``columns_antibody_list`` is used, which keeps
        ``groupby(...).apply(get_most_recent_non_missing)`` working unchanged.

    Returns
    -------
    pandas.Series
        'Subject' and 'Draw Date' from the first row, followed by one entry
        per requested column:

        * Qual columns (name starts with '96GAD Qual', 'IA2 Qual' or
          'ZnT8 Qual', with or without a threshold suffix): the first value
          equal to 'POS' or 'neg', else None.
        * any other column: the first non-missing value, else None.

        If 'Draw Date' is itself listed in ``columns`` it is handled like any
        other non-Qual column and replaces the first-row value.
    """
    if columns is None:
        # Resolved at call time so the caller may (re)define the list later.
        columns = columns_antibody_list

    # Prefix match so headers carrying a threshold suffix are still
    # recognised as Qual columns.
    qual_prefixes = ('96GAD Qual', 'IA2 Qual', 'ZnT8 Qual')
    valid_qual_values = ['POS', 'neg']

    result = {'Subject': group['Subject'].iloc[0], 'Draw Date': group['Draw Date'].iloc[0]}

    for col in columns:
        if col.startswith(qual_prefixes):
            # Keep only a recognised qualitative call; anything else is skipped.
            result[col] = next((val for val in group[col] if val in valid_qual_values), None)
        else:
            # Drop missing entries once and reuse the result.
            non_missing = group[col].dropna()
            result[col] = non_missing.iloc[0] if not non_missing.empty else None
    return pd.Series(result)

# Reduce every subject's rows to one summary row with
# get_most_recent_non_missing.
most_recent_antibodies = (
    cluster_antibody_data
    .groupby('Subject')
    .apply(get_most_recent_non_missing)
)

# Report the shape of the per-subject table.
print(" matched subject (genotype, clinical and antibody) antibody data dimension:", most_recent_antibodies.shape)

# Write the per-subject table to CSV without the index.
most_recent_antibodies.to_csv(
    'C:/Per/LaiJiang/Project/ADDAM/HP/dat/2_3_atb_ori.csv',
    index=False,
)



Antibody data format:
Index(['Subject', 'Draw Date', '96GAD Qual', '96GAD Index', 'IA2 Qual',
       'IA2 Index', 'ZnT8 Qual', 'ZnT8 Index'],
      dtype='object')
Unique Subject in the combined antibody data: 1377
 matched subject (genotype, clinical and antibody) antibody data dimension: (1377, 8)


## Now process the updated antibody data

In [None]:
import pandas as pd

# Load the March update of the antibody table from disk.
# NOTE(review): the recorded cell output reports a FileNotFoundError for
# 'antibody_updates_march.csv', whereas this path spells the file
# 'antibdy_updates_march.csv' -- confirm which name exists on disk.
antibody_data = pd.read_csv('C:/Per/LaiJiang/Project/ADDAM/dat/antibdy_updates_march.csv')

# Show the column layout as a quick sanity check on the input.
print("Antibody data format:")
print(antibody_data.columns)

# Normalise the Subject identifiers in one chained pass:
#   1. drop ordinary spaces,
#   2. drop non-breaking spaces (U+00A0),
#   3. trim whatever whitespace is still left at either end.
antibody_data['Subject'] = (
    antibody_data['Subject']
    .str.replace(' ', '')
    .str.replace('\xa0', '')
    .str.strip()
)


#now we match the genotype data with the clinical data, antibody data
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Bind a second name to the loaded table. This is the same DataFrame object,
# not a copy, so the in-place date conversion below is visible through
# antibody_data as well.
cluster_antibody_data = antibody_data

# Columns that get_most_recent_non_missing summarises for each subject.
# In this list the Qual column names carry a threshold suffix.
columns_antibody_list = [
    '96GAD Qual ≥0.070', '96GAD Index',
    'Draw Date',
    'IA2 Qual ≥0.010', 'IA2 Index',
    'ZnT8 Qual ≥0.150', 'ZnT8 Index',
]

# Parse 'Draw Date' into datetimes so it sorts chronologically. Entries that
# cannot be parsed (e.g. "No date") are coerced to NaT instead of raising.
cluster_antibody_data['Draw Date'] = pd.to_datetime(cluster_antibody_data['Draw Date'], errors='coerce')

# Order rows by subject (ascending) and, within a subject, by draw date
# descending so that the most recent draw is the first row of each subject.
cluster_antibody_data = cluster_antibody_data.sort_values(
    ['Subject', 'Draw Date'],
    ascending=[True, False],
)

# Report how many distinct subjects the table contains.
print("Unique Subject in the combined antibody data:", len(cluster_antibody_data['Subject'].unique()))

# Function to get the most recent non-missing value for each antibody column
def get_most_recent_non_missing(group, columns=None):
    """Collapse one subject's antibody rows into a single summary row.

    Rows are scanned from top to bottom, so ``group`` should already be
    sorted with the most recent draw first; this function does not sort.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows belonging to one subject. Must contain 'Subject',
        'Draw Date' and every column named in ``columns``.
    columns : iterable of str, optional
        Columns to summarise. When omitted, the module-level
        ``columns_antibody_list`` is used, which keeps
        ``groupby(...).apply(get_most_recent_non_missing)`` working unchanged.

    Returns
    -------
    pandas.Series
        'Subject' and 'Draw Date' from the first row, followed by one entry
        per requested column:

        * Qual columns (name starts with '96GAD Qual', 'IA2 Qual' or
          'ZnT8 Qual', with or without a threshold suffix such as
          ' ≥0.070'): the first value equal to 'POS' or 'neg', else None.
        * any other column: the first non-missing value, else None.

        If 'Draw Date' is itself listed in ``columns`` it is handled like any
        other non-Qual column and replaces the first-row value.
    """
    if columns is None:
        # Resolved at call time so the caller may (re)define the list later.
        columns = columns_antibody_list

    # Prefix match rather than exact match: the Qual headers in this dataset
    # carry a threshold suffix, so an exact comparison against the bare names
    # never matched and the POS/neg filter was silently skipped.
    qual_prefixes = ('96GAD Qual', 'IA2 Qual', 'ZnT8 Qual')
    # NOTE(review): assumes the updated file still encodes calls exactly as
    # 'POS' / 'neg' -- confirm against the data.
    valid_qual_values = ['POS', 'neg']

    result = {'Subject': group['Subject'].iloc[0], 'Draw Date': group['Draw Date'].iloc[0]}

    for col in columns:
        if col.startswith(qual_prefixes):
            # Keep only a recognised qualitative call; anything else is skipped.
            result[col] = next((val for val in group[col] if val in valid_qual_values), None)
        else:
            # Drop missing entries once and reuse the result.
            non_missing = group[col].dropna()
            result[col] = non_missing.iloc[0] if not non_missing.empty else None
    return pd.Series(result)

# Reduce every subject's rows to one summary row with
# get_most_recent_non_missing.
most_recent_antibodies = (
    cluster_antibody_data
    .groupby('Subject')
    .apply(get_most_recent_non_missing)
)

# Report the shape of the per-subject table.
print(" matched subject (genotype, clinical and antibody) antibody data dimension:", most_recent_antibodies.shape)

# Write the per-subject table to CSV without the index.
most_recent_antibodies.to_csv(
    'C:/Per/LaiJiang/Project/ADDAM/HP/dat/2_3_atb_updates.csv',
    index=False,
)



FileNotFoundError: [Errno 2] No such file or directory: 'C:/Per/LaiJiang/Project/ADDAM/dat/antibody_updates_march.csv'