In [3]:
import pandas as pd
import os

# Project root; the input and output workbooks are both resolved against it.
main_dir = "/home/ulaval.ca/lesee/projects/Project-NLST/"

# Read the extracted radiomics features (Cohort 1 workbook).
input_path = os.path.join(
    main_dir,
    'data/data_radiomics/radiomicsfeatures_kheops-NLST-Dmitrii-Cohort1_Laptop_v2.xlsx',
)
df = pd.read_excel(input_path)

# Order rows by patient, then study date, then kernel. Both the per-patient
# "first" kernel and the row kept by de-duplication below follow this order.
df = df.sort_values(by=['PatientID', 'StudyDate', 'Kernel'])

# One reference kernel per patient: the first Kernel value of each PatientID
# group in the sorted order (GroupBy.first skips nulls by default).
standard_kernels = (
    df.groupby('PatientID')['Kernel']
    .first()
    .reset_index()
    .rename(columns={'Kernel': 'StandardKernel'})
)

# Attach each patient's reference kernel to every one of that patient's rows.
df = df.merge(standard_kernels, on='PatientID', how='left')

# Keep only the first row of every (PatientID, StudyDate) pair. The .copy()
# yields an independent frame, so the assignment below does not write into a
# view of `df`.
df_deduplicated = df.drop_duplicates(
    subset=['PatientID', 'StudyDate'],
    keep='first',
).copy()

# Overwrite Kernel with the per-patient reference value, then discard the
# helper column so the saved sheet has the same columns as the input.
df_deduplicated.loc[:, 'Kernel'] = df_deduplicated['StandardKernel'].values
df_deduplicated = df_deduplicated.drop(columns=['StandardKernel'])

# Write the cleaned table next to the input workbook, without the index column.
output_path = os.path.join(
    main_dir,
    'data/data_radiomics/radiomicsfeatures_kheops-NLST-Dmitrii-Cohort1_Laptop_v2-cleaned.xlsx',
)
df_deduplicated.to_excel(output_path, index=False)

print("File has been cleaned and saved.")


File has been cleaned and saved.
