In [1]:
import pandas as pd 
import numpy as np

In [None]:
class COPDPreprocessing:
    """Preprocess a patient table loaded from a CSV file.

    Typical order of use: ``create_treatment_columns``, ``limit_missingness``,
    ``impute_missing_values``, ``save_data``. Every step rewrites ``self.df``.

    Attributes:
        df: The working ``pandas.DataFrame``.
        continuous_columns: Numeric column names (excluding the patient id),
            populated by ``impute_missing_values``.
    """

    # Source medication columns, in priority order, merged into one column each.
    _FLUID_COLUMNS = ['MEDS_220949.0', 'MEDS_225943.0', 'MEDS_225158.0']
    _SEDATION_COLUMNS = ['MEDS_222168.0', 'MEDS_225942.0']
    # Grouping key used for per-patient imputation.
    _ID_COLUMN = 'PatientID'
    # Columns that are never removed by the missingness filter. The id column
    # is protected because imputation groups by it.
    _PROTECTED_COLUMNS = ['sedation', 'Fluids_intakes', 'PatientID']

    def __init__(self, filename):
        """Load the table to preprocess.

        Args:
            filename: Path (or any source accepted by ``pandas.read_csv``) of
                the input CSV.
        """
        self.df = pd.read_csv(filename)
        self.continuous_columns = []

    def _first_nonzero(self, columns):
        """Return, per row, the first recorded non-zero value among *columns*.

        Missing values are skipped, so a NaN in an earlier column does not hide
        a real value in a later one (``NaN != 0`` is True, which would
        otherwise make the NaN win). When a row has no recorded non-zero
        value, the result is 0 if every column holds an observed 0 and NaN if
        any of them is missing.

        Args:
            columns: Column names of ``self.df`` in priority order.

        Returns:
            A NumPy array with one entry per row of ``self.df``.
        """
        frame = self.df[columns]
        recorded = frame.notna() & (frame != 0)
        picked = np.select(
            [recorded[col].to_numpy() for col in columns],
            [frame[col].to_numpy() for col in columns],
            default=0,
        )
        unknown = (~recorded.any(axis=1) & frame.isna().any(axis=1)).to_numpy()
        # Only touch the array when needed so all-integer inputs stay integer.
        if unknown.any():
            picked = np.where(unknown, np.nan, picked)
        return picked

    def create_treatment_columns(self):
        """Collapse the raw medication columns into two treatment columns.

        Adds ``Fluids_intakes`` and ``sedation``, each holding the first
        recorded non-zero value of its source columns, and drops the source
        columns from ``self.df``.

        Raises:
            KeyError: If any of the source medication columns is absent.
        """
        # Fluid intakes
        self.df['Fluids_intakes'] = self._first_nonzero(self._FLUID_COLUMNS)
        self.df.drop(self._FLUID_COLUMNS, axis=1, inplace=True)

        # Sedation
        self.df['sedation'] = self._first_nonzero(self._SEDATION_COLUMNS)
        self.df.drop(self._SEDATION_COLUMNS, axis=1, inplace=True)

    def limit_missingness(self, max_missing_percent=20):
        """Drop columns whose share of missing values is too high.

        The treatment columns and the patient id column are always kept.

        Args:
            max_missing_percent: Columns with strictly more than this
                percentage of NaN values are dropped. Defaults to 20.
        """
        percent_missing = self.df.isna().mean() * 100
        too_sparse = percent_missing > max_missing_percent
        protected = percent_missing.index.isin(self._PROTECTED_COLUMNS)
        columns_to_drop = percent_missing[too_sparse & ~protected].index
        self.df = self.df.drop(columns=columns_to_drop)

    def impute_missing_values(self):
        """Fill missing values: per-patient mean first, then zero.

        Numeric columns (other than the patient id) have their NaNs replaced
        by the mean of the same column within the same patient. Whatever is
        still missing afterwards, in any column, is set to 0. Observed values
        are never modified.

        Raises:
            KeyError: If a numeric column has gaps and the patient id column
                is absent.
        """
        numeric_columns = self.df.select_dtypes(include=[np.number]).columns
        self.continuous_columns = [
            col for col in numeric_columns if col != self._ID_COLUMN
        ]

        # Restrict the work to columns that actually have gaps: cheaper, and
        # columns without NaNs (e.g. integer columns) keep their dtype.
        columns_with_gaps = [
            col for col in self.continuous_columns if self.df[col].isna().any()
        ]
        if columns_with_gaps:
            # Vectorised group means aligned to the original rows. Using
            # fillna (instead of assigning a transformed frame) keeps observed
            # values intact even for rows whose patient id is missing.
            patient_means = (
                self.df.groupby(self._ID_COLUMN)[columns_with_gaps].transform('mean')
            )
            self.df[columns_with_gaps] = self.df[columns_with_gaps].fillna(patient_means)

        # Fill remaining NaNs with zero
        self.df.fillna(0, inplace=True)

    def save_data(self, filename='COPD_1302.csv', pickle_filename='COPD_1302.pkl'):
        """Write ``self.df`` to disk as both CSV and pickle.

        Args:
            filename: Destination of the CSV file (written without the index).
            pickle_filename: Destination of the pickle file.
        """
        self.df.to_csv(filename, index=False)
        self.df.to_pickle(pickle_filename)

In [None]:
# Example usage: load 'FullFrame.csv', then run each preprocessing step in
# order, finishing with the save to the default output files.
processor = COPDPreprocessing('FullFrame.csv')
for run_step in (
    processor.create_treatment_columns,
    processor.limit_missingness,
    processor.impute_missing_values,
    processor.save_data,
):
    run_step()