In [1]:
!pip install seclea-ai==1.0.2
!conda install seclea-ai==1.0.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seclea-ai==1.0.2
  Downloading seclea_ai-1.0.2-py3-none-any.whl (281 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting pickleDB>=0.9.2
  Downloading pickleDB-0.9.2.tar.gz (3.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pickleDB
  Building wheel for pickleDB (setup.py) ... [?25l[?25hdone
  Created wheel for pickleDB: filename=pickleDB-0.9.2-py3-none-any.whl size=4269 sha256=0d9bef5586e4b83073d5f0893c57ad73d6df461c66afbf2982c4045b98cf90a6
  Stored in directory: /root/.cache/pip/wheels/88/91/d4/ef2e6a46ad2bc41f9cfad35fa2db5b34357a5e4da67c385ffa
Successfully built pickleDB
Installing collected packages: pickleDB, seclea-ai
Successfully installed pickleDB-0.9.2 seclea-ai-1.0.2
/bin/bash: conda: command not found


In [2]:
from seclea_ai import SecleaAI

# NOTE - use the organization name provided to you when issued credentials.
seclea = SecleaAI(project_name="HeartFailure", organization='Seclea')

Username: mariaantony
Password: ··········
success


In [3]:
import numpy as np
import pandas as pd

# load the data
data = pd.read_csv('/content/heart_failure_clinical_records_dataset.csv')

# define the metadata for the dataset.
dataset_metadata = {"outcome_name": "DEATH_EVENT",
                    "favourable_outcome": "1",
                    "unfavourable_outcome": "0",
                    "continuous_features": [
                                            "age",
                                            'anaemia',
                                            'creatinine_phosphokinase',
                                            'diabetes',
                                            'ejection_fraction',
                                            'high_blood_pressure',
                                            'platelets',
                                            'serum_creatinine',
                                            'serum_sodium',
                                            'sex',
                                            'smoking',
                                            'time',
                                            'DEATH_EVENT',
                                            ]}


# ⬆️ upload the dataset - pick a meaningful name here, you'll be seeing it a lot on the platform!
seclea.upload_dataset(dataset=data, dataset_name="Heart Failure", metadata=dataset_metadata)

In [4]:
# Create a copy to isolate the original dataset
df1 = data.copy(deep=True)

def encode_nans(df):
    # convert the special characters to nans
    return df.replace('?', np.NaN)

df2 = encode_nans(df1)

In [5]:
## Drop the the column which are more than some proportion NaN values
def drop_nulls(df, threshold):
    cols = [x for x in df.columns if df[x].isnull().sum() / df.shape[0] > threshold]
    return df.drop(columns=cols)

# We choose 95% as our threshold
null_thresh = 0.95
df3 = drop_nulls(df2, threshold=null_thresh)

def drop_correlated(data, thresh):
    import numpy as np

    # calculate correlations
    corr_matrix = data.corr().abs()
    # get the upper part of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    # columns with correlation above threshold
    redundant = [column for column in upper.columns if any(upper[column] >= thresh)]
    print(f"Columns to drop with correlation > {thresh}: {redundant}")
    new_data = data.drop(columns=redundant)
    return new_data

# drop columns that are too closely correlated
correlation_threshold = 0.95
df4 = drop_correlated(df3, correlation_threshold)

Columns to drop with correlation > 0.95: []


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [6]:
from seclea_ai.transformations import DatasetTransformation


# define the updates to the metadata - only changes are updated - here a continuous feature has been dropped so now
# we remove it from the list of continuous features.
processed_metadata = {"continuous_features": [
                                            "age",
                                            'anaemia',
                                            'creatinine_phosphokinase',
                                            'diabetes',
                                            'ejection_fraction',
                                            'high_blood_pressure',
                                            'platelets',
                                            'serum_creatinine',
                                            'serum_sodium',
                                            'sex',
                                            'smoking',
                                            'time',
                                            'DEATH_EVENT',
                                            ]}

# 🔀 define the transformations - note the arguments
cleaning_transformations = [
            DatasetTransformation(encode_nans, data_kwargs={"df": df1}, kwargs={}, outputs=["df"]),
            DatasetTransformation(
                drop_nulls, data_kwargs={"df": "inherit"}, kwargs={"threshold": null_thresh}, outputs=["data"]
            ),
            DatasetTransformation(
                drop_correlated, data_kwargs={"data": "inherit"}, kwargs={"thresh": correlation_threshold}, outputs=["df"]
            ),
        ]



def fill_nan_const(df, val):
    """Fill NaN values in the dataframe with a constant value"""
    return df.replace(['None', np.nan], val)


# Fill nans in 1st dataset with -1
const_val = -1
df_const = fill_nan_const(df4, const_val)

def fill_nan_mode(df, columns):
    """
    Fills nans in specified columns with the mode of that column
    Note that we want to make sure to not modify the dataset we passed in but to
    return a new copy.
    We do that by making a copy and specifying deep=True.
    """
    new_df = df.copy(deep=True)
    for col in df.columns:
        if col in columns:
            new_df[col] = df[col].fillna(df[col].mode()[0])
    return new_df

nan_cols = ['high_blood_pressure','platelets', 'serum_creatinine']
df_mode = fill_nan_mode(df4, nan_cols)



# find columns with categorical data for both dataset
cat_cols = df_const.select_dtypes(include=['object']).columns.tolist()

def encode_categorical(df, cat_cols):
  from sklearn.preprocessing import LabelEncoder

  new_df = df.copy(deep=True)
  for col in cat_cols:
    if col in df.columns:
        le = LabelEncoder()
        le.fit(list(df[col].astype(str).values))
        new_df[col] = le.transform(list(df[col].astype(str).values))
  return new_df

df_const = encode_categorical(df_const, cat_cols)
df_mode = encode_categorical(df_mode, cat_cols)

# Update metadata with new encoded values for the outcome column.
encoded_metadata = {"favourable_outcome": 0,
                    "unfavourable_outcome": 1,}


# 🔀 define the transformations - for the constant fill dataset
const_processed_transformations = [
    DatasetTransformation(fill_nan_const, data_kwargs={"df": df4}, kwargs={"val": const_val}, outputs=["df"]),
    DatasetTransformation(encode_categorical, data_kwargs={"df": "inherit"}, kwargs={"cat_cols":cat_cols}, outputs=["df"]),
]



# 🔀 define the transformations - for the mode fill dataset
mode_processed_transformations = [
    DatasetTransformation(fill_nan_mode, data_kwargs={"df": df4}, kwargs={"columns": nan_cols}, outputs=["df"]),
    DatasetTransformation(encode_categorical, data_kwargs={"df": "inherit"}, kwargs={"cat_cols": cat_cols}, outputs=["df"]),
]



def get_samples_labels(df, output_col):
    X = df.drop(output_col, axis=1)
    y = df[output_col]

    return X, y

# split the datasets into samples and labels ready for modelling.
X_const, y_const = get_samples_labels(df_const, "DEATH_EVENT")
X_mode, y_mode = get_samples_labels(df_mode, "DEATH_EVENT")

def get_test_train_splits(X, y, test_size, random_state):
    from sklearn.model_selection import train_test_split

    return train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    # returns X_train, X_test, y_train, y_test

# split into test and train sets
X_train_const, X_test_const, y_train_const, y_test_const = get_test_train_splits(X_const, y_const, test_size=0.2, random_state=42)
X_train_mode, X_test_mode, y_train_mode, y_test_mode = get_test_train_splits(X_mode, y_mode, test_size=0.2, random_state=42)

# 🔀 define the transformations - for the constant fill training set
const_train_transformations = [
    DatasetTransformation(
            get_test_train_splits,
            data_kwargs={"X": X_const, "y": y_const},
            kwargs={"test_size": 0.2, "random_state": 42},
            outputs=["X_train_const", None, "y_train_const", None],
            split="train",
            ),
]

# ⬆️ upload the const fill training set
#seclea.upload_dataset_split(
 #                       X=X_train_const,
  #                      y=y_train_const,
   #                     dataset_name="Heart failure - Const Fill - Train",
    #                    metadata={},
     #                   transformations=const_train_transformations
#)

# 🔀 define the transformations - for the constant fill test set
const_test_transformations = [
    DatasetTransformation(
            get_test_train_splits,
            data_kwargs={"X": X_const, "y": y_const},
            kwargs={"test_size": 0.2, "random_state": 42},
            outputs=[None, "X_test_const", None, "y_test_const"],
            split="test"
            ),
]

# ⬆️ upload the const fill test set
#seclea.upload_dataset_split(X=X_test_const,
#                     y=y_test_const,
 #                     dataset_name="Heart failure - Const Fill - Test",
  #                    metadata={},
   #                   transformations=const_test_transformations)

# 🔀 define the transformations - for the mode fill training set
mode_train_transformations = [
    DatasetTransformation(
            get_test_train_splits,
            data_kwargs={"X": X_mode, "y": y_mode},
            kwargs={"test_size": 0.2, "random_state": 42},
            outputs=["X_train_mode", None, "y_train_mode", None],
            split="train",
            ),
]

# ⬆️ upload the mode fill train set
#seclea.upload_dataset_split(X=X_train_mode,
 #                     y=y_train_mode,
  #                    dataset_name="Heart failure - Mode Fill - Train",
   #                   metadata=processed_metadata,
    #                  transformations=mode_train_transformations)

# 🔀 define the transformations - for the mode fill test set
mode_test_transformations = [
    DatasetTransformation(
            get_test_train_splits,
            data_kwargs={"X": X_mode, "y": y_mode},
            kwargs={"test_size": 0.2, "random_state": 42},
            outputs=[None, "X_test_mode", None, "y_test_mode"],
            split="test",
            ),
]

# ⬆️ upload the mode fill test set
#seclea.upload_dataset_split(X=X_test_mode,
 #                     y=y_test_mode,
  #                    dataset_name="Heart failure - Mode Fill - Test",
   #                   metadata={},
    #                  transformations=mode_test_transformations)



def smote_balance(X, y, random_state):
    from imblearn.over_sampling import SMOTE

    sm = SMOTE(random_state=random_state)

    X_sm, y_sm = sm.fit_resample(X, y)

    print(
        f"""Shape of X before SMOTE: {X.shape}
    Shape of X after SMOTE: {X_sm.shape}"""
    )
    print(
        f"""Shape of y before SMOTE: {y.shape}
    Shape of y after SMOTE: {y_sm.shape}"""
    )
    return X_sm, y_sm
    # returns X, y

# balance the training sets - creating new training sets for comparison
X_train_const_smote, y_train_const_smote = smote_balance(X_train_const, y_train_const, random_state=42)
X_train_mode_smote, y_train_mode_smote = smote_balance(X_train_mode, y_train_mode, random_state=42)

# 🔀 define the transformations - for the constant fill balanced train set
const_smote_transformations = [
    DatasetTransformation(
            smote_balance,
            data_kwargs={"X": X_train_const, "y": y_train_const},
            kwargs={"random_state": 42},
            outputs=["X", "y"]
            ),
]

# ⬆️ upload the constant fill balanced train set
#seclea.upload_dataset_split(X=X_train_const_smote,
 #                     y=y_train_const_smote,
  #                    dataset_name="Heart failure - Const Fill - Smote Train",
   #                   metadata={},
    #                  transformations=const_smote_transformations)

# 🔀 define the transformations - for the mode fill balanced train set
mode_smote_transformations = [
    DatasetTransformation(
            smote_balance,
            data_kwargs={"X": X_train_mode, "y": y_train_mode},
            kwargs={"random_state": 42},
            outputs=["X", "y"]
            ),
]

# ⬆️ upload the mode fill balanced train set
#seclea.upload_dataset_split(X=X_train_mode_smote,
 #                     y=y_train_mode_smote,
  #                    dataset_name="Heart failure - Mode Fill - Smote Train",
   #                   metadata={},
    #                  transformations=mode_smote_transformations)

Shape of X before SMOTE: (239, 12)
    Shape of X after SMOTE: (324, 12)
Shape of y before SMOTE: (239,)
    Shape of y after SMOTE: (324,)
Shape of X before SMOTE: (239, 12)
    Shape of X after SMOTE: (324, 12)
Shape of y before SMOTE: (239,)
    Shape of y after SMOTE: (324,)


In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

classifiers = {
    "RandomForestClassifier": RandomForestClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier()
}

datasets = [
    ("Const Fill", (X_train_const, X_test_const, y_train_const, y_test_const)),
    ("Mode Fill", (X_train_mode, X_test_mode, y_train_mode, y_test_mode)),
    ("Const Fill Smote", (X_train_const_smote, X_test_const, y_train_const_smote, y_test_const)),
    ("Mode Fill Smote", (X_train_mode_smote, X_test_mode, y_train_mode_smote, y_test_mode))
    ]

for name, (X_train, X_test, y_train, y_test) in datasets:

    for key, classifier in classifiers.items():
        # cross validate to get an idea of generalisation.
        training_score = cross_val_score(classifier, X_train, y_train, cv=5)

        # train on the full training set
        classifier.fit(X_train, y_train)

        # ⬆️ upload the fully trained model
       # seclea.upload_training_run_split(model=classifier, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

        # test accuracy
        y_preds = classifier.predict(X_test)
        test_score = accuracy_score(y_test, y_preds)
        print(f"Classifier: {classifier.__class__.__name__} has a training score of {round(training_score.mean(), 3) * 100}% accuracy score on {name}")
        print(f"Classifier: {classifier.__class__.__name__} has a test score of {round(test_score, 3) * 100}% accuracy score on {name}")

Classifier: RandomForestClassifier has a training score of 85.39999999999999% accuracy score on Const Fill
Classifier: RandomForestClassifier has a test score of 83.3% accuracy score on Const Fill
Classifier: DecisionTreeClassifier has a training score of 76.2% accuracy score on Const Fill
Classifier: DecisionTreeClassifier has a test score of 76.7% accuracy score on Const Fill
Classifier: GradientBoostingClassifier has a training score of 84.89999999999999% accuracy score on Const Fill
Classifier: GradientBoostingClassifier has a test score of 80.0% accuracy score on Const Fill
Classifier: RandomForestClassifier has a training score of 85.39999999999999% accuracy score on Mode Fill
Classifier: RandomForestClassifier has a test score of 81.69999999999999% accuracy score on Mode Fill
Classifier: DecisionTreeClassifier has a training score of 77.8% accuracy score on Mode Fill
Classifier: DecisionTreeClassifier has a test score of 71.7% accuracy score on Mode Fill
Classifier: GradientBoos