In [None]:
# import relevant libraries and modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Remove pandas' display limits: show every row and the full width of each column.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Load the preprocessed ICU dataset (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Preprocessed ICU data.csv")

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Inspect the distinct values stored in the Gender column.
df["Gender"].unique()

array([1.        , 0.        , 0.56192144])

In [None]:
# Keep only the rows whose Gender is one of the two valid codes (0 or 1).
# The only other value observed in the column is the fractional 0.56192144
# (presumably an imputed column mean -- TODO confirm); selecting the valid codes
# avoids hard-coding that float and a comparison tolerance.
# NOTE: unlike the previous np.isclose filter, rows with a missing Gender are dropped too.
# The index is rebuilt so row labels stay contiguous after the rows are removed.
df = df[df["Gender"].isin([0, 1])].reset_index(drop=True)

In [None]:
# Re-check the Gender values after filtering.
df["Gender"].unique()

array([1., 0.])

In [None]:
# Positional column layout of df: 7 leading general columns (Age and Height are
# among them), then six consecutive groups of 37 columns each, then the outcome
# columns. The group names suggest mean/min/max statistics for two separate
# periods -- TODO confirm against the preprocessing step.
N_GENERAL_COLS = 7
N_GROUP_COLS = 37
group_starts = [N_GENERAL_COLS + k * N_GROUP_COLS for k in range(6)]  # 7, 44, 81, 118, 155, 192

gen = df.iloc[:, :N_GENERAL_COLS]
mean1, min1, max1, mean2, min2, max2 = (
    df.iloc[:, start:start + N_GROUP_COLS] for start in group_starts
)
outcome = df.iloc[:, group_starts[-1] + N_GROUP_COLS:]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# gen = gen.drop("RecordID", axis =1)
# Column-wise union of the first mean / max / min groups.
first = pd.concat((mean1, max1, min1), axis = "columns")

# Three reproducible random draws of 7 columns from mean1, one per seed.
rand_samp1, rand_samp2, rand_samp3 = (
    mean1.sample(n = 7, axis = "columns", replace = False, ignore_index = True, random_state = seed)
    for seed in (42, 4, 9)
)

In [None]:
# Age and Height from the general columns, placed side by side with the mean1 group.
try1 = pd.concat((gen.loc[:, ["Age", "Height"]], mean1), axis = "columns")

In [None]:
# Target variable: the "In.hospital_death" column of the outcome block.
y = outcome.loc[:, "In.hospital_death"]
# Input features: the first random 7-column sample drawn from mean1.
X = rand_samp1

In [None]:
# Fit the scaler on X, then apply it to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Fit a 3-component PCA on the standardized features and keep the projected data.
pca = PCA(n_components = 3)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Per-component share of the variance, followed by its running total.
variance_ratio = pca.explained_variance_ratio_
print("Explained variance: ", variance_ratio)
print("Cumulative: ", variance_ratio.cumsum())

In [None]:
# Map the projection back to the standardized feature space and measure the
# mean squared difference from the standardized input.
X_reconstructed = pca.inverse_transform(X_pca)
residual = X_scaled - X_reconstructed
reconstruction_loss = np.square(residual).mean()
print(f"Reconstruction Loss: {reconstruction_loss:.4f}")

In [None]:
# Helper that runs PCA on several random feature subsets of a dataset and prints the results of each trial

def display_pca(dataset,n, size, components):
    """Run PCA on `n` random column subsets of `dataset` and print the results.

    For trial i (0-based) a subset of `size` columns is drawn without
    replacement using random_state=i, standardized with StandardScaler and
    reduced with PCA(n_components=components). Each trial prints the
    explained-variance ratios, their cumulative sum, the 1-based trial number
    and the reconstruction loss (mean squared difference between the
    standardized data and its PCA reconstruction). After the last trial the
    running means of the final cumulative variance and of the reconstruction
    loss are printed.

    Parameters
    ----------
    dataset : DataFrame whose columns are sampled.
    n : number of random subsets (trials).
    size : number of columns in each subset.
    components : number of principal components to keep.

    Returns
    -------
    None; all results are printed.
    """
    avg_cum_variance = 0
    avg_loss = 0
    for trial in range(n):
        subset = dataset.sample(size, axis = 1, ignore_index = True, replace = False, random_state = trial)
        standardized = StandardScaler().fit_transform(subset)
        reducer = PCA(n_components = components)
        projected = reducer.fit_transform(standardized)
        restored = reducer.inverse_transform(projected)
        loss = np.mean((standardized - restored) ** 2)
        ratios = reducer.explained_variance_ratio_
        cumulative = np.cumsum(ratios)
        print(f"Explained variance: {ratios}\nCumulative: {cumulative}\nTrial no.: {trial+1}\nReconstruction Loss: {loss:.4f}\n")
        # Incremental update of the means over the trials seen so far.
        avg_cum_variance = (avg_cum_variance*trial+cumulative[-1])/(trial+1)
        avg_loss = (avg_loss*trial+loss)/(trial+1)
    print(f"Mean variance: {avg_cum_variance}\nMean reconstruction loss: {avg_loss}")
        

In [None]:
# 10 trials, 8 random mean1 columns per trial, 3 principal components.
display_pca(mean1, n = 10, size = 8, components = 3)

In [None]:
def display_pca2(dataset: pd.DataFrame,n: int, size: int, components: int) -> pd.DataFrame:
    """
    Run PCA on `n` random column subsets of `dataset` and tabulate the results.

    For trial i (0-based) a subset of `size` columns is drawn without
    replacement using random_state=i, standardized with StandardScaler and
    reduced with PCA(n_components=components).

    Parameters
    ----------
    dataset : DataFrame whose columns are sampled.
    n : number of random subsets (trials).
    size : number of columns in each subset.
    components : number of principal components to keep.

    Returns
    -------
    DataFrame with one row per trial plus a final "Means" row, and columns:
      "Trial no."            1-based trial number ("Means" on the last row)
      "Cumulative variance"  total explained-variance ratio of the kept components
      "Reconstruction Loss"  mean squared difference between the standardized
                             data and its PCA reconstruction
      "Features chosen"      list of the sampled subset's column labels
                             (missing on the "Means" row)
    """
    column_order = ["Trial no.", "Cumulative variance", "Reconstruction Loss", "Features chosen"]

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop is quadratic and degrades the
    # numeric columns to object dtype.
    records = []
    for i in range(n):
        rand_samp = dataset.sample(size, axis = 1, ignore_index = True, replace = False, random_state = i)
        X_scaled = StandardScaler().fit_transform(rand_samp)
        pca = PCA(n_components = components)
        X_pca = pca.fit_transform(X_scaled)
        X_reconstructed = pca.inverse_transform(X_pca)
        records.append({
            "Trial no.": i + 1,
            "Cumulative variance": float(np.sum(pca.explained_variance_ratio_)),
            "Reconstruction Loss": float(np.mean((X_scaled - X_reconstructed) ** 2)),
            "Features chosen": rand_samp.columns.tolist(),
        })

    # With no trials the means are reported as 0, matching the previous behaviour.
    mean_variance = float(np.mean([r["Cumulative variance"] for r in records])) if records else 0
    mean_reconstruction_loss = float(np.mean([r["Reconstruction Loss"] for r in records])) if records else 0

    # Summary row; "Features chosen" is intentionally left out and becomes NaN.
    records.append({
        "Trial no.": "Means",
        "Cumulative variance": mean_variance,
        "Reconstruction Loss": mean_reconstruction_loss,
    })

    return pd.DataFrame(records, columns = column_order)

In [None]:
# PCA trial tables: 6 random columns and 3 components per trial.
# 20 trials for each individual column group ...
results2 = display_pca2(mean1, n = 20, size = 6, components = 3)
results3 = display_pca2(mean2, n = 20, size = 6, components = 3)
results4 = display_pca2(min1, n = 20, size = 6, components = 3)
results5 = display_pca2(min2, n = 20, size = 6, components = 3)
results6 = display_pca2(max1, n = 20, size = 6, components = 3)
results7 = display_pca2(max2, n = 20, size = 6, components = 3)
# ... and 50 trials for the combined mean1/max1/min1 frame.
results8 = display_pca2(first, n = 50, size = 6, components = 3)

In [None]:
# PCA trial table for mean1 (20 trials, 6 columns each, 3 components).
results2

In [None]:
# PCA trial table for mean2 (20 trials, 6 columns each, 3 components).
results3

In [None]:
# PCA trial table for min1 (20 trials, 6 columns each, 3 components).
results4

In [None]:
# PCA trial table for min2 (20 trials, 6 columns each, 3 components).
results5

In [None]:
# PCA trial table for max1 (20 trials, 6 columns each, 3 components).
results6

In [None]:
# PCA trial table for max2 (20 trials, 6 columns each, 3 components).
results7

In [None]:
# PCA trial table for the combined mean1/max1/min1 frame (50 trials, 6 columns each, 3 components).
results8

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# The split is stratified on y so both parts keep the same class proportions,
# consistent with the stratified cross-validation used for model evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 42, stratify = y
)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Features: the whole mean1 group. The target y is the one defined in an earlier cell.
X = mean1

# Recursive feature elimination down to 5 features, followed by a random-forest
# classifier on the selected features. Both forests are seeded so that the
# selected features and the scores are reproducible across runs.
rfe = RFE(estimator = RandomForestClassifier(random_state = 1), n_features_to_select = 5)
model = RandomForestClassifier(random_state = 1)
pipeline = Pipeline(steps=[("Feature Selection", rfe), ("Model", model)])


# evaluating the model

cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 5, random_state=1)

# Run the (expensive) cross-validation once and keep the fitted estimators.
results = cross_validate(pipeline, X, y, scoring="accuracy", cv=cv, return_estimator = True)
# Same dict without the fitted estimators, derived from the single run above
# instead of repeating all 50 fits.
n_scores = {key: value for key, value in results.items() if key != "estimator"}

# Report mean and standard deviation of the per-fold test accuracy.
test_accuracy = n_scores["test_score"]
print(f"Accuracy: {np.mean(test_accuracy):.3f} ({np.std(test_accuracy):.3f})")

In [49]:
import numpy as np

# Create a 3x4x3 array of random integers between 0 and 255 (inclusive)
array = np.random.randint(low = 0, high = 256, size = (3, 4, 3))
print(array)

[[[199  69  18]
  [143  88 159]
  [201  68 186]
  [112 187 162]]

 [[181  36 173]
  [ 39 193 190]
  [238  18  12]
  [ 51 169 123]]

 [[138 106  54]
  [147 236  42]
  [178  53 202]
  [ 89 140  90]]]
