In [141]:
import pandas as pd
import numpy as np

In [147]:
df = pd.read_csv('TD_HOSPITAL_TRAIN.csv')

In [148]:
# print(df['sex'])

def attempt_convert_to_float(column):
    try:
        return column.astype(float)
    except ValueError:  # If conversion fails, return the original column
        return column

# Apply the function to each column
# Modify sex column without direct indexing
df.loc[df['sex'].isin(["male", "Male", "M", 1]), 'sex'] = 0
df.loc[~df['sex'].isin([0]), 'sex'] = 1

# Convert to float where possible
df = df.apply(attempt_convert_to_float)

# Impute missing values for float64 columns
for col in df.columns:
    if df[col].dtype == "float64":
        df[col].fillna(df[col].mean(), inplace=True)

# Create dummies
df = pd.get_dummies(df, columns=['dnr', 'race', 'primary', 'disability', 'income', 'extraprimary', 'cancer'], drop_first=True)

# Ensure columns are actually dropped
df = df.drop(['dose', 'pdeath'], axis=1)

nan_rows = df[df.isnull().any(axis=1)]
print(nan_rows)

# Explicit NaN check after processing
assert not df.isna().any().any(), "There are still NaN values in the data."

# Splitting data and training model remains the same


from sklearn.model_selection import train_test_split

X = df.drop('death', axis=1)  # Features
y = df['death']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

feature_importances = clf.feature_importances_
features_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

from sklearn.inspection import permutation_importance

perm_importance = permutation_importance(clf, X_test, y_test)

perm_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': perm_importance.importances_mean
})

top_perm_features = perm_importance_df.sort_values(by='Importance', ascending=False).head(20)

top_10_features = top_perm_features['Feature'].head(10).tolist()

# Create a subset of the original dataframe using these top 10 features
test_df = df[top_10_features + ['death']]

# Save to csv
test_df.to_csv("test2.csv")



Empty DataFrame
Columns: [timeknown, cost, reflex, sex, blood, bloodchem1, bloodchem2, temperature, heart, psych1, glucose, psych2, psych3, bp, bloodchem3, confidence, bloodchem4, comorbidity, totalcost, breathing, age, sleep, bloodchem5, meals, pain, psych4, administratorcost, urine, diabetes, bloodchem6, education, psych5, psych6, information, death, dnr_dnr before sadm, dnr_no dnr, race_black, race_hispanic, race_other, race_white, primary_CHF, primary_COPD, primary_Cirrhosis, primary_Colon Cancer, primary_Coma, primary_Lung Cancer, primary_MOSF w/Malig, disability_Coma or Intub, disability_SIP>=30, disability_adl>=4 (>=5 if sur), disability_no(M2 and SIP pres), income_$25-$50k, income_>$50k, income_under $11k, extraprimary_COPD/CHF/Cirrhosis, extraprimary_Cancer, extraprimary_Coma, cancer_no, cancer_yes]
Index: []

[0 rows x 60 columns]


In [97]:

correlation = df.corr()['death'].sort_values(ascending=False)
# print(correlation)
# threshold = float(input("Enter the threshold value you would like to analyze:"))
# print(threshold)

threshold = 0.1

# Identify features that have a correlation magnitude above the threshold
significant_features = correlation[correlation.abs() > threshold].index.tolist()

# Exclude the target variable 'death' from the features list
significant_features.remove('death')

new_df = df[significant_features + ['death']]

new_df.to_csv('sig.csv')


In [157]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

df = pd.read_csv('test2.csv')

if 'Unnamed: 0' in df.columns:
    df.drop('Unnamed: 0', axis=1, inplace=True)


# Split data into features and target variable
X = df.drop('death', axis=1)
y = df['death']

print(X)

# Standardize numerical features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
import joblib
joblib.dump(scaler, "scaler.pkl")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(800, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.0001)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    
    tf.keras.layers.Dense(500, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.0001)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),

    tf.keras.layers.Dense(400, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.0001)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    
    tf.keras.layers.Dense(300, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),

    tf.keras.layers.Dense(200, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),

    tf.keras.layers.Dense(1, activation='sigmoid'),
])

opt = tf.keras.optimizers.Adam(learning_rate=0.001)  # Adjust learning rate

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model.compile(opt, 
              loss='binary_crossentropy', 
              metrics=['accuracy'])


history = model.fit(X_train, y_train, 
                    epochs=200,  # Set a high number since early stopping is implemented
                    batch_size=32, 
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping])

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Sample prediction
sample_data = X_test.iloc[0]  # get the first row from the test set
predicted_prob = model.predict(np.array([sample_data]))[0][0]
# print(f"The probability of death is: {predicted_prob * 100:.2f}%")
# predicted_label = (predicted_prob > 0.5).astype(int)
# print(f"Predicted Label: {predicted_label}")

model.save('Mynewmodel.h5')





      timeknown  bloodchem4        urine     glucose  administratorcost  \
0           4.0   12.000000  5360.000000  157.000000           3525.000   
1         467.0   10.000000  2570.000000  271.000000          43200.000   
2         533.0   28.000000  1690.000000  117.000000           5894.000   
3          68.0   31.790697  2197.483816  159.695613          16717.000   
4        1605.0   31.790697  2197.483816  159.695613          10151.000   
...         ...         ...          ...         ...                ...   
7053      841.0   12.000000  1830.000000  159.695613          43891.000   
7054      258.0   39.000000  1130.000000  139.000000         106109.000   
7055     1325.0   31.790697  2197.483816  159.695613          21128.000   
7056        4.0   31.790697  2197.483816  159.695613          22312.328   
7057       14.0   17.000000  1840.000000   96.000000           5874.000   

      disability_no(M2 and SIP pres)  temperature  heart  education       age  
0                  

  saving_api.save_model(


In [158]:
def preprocess_data(df):
    def attempt_convert_to_float(column):
        try:
            return column.astype(float)
        except ValueError:  # If conversion fails, return the original column
            return column

    # Apply the function to each column
    # Modify sex column without direct indexing
    df.loc[df['sex'].isin(["male", "Male", "M", 1]), 'sex'] = 0
    df.loc[~df['sex'].isin([0]), 'sex'] = 1

    # Convert to float where possible
    df = df.apply(attempt_convert_to_float)

    # Impute missing values for float64 columns
    for col in df.columns:
        if df[col].dtype == "float64":
            df[col].fillna(df[col].mean(), inplace=True)

    # Create dummies
    df = pd.get_dummies(df, columns=['dnr', 'race', 'primary', 'disability', 'income', 'extraprimary', 'cancer'], drop_first=True)

    # Ensure columns are actually dropped
    df = df.drop(['dose', 'pdeath'], axis=1)

    return df

In [162]:
class Solution:
    def __init__(self):
        self.model = tf.keras.models.load_model('Mynewmodel.h5')
        self.scaler = joblib.load("scaler.pkl") # You may want to save and load this as well if you're running the preprocessing on different datasets
        self.significant_features = ['timeknown','bloodchem4','urine','glucose','administratorcost','disability_no(M2 and SIP pres)','temperature','heart','education','age'] # Populate this with the significant features if needed

    def calculate_death_prob(self, df):
        df = preprocess_data(df)
        # Only use significant features if needed
        X_new = df[self.significant_features]
        print(X_new)
        X_new_scaled = self.scaler.transform(X_new)
        prediction = self.model.predict(X_new_scaled)
        return float(prediction[0][0])

def q1():
    # data = request.json
    # df = pd.DataFrame(data)
    df = pd.read_csv('TD_HOSPITAL_TRAIN.csv')
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)
    solution = Solution()
    prob = solution.calculate_death_prob(df)
    return 


print(q1())
        
        




      timeknown  bloodchem4        urine     glucose  administratorcost  \
0           4.0   12.000000  5360.000000  157.000000           3525.000   
1         467.0   10.000000  2570.000000  271.000000          43200.000   
2         533.0   28.000000  1690.000000  117.000000           5894.000   
3          68.0   31.790697  2197.483816  159.695613          16717.000   
4        1605.0   31.790697  2197.483816  159.695613          10151.000   
...         ...         ...          ...         ...                ...   
7053      841.0   12.000000  1830.000000  159.695613          43891.000   
7054      258.0   39.000000  1130.000000  139.000000         106109.000   
7055     1325.0   31.790697  2197.483816  159.695613          21128.000   
7056        4.0   31.790697  2197.483816  159.695613          22312.328   
7057       14.0   17.000000  1840.000000   96.000000           5874.000   

      disability_no(M2 and SIP pres)  temperature  heart  education       age  
0                  

TypeError: 'float' object is not subscriptable

In [86]:
# app = Flask(__name__)

# @app.route("/death_probability", methods=["POST"])


# if __name__ == "__main__":
#     app.run(host="0.0.0.0", port=5555)



ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- administratorcost
- age
- blood
- bloodchem1
- bloodchem2
- ...
Feature names seen at fit time, yet now missing:
- Unnamed: 0
