In [2]:
#Import Libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import scipy
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
def clean_data(data):

    print("Original Dataset Size:", data.shape)
    
    #Remove Duplicates
    #data.drop_duplicates(inplace=True)

    #Create new Columns
    data['DEATH'] = (data['DATE_DIED'] != "9999-99-99").astype(int) + 1
    data['COVID'] = (data['CLASIFFICATION_FINAL'] <= 3).astype(int)
    data['HOSPITALIZED'] = (data['PATIENT_TYPE'] == 2).astype(int)

    #Remove all missing values from columns other than the ones that have the MOST missing values (ICU, INTUBED, PREGNANCY)
    cols_to_filter = ['PNEUMONIA', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION',
                  'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO','ICU','INTUBED']

    for col in cols_to_filter:
        data = data[data[col].isin([1, 2, 97])]

    #Change all values to 1 and 0 for predictor columns
        #Define the mapping for all columns
    mapping = {1: 1, 2: 0, 97:0}  # Define your mapping here dropped 99 as its real missing value, 97 is N/A so turned that to 0

        # Apply the mapping to all columns
    data = data.replace(mapping)


    #Create Prediction Column
    data['AT_RISK'] = data[['HOSPITALIZED','DEATH', 'INTUBED', 'ICU']].sum(axis=1)

    # Create Prediction Column
    data['AT_RISK'] = data[['HOSPITALIZED', 'DEATH', 'INTUBED', 'ICU']].sum(axis=1)

    # If COVID flag is not present, set 'AT_RISK_SCORE' to 0
    data.loc[data['COVID'] == 0, 'AT_RISK'] = 0

    # If COVID flag is present and the sum of the rest is 0, set 'AT_RISK_SCORE' to 1
    data.loc[(data['COVID'] == 1) & (data[['HOSPITALIZED', 'DEATH', 'INTUBED', 'ICU']].sum(axis=1) == 0), 'AT_RISK'] = 0

    # If COVID flag is present and the sum of the rest is >= 1, set 'AT_RISK_SCORE' to 2
    data.loc[(data['COVID'] == 1) & (data[['HOSPITALIZED', 'DEATH', 'INTUBED', 'ICU']].sum(axis=1) >= 1), 'AT_RISK'] = 1


    #data['AT_RISK_SCORE'] = np.where(data['AT_RISK_SCORE'] == 0, 0, data['AT_RISK_SCORE'] / max_at_risk_value)

    data['NUM_COMORBIDITIES'] = data[['DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION',
                                      'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO']].sum(axis=1)


    #Drop columns from the dataset that are redundant or not relavent to the model
    data = data.drop(columns=['CLASIFFICATION_FINAL','DATE_DIED','PATIENT_TYPE','MEDICAL_UNIT','USMER','PREGNANT'])

    # Define the columns to be converted to integer type
    cols_to_convert = [col for col in data.columns]

    # Convert the selected columns to integer type
    data[cols_to_convert] = data[cols_to_convert].astype(int)

    print("Final Dataset Size:", data.shape)

    return data

In [4]:
#Prepare data for modeling
def modeling_df_improved(df): 
    data = df

    #Did not drop these columns earlier because I need them for the viz
    data = data.drop(columns=['COVID','HOSPITALIZED','DEATH','INTUBED','ICU','INMSUPR', 'ASTHMA', 'TOBACCO','COPD','OTHER_DISEASE'])
    
    data['AGE_group'] = (data['AGE'] // 10) * 10
    data = data.drop(['AGE'], axis = 1)

    age_labels = [str(i) + '0s' for i in range(1, 11)]  # Create age labels such as '10s', '20s', ..., '100s'

    for label in age_labels:
        data[label] = (data['AGE_group'] == int(label[:2])).astype(int)

    data = data.drop(['AGE_group'], axis =1)

    return data

def random_forest(df):
    data = df

    X = data.drop('AT_RISK', axis=1)
    y = data['AT_RISK']

    # Split data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_clf.fit(X_train, y_train)
    y_pred = rf_clf.predict(X_test)
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    return rf_clf

In [5]:
#Import Dataset
df_original = pd.read_csv("Covid Data.csv")

#Apply Clean Data function to the the df
clean_df = clean_data(df_original)
#clean_df.info()

Original Dataset Size: (1068023, 21)
Final Dataset Size: (1041625, 20)


Write to CSV to use in the Tableau Viz \
Uncomment the code below when you want to re create the visualization dataset

In [6]:
#Write df to csv to be used for viz
#file_path = 'Covid_Clean.csv'
#clean_df.to_csv(file_path, index=False)

# Define the list of column names to group by
#groupby_cols = clean_df.columns.tolist()

# Select columns specified by the list of column names and perform groupby operation
#grouped_df = clean_df.groupby(groupby_cols).size().reset_index(name='counts')

#write df to csv to be used for Sankey viz
#file_path = 'Covid_Clean_Sankey.csv'
#grouped_df.to_csv(file_path, index=False)


In [7]:
#Improved model 

#Prepare dataset for modeling
modeling_data_final = modeling_df_improved(clean_df)

#Create Random Forest Model 
rf_model_final = random_forest(modeling_data_final)

Accuracy: 0.6550971557307801


In [28]:
import tabpy_client
from tabpy.tabpy_tools.client import Client
client = tabpy_client.Client('http://localhost:9004/')

In [34]:
def prediction_model(_arg1, _arg2, _arg3, _arg4, _arg5, _arg6, _arg7, _arg8, _arg9): 
    import pandas as pd

    _arg9 = _arg9[0]
    _arg9 = _arg9 // 10

    row = {
        'SEX': _arg1,
        'PNEUMONIA': _arg2,
        'DIABETES': _arg3,
        'HIPERTENSION': _arg4,
        'CARDIOVASCULAR': _arg5,
        'OBESITY': _arg6,
        'RENAL_CHRONIC': _arg7,
        'NUM_COMORBIDITIES': _arg8,
        '10s': 1 if _arg9 == 1 else 0,
        '20s': 1 if _arg9 == 2 else 0,
        '30s': 1 if _arg9 == 3 else 0,
        '40s': 1 if _arg9 == 4 else 0,
        '50s': 1 if _arg9 == 5 else 0,
        '60s': 1 if _arg9 == 6 else 0,
        '70s': 1 if _arg9 == 7 else 0,
        '80s': 1 if _arg9 == 8 else 0,
        '90s': 1 if _arg9 == 9 else 0,
        '100s': 1 if _arg9 == 10 else 0
    }

    test_data = pd.DataFrame(data = row,index=[0])
    #Predict the Fraud
    output = rf_model_final.predict(test_data)

    return output.tolist()

In [35]:
client.deploy('prediction_model', prediction_model,'prediction_model score',override = True)