In [4]:
# Author: Benge Johnathan C. 

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("prepData.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045547 entries, 0 to 1045546
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype
---  ------                --------------    -----
 0   AGE                   1045547 non-null  int64
 1   is_male               1045547 non-null  int64
 2   is_female             1045547 non-null  int64
 3   is_female_pregnant    1045547 non-null  int64
 4   is_hospitalized       1045547 non-null  int64
 5   is_diabetic           1045547 non-null  int64
 6   is_hypertensive       1045547 non-null  int64
 7   is_obese              1045547 non-null  int64
 8   is_copd               1045547 non-null  int64
 9   is_tobacco_user       1045547 non-null  int64
 10  is_heart_diseased     1045547 non-null  int64
 11  is_immune_surpressed  1045547 non-null  int64
 12  is_other_diseased     1045547 non-null  int64
 13  is_infant             1045547 non-null  int64
 14  is_toddler            1045547 non-null  int64
 15  is_child       

# Using LightGBM for Gradient Boosting
- Gradient Boosted Decision Trees
- XGBoost trees add more growth to 'depth' (no longer using XGBoost - added litte value)
    - XGBoost expands all nodes depth-wise and first split all nodes at a given depth before adding more levels.
- LightGBM applies more 'leaf growth'
    - LighGBM splits leaf nodes that maximize information gain

In [5]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

# Split the data into training and testing sets
X = df.drop('is_hospitalized', axis=1)
y = df['is_hospitalized']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Creating a RandomUnderSampler object to handle the undersampling
sampler = RandomUnderSampler(random_state=42)

# Using the sampler to fit and transform the training data
X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

# Build and train the LightGBM model using the resampled training data
lgb_model = lgb.LGBMClassifier(num_leaves=55, max_bin=60, learning_rate=0.065, 
                                n_estimators=1000, max_depth=8, boosting_type="dart", 
                                objective='binary',class_weight='balanced', 
                                subsample=0.8, colsample_bytree=0.8)

lgb_model.fit(X_train_resampled, y_train_resampled)

# Use the LightGBM model to make predictions on the test set
lgb_predictions = lgb_model.predict(X_test)

In [6]:
# Evaluate the LightGBM model
accuracy = lgb_model.score(X_test, y_test)
print(f'Accuracy: {accuracy:.3f}')

# Initialize a counter for the number of incorrect predictions
incorrect_predictions = 0

# Iterate over the predicted and actual values
for y_hat, y in zip(lgb_predictions, y_test):
  # If the predicted value is not equal to the actual value, increment the counter
  if y_hat != y:
    incorrect_predictions += 1


Accuracy: 0.743


# Saving Trained Model with Pickle

In [7]:
import pickle

# Save the model to a file
with open('lgb_model.pkl', 'wb') as file:
    pickle.dump(lgb_model, file)

In [8]:
# Testing Predictions 

tester = X_test.iloc[5,:].values.reshape(1,-1)
testpredict = lgb_model.predict(tester)
print(testpredict)

print(tester)

[0]
[[16  0  1  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0]]
