In [1]:
using CSV, DataFrames

In [26]:
# Load the training and testing tables and keep columns 2 through 74 of each,
# which discards the first column of both CSV files.
train_grouped, test_grouped = map(
    ("Training_Set_grouped_binary.csv", "Testing_Set_grouped_binary.csv")
) do csv_path
    return CSV.read(csv_path, DataFrame)[:, 2:74]
end;


In [27]:
# Positional split of the training table: columns 1-72 become the predictor
# table and column 73 becomes the response vector.
train_X, train_y = train_grouped[:, 1:72], train_grouped[:, 73];


In [28]:
# Positional split of the testing table, mirroring the training split:
# columns 1-72 are the predictors and column 73 is the response.
test_X, test_y = test_grouped[:, 1:72], test_grouped[:, 73];


## Sparse Logistic Regression

In [12]:
# Base learner: Optimal Feature Selection classifier with a fixed random seed.
lnr = IAI.OptimalFeatureSelectionClassifier(; random_seed=42)

# Grid over sparsity values 1 through 10.
# NOTE(review): in the recorded run the best sparsity is 10, the upper edge of
# this range, so a wider range may be worth trying.
grid_sparse = IAI.GridSearch(lnr; sparsity=1:10)

# Fit the grid on the training data, validating on AUC. Kept as the final
# expression without a semicolon so the notebook displays the grid results.
IAI.fit!(grid_sparse, train_X, train_y; validation_criterion=:auc)


All Grid Results:

[1m Row │[1m sparsity [1m train_score [1m valid_score [1m rank_valid_score
     │[90m Int64    [90m Float64     [90m Float64     [90m Int64
─────┼──────────────────────────────────────────────────────
   1 │        1    0.0431737     0.611594                10
   2 │        2    0.0505216     0.636861                 9
   3 │        3    0.0535855     0.642004                 8
   4 │        4    0.0570545     0.645507                 7
   5 │        5    0.0586357     0.649566                 6
   6 │        6    0.0602274     0.65159                  5
   7 │        7    0.0613839     0.655012                 4
   8 │        8    0.0620524     0.655939                 3
   9 │        9    0.062906      0.65686                  2
  10 │       10    0.0631958     0.65704                  1

Best Params:
  sparsity => 10

Best Model - Fitted OptimalFeatureSelectionClassifier:
  Constant: -1.56573
  Weights:
    admission_source_cleaned_2: -0.519927
    admiss

In [13]:
# Pull the selected hyperparameters and the corresponding fitted learner out of
# the grid search in one step.
best_params, best_model = IAI.get_best_params(grid_sparse), IAI.get_learner(grid_sparse)
# Evaluate the learner last so the notebook renders it as the cell output.
best_model

Fitted OptimalFeatureSelectionClassifier:
  Constant: -1.56573
  Weights:
    admission_source_cleaned_2: -0.519927
    admission_source_cleaned_3:  0.150432
    age:                         0.0049562
    diabetesMed_Yes:             0.23917
    number_diagnoses:            0.0704285
    number_emergency:            0.209445
    number_inpatient:            0.396986
    number_outpatient:           0.0838596
    race_Unknown:               -0.385395
    time_in_hospital:            0.0183156
  (Higher score indicates stronger prediction for class `1`)

In [14]:
# AUC of the fitted grid search on the training and testing sets.
train_auc = IAI.score(grid_sparse, train_X, train_y; criterion=:auc)
test_auc = IAI.score(grid_sparse, test_X, test_y; criterion=:auc)

# Print both scores, one per line, each prefixed with its label.
for (label, auc_value) in (("train_auc: ", train_auc), ("test_auc: ", test_auc))
    println(label, auc_value)
end

train_auc: 0.6650944392086889
test_auc: 0.6665585746897991


In [15]:
# Predict on the test set using the fitted grid search. The trailing semicolon
# suppresses the notebook's display of the returned predictions.
predictions = IAI.predict(grid_sparse, test_X);

In [16]:
# Put the test responses and the predictions side by side in a two-column table.
results_df = DataFrame(:Actual => test_y, :Predicted => predictions)
# Write the table to disk; CSV.write returns the output path, which the
# notebook shows as the cell result.
results_df |> CSV.write("predictions_3.csv")

"predictions_3.csv"

## Sparse Logistic Regression (with Key Features)

In [17]:
# Column subset used for the reduced model, assembled group by group.
# The resulting order is significant and is kept exactly; the final entry,
# "readmitted", is the response column rather than a predictor.
sub_features = vcat(
    # Plain count / duration columns
    [
        "age", "time_in_hospital", "num_lab_procedures", "num_procedures",
        "num_medications", "number_outpatient", "number_emergency",
        "number_inpatient", "number_diagnoses",
    ],
    # med_count_* columns (there is no med_count_5 in this subset)
    ["med_count_$k" for k in (1, 2, 3, 4, 6)],
    # race_* and gender_* indicator columns
    ["race_Caucasian", "race_Hispanic", "race_Other", "race_Unknown", "gender_Male"],
    # A1Cresult_* indicator columns and the diabetesMed flag
    ["A1Cresult_$lvl" for lvl in (">8", "Norm", "Not Taken")],
    ["diabetesMed_Yes"],
    # med_change_* columns; the available levels differ per group, so they are
    # spelled out explicitly
    [
        "med_change_1_NoMed", "med_change_1_Steady", "med_change_1_Up",
        "med_change_2_NoMed", "med_change_2_Steady", "med_change_2_Up",
        "med_change_3_NoMed", "med_change_3_Steady",
        "med_change_4_NoMed", "med_change_4_Steady",
        "med_change_6_NoMed", "med_change_6_Steady", "med_change_6_Up",
        "med_change_other_Yes",
    ],
    # discharge / admission / primary diagnosis indicator columns
    ["discharge_group_$k" for k in (2, 3, 4, 7)],
    ["admission_type_cleaned_$k" for k in 1:3],
    ["admission_source_cleaned_$k" for k in 1:3],
    ["diag1_category_$k" for k in 1:9],
    # max_glu_serum_cleaned_* indicator columns
    ["max_glu_serum_cleaned_$lvl" for lvl in (">300", "Norm", "Not Taken")],
    # Response column, kept last
    ["readmitted"],
);


In [18]:
# Restrict both tables to the columns named in `sub_features`, in that order.
train_grouped_filtered = select(train_grouped, sub_features)
test_grouped_filtered = select(test_grouped, sub_features);


In [19]:
# Split the filtered training table into predictors and response by column
# name rather than by position, so the split stays correct if `sub_features`
# is reordered or changes length. With the current list this selects the same
# 59 predictor columns and the same response column as before.
# Note: this rebinds `train_X` / `train_y` from the full-feature section.
train_X = train_grouped_filtered[:, Not("readmitted")]
train_y = train_grouped_filtered[:, "readmitted"];


In [20]:
# Split the filtered testing table into predictors and response by column
# name rather than by position, so the split stays correct if `sub_features`
# is reordered or changes length. With the current list this selects the same
# 59 predictor columns and the same response column as before.
# Note: this rebinds `test_X` / `test_y` from the full-feature section.
test_X = test_grouped_filtered[:, Not("readmitted")]
test_y = test_grouped_filtered[:, "readmitted"];


In [21]:
# Same learner and grid as the full-feature section, now fitted on the reduced
# column subset. This rebinds `lnr` and `grid_sparse`, so the earlier fitted
# grid is no longer reachable through these names.
lnr = IAI.OptimalFeatureSelectionClassifier(; random_seed=42)

# Grid over sparsity values 1 through 10.
# NOTE(review): in the recorded run the best sparsity is 10, the upper edge of
# this range, so a wider range may be worth trying.
grid_sparse = IAI.GridSearch(lnr; sparsity=1:10)

# Fit the grid on the training data, validating on AUC. Kept as the final
# expression without a semicolon so the notebook displays the grid results.
IAI.fit!(grid_sparse, train_X, train_y; validation_criterion=:auc)


All Grid Results:

[1m Row │[1m sparsity [1m train_score [1m valid_score [1m rank_valid_score
     │[90m Int64    [90m Float64     [90m Float64     [90m Int64
─────┼──────────────────────────────────────────────────────
   1 │        1    0.0431755     0.611594                10
   2 │        2    0.0505216     0.636861                 9
   3 │        3    0.0542588     0.640628                 8
   4 │        4    0.0570582     0.645749                 7
   5 │        5    0.0586787     0.648144                 6
   6 │        6    0.0602275     0.65159                  5
   7 │        7    0.0613839     0.655018                 4
   8 │        8    0.0621899     0.656952                 2
   9 │        9    0.062906      0.65686                  3
  10 │       10    0.0633406     0.6577                   1

Best Params:
  sparsity => 10

Best Model - Fitted OptimalFeatureSelectionClassifier:
  Constant: -1.52222
  Weights:
    admission_source_cleaned_2: -0.522728
    admiss

In [22]:
# AUC of the fitted grid search on the training and testing sets.
train_auc = IAI.score(grid_sparse, train_X, train_y; criterion=:auc)
test_auc = IAI.score(grid_sparse, test_X, test_y; criterion=:auc)

# Print both scores, one per line, each prefixed with its label.
for (label, auc_value) in (("train_auc: ", train_auc), ("test_auc: ", test_auc))
    println(label, auc_value)
end

train_auc: 0.6645398870457798
test_auc: 0.6648339899283103


In [23]:
# Predict on the test set using the fitted grid search. The trailing semicolon
# suppresses the notebook's display of the returned predictions.
predictions = IAI.predict(grid_sparse, test_X);

In [24]:
# Put the test responses and the predictions side by side in a two-column table.
results_df = DataFrame(:Actual => test_y, :Predicted => predictions)
# Write the table to disk; CSV.write returns the output path, which the
# notebook shows as the cell result.
results_df |> CSV.write("predictions_4.csv")

"predictions_4.csv"

## Random Forest

In [30]:
# lnr = IAI.RandomForestClassifier(random_seed=42)
# grid_rf = IAI.GridSearch(lnr,
#     max_depth=[5, 10, 20, 30],
#     minbucket=[1],
#     num_trees=[50,100,200,300])
    
# IAI.fit!(grid_rf, train_X, train_y, validation_criterion = :auc)
