-
Notifications
You must be signed in to change notification settings - Fork 21
/
3_h2o_deeplearning_imputation.py
128 lines (114 loc) · 5.51 KB
/
3_h2o_deeplearning_imputation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from __future__ import division
import sys, time, csv, h2o
import pandas as pd
import numpy as np
# ---------------------------------------------------------------------------
# Script setup: read the command line, start H2O, and load the training frame.
#
# Expected command line:
#   <script> LOAD_DATA MEAN_IMPUTED_OUT MODEL_IMPUTED_OUT MEANS_OUT MODELS_DIR
#            [PREDICTOR ...]
# ---------------------------------------------------------------------------
# Single-argument print(...) calls are used so the text emitted is the same
# under the Python 2 print statement and the Python 3 print function.
print("%s %s" % ("Running script:", sys.argv[0]))
arg = sys.argv[1:]
print("%s %s" % ("Arguments passed to script:", arg))

# The five path arguments are mandatory; fail with a usage message instead of
# an IndexError on the first missing positional argument.
if len(arg) < 5:
    sys.exit("usage: %s LOAD_DATA MEAN_IMPUTED_OUT MODEL_IMPUTED_OUT "
             "MEANS_OUT MODELS_DIR [PREDICTOR ...]" % sys.argv[0])

load_data_fp = arg[0]
saving_meanImputed_fp = arg[1]
saving_modelImputed_fp = arg[2]
saving_means_fp = arg[3]
saving_models_fp = arg[4]
predictors = arg[5:]

# GWP_lag is treated as an int variable. It has no missings, so no need to impute it.
# But to keep this script's code simple I impute anything with 'lag' in the var name.
to_impute = [var for var in predictors if 'lag' in var]

h2o.init(min_mem_size_GB=200, max_mem_size_GB=225)
d = h2o.import_frame(path=load_data_fp)

#######################################################################
print("Making 'time_period' a factor...")
d['time_period'] = d['time_period'].asfactor()
assert d['time_period'].isfactor()
print(d.levels(col='time_period'))
d.describe()
def _impute_with_group_means(to_impute):
    """Mean-impute each column in `to_impute` within 'time_period' groups.

    Reads and mutates the module-level frame `d`. Writes the per-period means
    to `saving_means_fp` and the imputed frame to `saving_meanImputed_fp`.
    """
    print("%s %s" % ("Mean imputing missing data for predictors:", to_impute))
    # Find the mean for each time period for each predictor and collect them in
    # one table with a column of means per predictor. On holdout data this
    # table is used to fill in missing values by time period (row) and
    # variable (column).
    # If using python module h2o-3.1.0.3131:
    #   grouped = data.group_by(["time_period"])
    #   gm = [grouped.mean(predictor, na="rm").get_frame() for predictor in to_impute]
    gm = d["time_period"].unique()
    print("Finding means...")
    for predictor in to_impute:
        gm = gm.cbind(d.group_by(["time_period"],
                                 {predictor: ["mean", d.names().index(predictor), "rm"]},
                                 order_by=0))
    # NOTE(review): the original indentation was lost; show() is assumed to run
    # once after all means are bound, not once per predictor - confirm.
    gm.show()
    print("Saving the imputation means to disk...")
    h2o.download_csv(gm, filename=saving_means_fp)

    # Now that's stored for the holdout data, do this a faster way in java for
    # the training data:
    for predictor in to_impute:
        d.impute(predictor, method='mean', by=['time_period'], inplace=True)
        print("%s %s" % ("Done imputing", predictor))
    print("Saving the final mean imputed data to disk...")
    h2o.export_file(frame=d, path=saving_meanImputed_fp, force=True)


def _impute_with_models(to_impute, predictors):
    """Fill missing values of each column in `to_impute` with GBM predictions.

    For every target column a GBM is trained on the rows of the module-level
    frame `d` where the target is present, using all other `predictors` as
    inputs. The model is saved under `saving_models_fp`, and its predictions
    replace the missing entries. The resulting frame is exported to
    `saving_modelImputed_fp`.
    """
    # Sequentially impute 'newdata', not 'data', so the order of the predictor
    # variables in the loop does not matter; otherwise you would be using
    # increasingly imputed data to make predictions as the loop progresses.
    # NOTE(review): this assignment binds a second name to the same frame
    # object rather than copying it, so writes to `newdata` presumably also
    # land in `d` (which is what gets exported below) - confirm whether a real
    # copy was intended.
    newdata = d
    # With training data, build a model for each col and predict missing data,
    # save the models, use them on the holdout data to predict all missing data.
    for predictor in to_impute:
        print("Building model for imputing " + predictor)
        print("Subsetting the data into missing values for predictor and no missing values for predictor")
        na_ind = d[predictor].isna()
        not_na_ind = na_ind != 1.0
        to_train = d[not_na_ind]
        to_predict = d[na_ind]
        these_var = [var for var in predictors if var != predictor]
        trained = h2o.gbm(x=to_train[these_var],
                          y=to_train[[predictor]],
                          ntrees=300,
                          max_depth=6,
                          learn_rate=0.2)
        print("Saving the imputation tree model for " + predictor)
        h2o.save_model(trained, dir=saving_models_fp, name="dl_imputation_model_" + predictor)
        print("Imputing the missing " + predictor + " data by predicting with the model...")
        predicted = trained.predict(to_predict[these_var])
        tofillin = newdata[predictor]
        assert len(predicted) == len(tofillin[na_ind])
        tofillin[na_ind] = predicted  # mutate the column in place
        newdata[predictor] = tofillin
    print("Saving the final model-imputed data to disk...")
    h2o.export_file(frame=d, path=saving_modelImputed_fp, force=True)


def impute_data(method = "mean",
                to_impute = to_impute,
                predictors = predictors):
    """Impute missing values in the module-level frame `d` and export it.

    Args:
        method: "mean" to fill with per-'time_period' means, or "model" to
            fill with predictions from one GBM per imputed column.
        to_impute: names of the columns whose missing values are filled.
        predictors: names of all predictor columns; used as model inputs when
            `method` is "model".

    Raises:
        ValueError: if `method` is neither "mean" nor "model" (previously such
            a call silently did nothing).
    """
    if method == "mean":
        _impute_with_group_means(to_impute)
    elif method == "model":
        _impute_with_models(to_impute, predictors)
    else:
        raise ValueError("Unknown imputation method %r; expected 'mean' or 'model'" % (method,))
def _summarize_imputed_frame(path, imputed):
    """Load the frame stored at `path` and summarise its `imputed` columns."""
    frame = h2o.import_file(path=path)
    probs = [0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99]
    return {
        "quantiles": h2o.as_list(frame[imputed].quantile(prob=probs)),
        "means": [v.mean() for v in frame[imputed]],
        "medians": [v.median() for v in frame[imputed]],
        "mins": [v.min() for v in frame[imputed]],
    }


def compare_frames(d1 = saving_meanImputed_fp,
                   d2 = saving_modelImputed_fp,
                   imputed = to_impute):
    """Summarise the mean-imputed and model-imputed frames side by side.

    Args:
        d1: path of the exported mean-imputed frame.
        d2: path of the exported model-imputed frame.
        imputed: names of the columns to summarise.

    Returns:
        A dict with keys "mean" and "model"; each value is a dict holding the
        "quantiles", "means", "medians" and "mins" computed over `imputed`.
        The statistics were previously computed and then discarded.
    """
    print("Comparing the resulting two matrices...")
    # Load the saved frames back in and compute the same statistics for each.
    summaries = {
        "mean": _summarize_imputed_frame(d1, imputed),
        "model": _summarize_imputed_frame(d2, imputed),
    }
    # TODO save all this in a csv file
    return summaries
# Run both imputation passes.
impute_data("mean")
# NOTE(review): the "mean" pass imputes `d` with inplace=True, so the "model"
# pass below appears to start from the already mean-imputed frame - confirm
# whether `d` should be reloaded between the two calls.
impute_data("model")
# compare_frames()

# Send email
# Optional completion notice; disabled unless `email` is set to True.
email = False
if email:
    import smtplib
    # Placeholders: all four must be filled in before enabling the email.
    GMAIL_USERNAME = None
    GMAIL_PW = None
    RECIP = None
    SMTP_NUM = None
    session = smtplib.SMTP('smtp.gmail.com', SMTP_NUM)
    try:
        session.ehlo()
        session.starttls()
        session.login(GMAIL_USERNAME, GMAIL_PW)
        headers = "\r\n".join(["from: " + GMAIL_USERNAME,
                               "subject: " + "Finished running script: " + __file__,
                               "to: " + RECIP,
                               "mime-version: 1.0",
                               "content-type: text/html"])
        content = headers + "\r\n\r\n" + "Done running the script.\n Sent from my Python code."
        session.sendmail(GMAIL_USERNAME, RECIP, content)
    finally:
        # Always release the connection, even when login or sending fails.
        try:
            session.quit()
        except smtplib.SMTPServerDisconnected:
            # Connection is already gone; nothing left to close, and raising
            # here would hide the original error.
            pass