In [1]:
# Uploading the necessary CSV
from google.colab import files
uploaded = files.upload()

Saving nhanes_all.csv to nhanes_all (1).csv


### Hyperopt / Bayesian Optimization don't come standard in Colab notebooks, just do a quick pip install of them.

In [3]:
!pip install hyperopt
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp36-none-any.whl size=11685 sha256=2728d2b948c8273d2e01ca6c1b4f5a4122cf982c8b5da01ddd8b4d06a838a675
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [4]:
# Joblib is for saving our models
import joblib
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ML Packages
import hyperopt
from bayes_opt import BayesianOptimization
from sklearn.metrics import r2_score
import warnings
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
warnings.filterwarnings('ignore')

In [5]:
# Mounting our drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### The complex surveying method the CDC uses in it's acquisition of data means that each individual has a specific weight tied to them, which indicates the number of people that they represent in the total non-institutionalized population of the US
### We can use that weight, and pandas built in sampling function to upsample our data from 16,000, to 330,000, which should help our model (it's kind of like assigning weights to the data even before we start modeling)

In [6]:
# Upsampling using "Proper Weight" column
df = pd.read_csv('nhanes_all.csv')
df = df.sample(frac=21, replace=True, weights='Proper Weight', random_state=42)
# Drop 0 rows
df = df[(df!=0).all(1)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 301993 entries, 5911 to 714
Data columns (total 17 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   SEQN           301993 non-null  int64  
 1   RIAGENDR       301993 non-null  int64  
 2   RIDAGEYR       301993 non-null  int64  
 3   BMXWT          301993 non-null  float64
 4   BMXHT          301993 non-null  float64
 5   BMXBMI         301993 non-null  float64
 6   BMXWAIST       301993 non-null  float64
 7   BPSYAVG        301993 non-null  int64  
 8   BPDIAVG        301993 non-null  int64  
 9   LBDHDD         301993 non-null  int64  
 10  LBXSGL         301993 non-null  int64  
 11  LBXGH          301993 non-null  float64
 12  LBXGLU         301993 non-null  int64  
 13  LBXTR          301993 non-null  int64  
 14  LBDLDL         301993 non-null  int64  
 15  LBXAPB         301993 non-null  int64  
 16  Proper Weight  301993 non-null  float64
dtypes: float64(6), int64(11)
memo

In [7]:
df = df.set_index('SEQN')

In [8]:
df = df.drop(columns='Proper Weight')

In [10]:
df.head(10)

Unnamed: 0_level_0,RIAGENDR,RIDAGEYR,BMXWT,BMXHT,BMXBMI,BMXWAIST,BPSYAVG,BPDIAVG,LBDHDD,LBXSGL,LBXGH,LBXGLU,LBXTR,LBDLDL,LBXAPB
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
62363,2,44,64.2,157.6,25.8,89.7,109,61,45,79,4.9,89,192,136,110
86236,1,34,131.1,177.3,41.7,123.8,145,58,53,97,5.2,108,104,166,127
89103,1,64,92.6,170.6,31.8,108.2,133,66,42,179,9.6,191,93,114,93
49299,1,51,93.6,179.1,29.18,105.5,113,68,43,88,6.1,102,96,121,96
71751,2,42,62.0,175.6,20.1,74.5,131,68,62,76,4.9,85,198,60,65
47711,2,29,93.5,161.2,35.98,105.8,110,62,55,77,4.8,85,68,117,72
85338,1,43,92.2,177.6,29.2,102.7,117,80,38,96,5.3,100,40,75,63
81975,2,48,77.8,157.6,31.3,102.0,111,78,63,91,5.2,94,150,136,108
93093,2,45,107.6,155.3,44.6,127.7,136,83,54,147,5.1,101,80,131,96
33020,1,39,104.6,173.9,34.59,117.4,135,80,62,127,5.2,134,252,108,97


In [11]:
# Labels are the values we want to predict
labels = df[['BMXWAIST','BPSYAVG','BPDIAVG','LBDHDD','LBXSGL','LBXGH','LBXGLU','LBXTR','LBDLDL','LBXAPB']]
labels_list = list(labels.columns)
# Features are used to predict labels
features= df[['RIAGENDR','RIDAGEYR','BMXWT','BMXHT','BMXBMI']]
feature_list = list(features.columns)


### We're going to use both the bayesian optimization, and hypopt packages
#### Since Light GBM regressor models can only predict one outcome variable per iteration, we need to create some looping so that it goes through each of our ten labels

In [None]:
def bayesion_opt_lgbm(X, y, init_iter=3, n_iters=7, random_state=11, seed = 101, num_iterations = 100):
  dtrain = lgb.Dataset(data=X, label=y)
  def lgb_r2_score(preds, dtrain):
      labels = dtrain.get_label()
      return 'r2', r2_score(labels, preds), True
  # Objective Function
  def hyp_lgbm(num_leaves, feature_fraction, bagging_fraction, max_depth, min_split_gain, min_child_weight):
        
          params = {'application':'regression','num_iterations': num_iterations,
                    'learning_rate':0.05, 'early_stopping_round':50,
                    'metric':'lgb_r2_score'} # Default parameters
          params["num_leaves"] = int(round(num_leaves))
          params['feature_fraction'] = max(min(feature_fraction, 1), 0)
          params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
          params['max_depth'] = int(round(max_depth))
          params['min_split_gain'] = min_split_gain
          params['min_child_weight'] = min_child_weight
          cv_results = lgb.cv(params, dtrain, nfold=5, seed=seed,categorical_feature=[], stratified=False,
                              verbose_eval =None, feval=lgb_r2_score)
          # print(cv_results)
          return np.max(cv_results['r2-mean'])
  # Domain space-- Range of hyperparameters 
  pds = {'num_leaves': (80, 100),
            'feature_fraction': (0.1, 0.9),
            'bagging_fraction': (0.8, 1),
            'max_depth': (17, 25),
            'min_split_gain': (0.001, 0.1),
            'min_child_weight': (10, 25)
            }

  # Surrogate model
  optimizer = BayesianOptimization(hyp_lgbm, pds, random_state=random_state)
                                    
  # Optimize
  optimizer.maximize(init_points=init_iter, n_iter=n_iters)



### With our functoins completed, we can iterate over the labels we want to predict with a simple loop

In [None]:
for i in labels_list:
  bayesion_opt_lgbm(features, labels[i], init_iter=5, n_iters=10, random_state=77, seed = 101, num_iterations = 200)

|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9524  [0m | [0m 0.9838  [0m | [0m 0.6138  [0m | [0m 23.03   [0m | [0m 12.09   [0m | [0m 0.009645[0m | [0m 95.76   [0m |
| [0m 2       [0m | [0m 0.9439  [0m | [0m 0.8652  [0m | [0m 0.5329  [0m | [0m 18.92   [0m | [0m 18.18   [0m | [0m 0.04065 [0m | [0m 94.3    [0m |
| [0m 3       [0m | [0m 0.9427  [0m | [0m 0.9673  [0m | [0m 0.5708  [0m | [0m 19.37   [0m | [0m 14.22   [0m | [0m 0.07085 [0m | [0m 88.45   [0m |
| [0m 4       [0m | [0m 0.9499  [0m | [0m 0.8115  [0m | [0m 0.6976  [0m | [0m 20.62   [0m | [0m 12.64   [0m | [0m 0.005888[0m | [0m 85.85   [0m |
| [0m 5       [0m | [0m 0.9491  [0m | [0m 0.8134  [0m | [0m 0.7009  [0m | [0m 17.51   [0m | [0m 16.48   [0m | [0m 0.03705 [0m | [0m 83.04   

### Here's the optimized hyperparameters for each of the models we're going to construct

In [12]:
hyper_params = {'BMXWAIST':{
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.07412,
    'learning_rate': 0.005,
    'feature_fraction': 0.881,
    'bagging_fraction': 0.8013,
    "max_depth": 25,
    "num_leaves": 98,
    'min_child_weight':16.89,
    "num_iterations": 50000,
    "n_estimators": 10000}, 
    'BPSYAVG':{'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.001,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.80,
    "max_depth": 25,
    "num_leaves": 100,
    'min_child_weight':16.91,
    "num_iterations": 50000,
    "n_estimators": 10000},
    'BPDIAVG':{'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.001,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.80,
    "max_depth": 25,
    "num_leaves": 100,
    'min_child_weight':16.89,
    "num_iterations": 50000,
    "n_estimators": 10000},
    'LBDHDD':{'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.001,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.80,
    "max_depth": 25,
    "num_leaves": 100,
    'min_child_weight':25,
    "num_iterations": 50000,
    "n_estimators": 10000},
    'LBXSGL':{'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.001,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.80,
    "max_depth": 25,
    "num_leaves": 100,
    'min_child_weight':16.85,
    "num_iterations": 50000,
    "n_estimators": 10000},
    'LBXGH':{'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.001,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.80,
    "max_depth": 25,
    "num_leaves": 100,
    'min_child_weight':16.96,
    "num_iterations": 50000,
    "n_estimators": 10000},    
    'LBXGLU':{'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.001,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.80,
    "max_depth": 25,
    "num_leaves": 100,
    'min_child_weight':16.97,
    "num_iterations": 50000,
    "n_estimators": 10000},
    'LBXTR':{'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.001,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.80,
    "max_depth": 25,
    "num_leaves": 100,
    'min_child_weight':25,
    "num_iterations": 50000,
    "n_estimators": 10000},    
    'LBDLDL':{'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.001,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.80,
    "max_depth": 25,
    "num_leaves": 100,
    'min_child_weight':16.97,
    "num_iterations": 50000,
    "n_estimators": 10000},
    'LBXAPB':{'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'min_split_gain':0.001,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.80,
    "max_depth": 25,
    "num_leaves": 100,
    'min_child_weight':16.87,
    "num_iterations": 50000,
    "n_estimators": 10000}}


#### - Using the hyper parameters we got through Bayesian Optimization, we loop through the labels we want to predict, using Light GBM as our model.
#### - After each loop, we save the model so we can use it later with the file name lgb_reg_(label)
#### - Gradient boosting models use decision trees, just leaf wise rather than tree level wise, but as a rule of thumb, decision trees do not require scaling prior to fitting, 
#### - The differences between the scales of data help with fitting the trees, and deciding on weighting for features.
### Light GBM is a super efficient gradient boosting model, so this code only took a couple hours to process in colab's environment

In [None]:
for i in labels_list:
  gbm = lgb.LGBMRegressor(**hyper_params[i])
  X = df[['RIAGENDR','RIDAGEYR','BMXWT','BMXHT','BMXBMI']]
  y = df[i]
  X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
  gbm.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='mae',
        early_stopping_rounds=1000)
  import joblib
  path = "/content/gdrive/My Drive/lgb_reg_{0}.pickle".format(i)
  # save model
  joblib.dump(gbm, path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[45003]	valid_0's l2: 6.64745	valid_0's l1: 0.903996	valid_0's l1: 0.903996
[45004]	valid_0's l2: 6.64713	valid_0's l1: 0.903968	valid_0's l1: 0.903968
[45005]	valid_0's l2: 6.64696	valid_0's l1: 0.903934	valid_0's l1: 0.903934
[45006]	valid_0's l2: 6.6468	valid_0's l1: 0.903901	valid_0's l1: 0.903901
[45007]	valid_0's l2: 6.64657	valid_0's l1: 0.903864	valid_0's l1: 0.903864
[45008]	valid_0's l2: 6.64624	valid_0's l1: 0.903837	valid_0's l1: 0.903837
[45009]	valid_0's l2: 6.64601	valid_0's l1: 0.903808	valid_0's l1: 0.903808
[45010]	valid_0's l2: 6.64576	valid_0's l1: 0.903772	valid_0's l1: 0.903772
[45011]	valid_0's l2: 6.64549	valid_0's l1: 0.903744	valid_0's l1: 0.903744
[45012]	valid_0's l2: 6.64513	valid_0's l1: 0.903691	valid_0's l1: 0.903691
[45013]	valid_0's l2: 6.64491	valid_0's l1: 0.903658	valid_0's l1: 0.903658
[45014]	valid_0's l2: 6.64456	valid_0's l1: 0.903622	valid_0's l1: 0.903622
[45015]	valid_0's l2: 6.

In [13]:
# Creating a list of the models saved to Drive so we can predict on test data later
model_name_list = []
for i in labels_list:
  model_name_list.append('lgb_reg_{0}.pickle'.format(i))

# BMXWAIST = 'lgb_reg_BMXWAIST.pt'
# path = F"/content/gdrive/My Drive/{model_save_name}"
# model.load_state_dict(torch.load(path))

In [17]:
# Upload whatever data you want to predict on
from google.colab import files
uploaded = files.upload()

Saving final_test.csv to final_test.csv


In [18]:
# Save to a pandas dataframe
test_data = pd.read_csv('final_test.csv')

In [19]:
test_data.head(5)

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,BMXWT,BMXHT,BMXBMI,BMXWAIST,BPSYAVG,BPDIAVG,LBDHDD,LBXSGL,LBXGH,LBXGLU
0,95733,1,22,53.7,185.1,15.7,0.0,0,0,0,0,0.0,0
1,94895,2,23,48.5,173.5,16.1,0.0,101,72,0,0,0.0,0
2,97324,2,19,42.3,159.7,16.6,0.0,101,64,0,0,0.0,0
3,95378,2,80,44.6,163.2,16.7,0.0,164,59,0,0,0.0,0
4,100556,2,65,46.5,162.9,17.5,0.0,119,69,61,229,12.0,241


In [20]:
test_data = test_data.set_index('SEQN')

In [21]:
test_data.head(5)

Unnamed: 0_level_0,RIAGENDR,RIDAGEYR,BMXWT,BMXHT,BMXBMI,BMXWAIST,BPSYAVG,BPDIAVG,LBDHDD,LBXSGL,LBXGH,LBXGLU
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
95733,1,22,53.7,185.1,15.7,0.0,0,0,0,0,0.0,0
94895,2,23,48.5,173.5,16.1,0.0,101,72,0,0,0.0,0
97324,2,19,42.3,159.7,16.6,0.0,101,64,0,0,0.0,0
95378,2,80,44.6,163.2,16.7,0.0,164,59,0,0,0.0,0
100556,2,65,46.5,162.9,17.5,0.0,119,69,61,229,12.0,241


In [23]:
test_data = test_data[(test_data!=0).all(1)]
test_data.head(5)

Unnamed: 0_level_0,RIAGENDR,RIDAGEYR,BMXWT,BMXHT,BMXBMI,BMXWAIST,BPSYAVG,BPDIAVG,LBDHDD,LBXSGL,LBXGH,LBXGLU
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
98661,2,31,44.6,168.5,15.7,63.2,96,59,57,88,4.9,93
102689,1,24,48.4,172.5,16.3,64.0,110,73,48,100,5.4,108
102112,2,33,42.3,161.5,16.2,64.5,107,78,59,82,5.2,93
95741,2,19,64.3,163.2,24.1,64.5,96,64,69,86,5.0,89
101799,2,28,47.0,168.7,16.5,64.6,103,70,73,85,5.1,94


In [32]:
test = test_data.iloc[:, :5]

In [34]:
test.head()

Unnamed: 0_level_0,RIAGENDR,RIDAGEYR,BMXWT,BMXHT,BMXBMI
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
98661,2,31,44.6,168.5,15.7
102689,1,24,48.4,172.5,16.3
102112,2,33,42.3,161.5,16.2
95741,2,19,64.3,163.2,24.1
101799,2,28,47.0,168.7,16.5


In [33]:
predictions = pd.DataFrame(index=test_data.index)

In [28]:
test_actual = test_data[test_data.columns[-7:]]

In [29]:
test_actual.head()

Unnamed: 0_level_0,BMXWAIST,BPSYAVG,BPDIAVG,LBDHDD,LBXSGL,LBXGH,LBXGLU
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
98661,63.2,96,59,57,88,4.9,93
102689,64.0,110,73,48,100,5.4,108
102112,64.5,107,78,59,82,5.2,93
95741,64.5,96,64,69,86,5.0,89
101799,64.6,103,70,73,85,5.1,94


In [30]:
test_actual_labels = list(test_actual.columns)

In [31]:
test_actual_labels

['BMXWAIST', 'BPSYAVG', 'BPDIAVG', 'LBDHDD', 'LBXSGL', 'LBXGH', 'LBXGLU']

### Here's some test data from NHANES 2017-2018. Unfortunately, they don't have all of the necessary columns, missing the last three columns:
- LBXTR	
- LBDLDL	
- LBXAPB

So we just create a new list of label names from the imported data, and run predictions on the 7 labels that we do have, so we can compare actual and predicted.

If you want to run it on data where you aren't comparing, you'd just use something like this:

    for i in labels_list:
        model = joblib.load("/content/gdrive/My Drive/lgb_reg_{0}.pickle".format(i))
        test[i] = model.predict(test.iloc[:, :5])

Where labels_list is the full list of 10 labels you want to predict.

Also, just as a side note, I didn't think it was necessary to Flask this and set it up as an API you could call, as you wouldn't need to call this constantly. 

If you did want to do that, you could write a fairly simple script to Flask it, then set it up to run on an Amazon AWS server for example, so you could call it whenver you needed it.
But as it's something you'd more likely run a couple times a year maybe, when new batches of individuals first come onto the program, I didn't include that here.

In [35]:
for i in test_actual_labels:
  model = joblib.load("/content/gdrive/My Drive/lgb_reg_{0}.pickle".format(i))
  test[i] = model.predict(test.iloc[:, :5])

In [36]:
test

Unnamed: 0_level_0,RIAGENDR,RIDAGEYR,BMXWT,BMXHT,BMXBMI,BMXWAIST,BPSYAVG,BPDIAVG,LBDHDD,LBXSGL,LBXGH,LBXGLU
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
98661,2,31,44.6,168.5,15.7,67.043692,103.334585,69.554171,71.266787,81.270587,5.188691,86.817338
102689,1,24,48.4,172.5,16.3,68.173066,108.504179,64.521785,41.601159,92.504308,5.062580,99.097287
102112,2,33,42.3,161.5,16.2,61.824537,97.410864,67.400558,74.984547,80.755633,5.191240,92.804786
95741,2,19,64.3,163.2,24.1,80.725581,112.817642,61.857137,51.562405,91.145990,5.281050,88.089256
101799,2,28,47.0,168.7,16.5,67.723143,93.408719,61.907345,64.631452,81.874273,5.119572,85.293489
...,...,...,...,...,...,...,...,...,...,...,...,...
96587,1,30,191.4,175.8,61.9,151.048138,142.586554,89.345503,34.404655,113.660924,5.984552,118.814349
99616,2,39,179.2,168.0,63.5,162.443780,130.318989,78.100557,52.254562,111.223387,5.830912,111.119679
95431,2,66,160.1,161.2,61.6,141.173146,141.436519,64.277838,55.669942,146.900081,6.504632,143.739163
102082,1,58,174.0,190.2,48.1,147.620145,117.929060,59.595494,49.891119,148.455930,7.281432,154.856863


In [None]:
test_data[test_data.columns[-7:]]

In [40]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(test_actual['BMXWAIST'], test['BMXWAIST'])

4.47679232181534

In [41]:
mean_absolute_error(test_actual['BPSYAVG'], test['BPSYAVG'])



14.014879909595933

In [42]:
mean_absolute_error(test_actual['BPDIAVG'], test['BPDIAVG'])

9.799294845662258

In [None]:
mean_absolute_error(test_actual['BPDIAVG'], test['BPDIAVG'])

In [43]:
mean_absolute_error(test_actual['LBDHDD'], test['LBDHDD'])


11.484834951268782

In [44]:
mean_absolute_error(test_actual['LBXSGL'], test['LBXSGL'])

20.20388295973544

In [45]:
mean_absolute_error(test_actual['LBXGH'], test['LBXGH'])

0.6709534554123807

In [46]:
mean_absolute_error(test_actual['LBXGLU'], test['LBXGLU'])

21.167345129814304

Unnamed: 0,BMXBMI,BMXWAIST,BPSYAVG,BPDIAVG,LBDHDD,LBXSGL,LBXGH,LBXGLU,LBXTR,LBDLDL,LBXAPB
0,18.0,71.0,128.0,74.0,63.0,109.0,6.0,107.0,85.0,91.0,81.0
1,20.0,72.0,99.0,70.0,60.0,87.0,5.0,95.0,138.0,137.0,116.0
2,17.0,66.0,110.0,74.0,71.0,88.0,5.0,96.0,52.0,100.0,72.0
3,20.0,73.0,101.0,57.0,75.0,80.0,5.0,86.0,65.0,91.0,75.0
4,18.0,71.0,91.0,63.0,64.0,91.0,5.0,91.0,116.0,95.0,79.0
...,...,...,...,...,...,...,...,...,...,...,...
995,28.0,98.0,119.0,73.0,58.0,105.0,6.0,109.0,127.0,142.0,111.0
996,29.0,105.0,119.0,65.0,55.0,144.0,8.0,159.0,171.0,200.0,143.0
997,30.0,103.0,137.0,63.0,65.0,104.0,6.0,109.0,154.0,111.0,87.0
998,21.0,85.0,115.0,68.0,52.0,112.0,5.0,135.0,101.0,110.0,88.0


In [None]:
test_data.to_csv('/content/gdrive/My Drive/test_data_with_predictions.csv')