### This Dev notebook builds simple OLS model to predict baby weight

### Outline:
1. Ingest data using BigQuery API, clean it.
2. Build a model.
3. Save model artifacts.
4. Test model artifacts.

## 1.

In [1]:
project_name = 'ML-projects-gen3'
project_id = 'polished-vault-379315'
app_path = '/home/jupyter/project_repos/natality/natality-app'
data_path = '/home/jupyter/projects_data/natality'
model_bucket = 'gs://mpg3-model-artifacts/natality'

In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
import time
import pickle
import os
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from xgboost import XGBRegressor
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 40)

# xgb.__version__
os.chdir('/home/jupyter/project_repos/natality/natality-app')

In [3]:
time0 = time.time()

query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 20000
"""
df = bigquery.Client().query(query).to_dataframe()
display(df.shape, df.head())

(20000, 5)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,5.749656,True,42,1,38.0
1,8.000575,True,15,1,39.0
2,4.387199,True,15,1,33.0
3,4.407041,False,15,2,29.0
4,6.437498,True,45,1,37.0


In [4]:
# query="""
# SELECT *
# FROM
#   publicdata.samples.natality
# WHERE year > 2000
# LIMIT 10000
# """
# df = bigquery.Client().query(query).to_dataframe()
# display(df.shape, df.head())
# add month, father_age ?

In [5]:
display(df.describe())
display(df.is_male.value_counts())

Unnamed: 0,weight_pounds,mother_age,plurality,gestation_weeks
count,19981.0,20000.0,20000.0,19880.0
mean,7.237626,27.379,1.0355,38.603622
std,1.321781,6.153158,0.197083,2.571074
min,0.500449,13.0,1.0,17.0
25%,6.591822,22.0,1.0,38.0
50%,7.319347,27.0,1.0,39.0
75%,8.062305,32.0,1.0,40.0
max,12.412025,49.0,4.0,47.0


True     10304
False     9696
Name: is_male, dtype: int64

In [6]:
df['is_male'] = df['is_male'].astype(int)
df = df.dropna()
df = shuffle(df, random_state=2)

labels = df['weight_pounds']
data = df.drop(columns=['weight_pounds'])
X,y = data,labels
X_train, X_test, y_train, y_test = train_test_split(X, y)

display(X_train.shape, X_train.head(), X_test.shape, y_train.shape)

(14897, 4)

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
18307,1,36,1,38.0
12729,0,29,1,34.0
19114,0,38,2,41.0
991,1,18,1,41.0
15796,0,33,1,40.0


(4966, 4)

(14897,)

## 2.

In [7]:
time1 = time.time()
lr = LinearRegression()
xgbm = XGBRegressor(n_estimators=100,
                  eta=0.1,
                  max_depth=4,
                  subsample=0.6,
                   colsample_bytree=0.5)

lr.fit(X_train, y_train)
xgbm.fit(X_train, y_train)
model_copied = lr

print(f'Modeling time: {time.time()-time1:.2f}')

Modeling time: 3.12


In [8]:
y_pred = lr.predict(X_test)

for i in range(1):
    print('Predicted weight: ', y_pred[i])
    print('Actual weight: ', y_test.iloc[i])
    print()
    
print(f'train lr rmse: {np.sqrt(mean_squared_error(y_train, lr.predict(X_train))):.3f}')
print(f'train xgb rmse: {np.sqrt(mean_squared_error(y_train, xgbm.predict(X_train))):.3f}')
print(f'test lr rmse: {np.sqrt(mean_squared_error(y_test, lr.predict(X_test))):.3f}')
print(f'test xgb rmse: {np.sqrt(mean_squared_error(y_test, xgbm.predict(X_test))):.3f}')
print(f'Total time: {time.time()-time0:.3f}')

Predicted weight:  7.719671079905832
Actual weight:  7.25100379718

train lr rmse: 1.053
train xgb rmse: 1.005
test lr rmse: 1.061
test xgb rmse: 1.022
Total time: 6.358


## 3.

In [9]:
print(os.getcwd())

artifact_filename_lm = 'lr_model.pkl'
artifact_filename_xgb = 'xgb_model.json'

# joblib.dump(lr, artifact_filename_lm)
# xgbm.save_model(artifact_filename_xgb)

# os.chdir('/home/jupyter/project_repos/pg_natality/pg_natality/natality-app')
joblib.dump(lr, artifact_filename_lm)
xgbm.save_model(artifact_filename_xgb)

# os.chdir('/home/jupyter/project_repos/pg_natality/pg_natality')

# artifact_filename_xgb = 'xgb_model.json'
# xgbm.save_model(artifact_filename_xgb)

/home/jupyter/project_repos/natality/natality-app


In [10]:
storage_path = os.path.join(model_bucket, artifact_filename_lm)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_lm)
storage_path = os.path.join(model_bucket, artifact_filename_xgb)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_xgb)

## 4.

In [28]:
file = open(artifact_filename_lm, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[1,:])])
print('lm', prediction)

lm [7.11445051]


  "X does not have valid feature names, but"


In [40]:
# test predictions on feature dictionary:
feature_dictionary = dict(X_test.iloc[0,:])
feature_series = pd.Series(feature_dictionary)
print(type(feature_dictionary), feature_dictionary)

<class 'dict'> {'is_male': 1.0, 'mother_age': 25.0, 'plurality': 1.0, 'gestation_weeks': 40.0}


In [42]:
trained_model.predict([feature_series])[0]
# this code works for sklearn models, but not for xgb...

  "X does not have valid feature names, but"


7.71967107990583

In [44]:
X_train

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
18307,1,36,1,38.0
12729,0,29,1,34.0
19114,0,38,2,41.0
991,1,18,1,41.0
15796,0,33,1,40.0
...,...,...,...,...
10155,1,27,2,40.0
10223,1,27,1,37.0
17846,1,35,1,41.0
9545,1,26,1,36.0


In [45]:
!ls -a

.	       .ipynb_checkpoints  main.py	     templates
..	       app.yaml		   main_old.py	     xgb_model.json
.gcloudignore  lr_model.pkl	   requirements.txt


In [46]:
trained_model = XGBRegressor()
trained_model.load_model("xgb_model.json")
# features must be pd.Series of one observation
prediction = trained_model.predict([pd.Series(feature_dictionary)])
prediction[0]

ValueError: training data did not have the following fields: is_male, mother_age, plurality, gestation_weeks

In [71]:
feature_df = pd.DataFrame.from_dict(feature_dictionary,orient='index').T
trained_model.predict(feature_df)[0]

7.7680244