### This Dev notebook builds simple OLS model to predict baby weight

### Outline:
1. Ingest data using BigQuery API, clean it.
2. Build a model.
3. Save model artifacts.
4. Test model artifacts.

## 1.

In [1]:
project_name = 'ML-projects-gen3'
project_id = 'polished-vault-379315'
app_path = '/home/jupyter/project_repos/natality/natality-app'
data_path = '/home/jupyter/projects_data/natality'
model_bucket = 'gs://mpg3-model-artifacts/natality'

In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
import time
import pickle
import os
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from xgboost import XGBRegressor
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 40)

# xgb.__version__
os.chdir('/home/jupyter/project_repos/natality/natality-app')

In [3]:
time0 = time.time()

query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 20000
"""
df = bigquery.Client().query(query).to_dataframe()
display(df.shape, df.head())

(20000, 5)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,6.000983,True,45,1,39
1,3.811793,True,15,1,34
2,2.735937,True,15,1,34
3,5.291094,False,43,1,40
4,1.719606,True,42,1,26


In [4]:
# query="""
# SELECT *
# FROM
#   publicdata.samples.natality
# WHERE year > 2000
# LIMIT 10000
# """
# df = bigquery.Client().query(query).to_dataframe()
# display(df.shape, df.head())
# add month, father_age ?

In [5]:
display(df.describe())
display(df.is_male.value_counts())

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
count,19981.0,20000,20000.0,20000.0,19856.0
unique,,2,,,
top,,True,,,
freq,,10145,,,
mean,7.21305,,27.4552,1.0338,38.553384
std,1.325821,,6.186348,0.191205,2.560056
min,0.500449,,12.0,1.0,17.0
25%,6.563162,,23.0,1.0,38.0
50%,7.312733,,27.0,1.0,39.0
75%,8.035849,,32.0,1.0,40.0


True     10145
False     9855
Name: is_male, dtype: Int64

In [6]:
df['is_male'] = df['is_male'].astype(int)
df = df.dropna()
df = shuffle(df, random_state=2)

labels = df['weight_pounds']
data = df.drop(columns=['weight_pounds'])
X,y = data,labels
X_train, X_test, y_train, y_test = train_test_split(X, y)

display(X_train.shape, X_train.head(), X_test.shape, y_train.shape)

(14880, 4)

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
12957,1,30,1,36
8337,0,25,1,38
11761,0,29,1,30
17778,0,35,1,39
13775,1,31,1,38


(4961, 4)

(14880,)

## 2.

In [7]:
time1 = time.time()
lr = LinearRegression()
xgbm = XGBRegressor(n_estimators=100,
                  eta=0.1,
                  max_depth=4,
                  subsample=0.6,
                   colsample_bytree=0.5)

lr.fit(X_train, y_train)
xgbm.fit(X_train, y_train)
model_copied = lr

print(f'Modeling time: {time.time()-time1:.2f}')

Modeling time: 0.50


In [8]:
y_pred = lr.predict(X_test)

for i in range(1):
    print('Predicted weight: ', y_pred[i])
    print('Actual weight: ', y_test.iloc[i])
    print()
    
print(f'train lr rmse: {np.sqrt(mean_squared_error(y_train, lr.predict(X_train))):.3f}')
print(f'train xgb rmse: {np.sqrt(mean_squared_error(y_train, xgbm.predict(X_train))):.3f}')
print(f'test lr rmse: {np.sqrt(mean_squared_error(y_test, lr.predict(X_test))):.3f}')
print(f'test xgb rmse: {np.sqrt(mean_squared_error(y_test, xgbm.predict(X_test))):.3f}')
print(f'Total time: {time.time()-time0:.3f}')

Predicted weight:  7.856313701105398
Actual weight:  9.1271376468

train lr rmse: 1.064
train xgb rmse: 1.020
test lr rmse: 1.060
test xgb rmse: 1.029
Total time: 2.725


## 3.

In [9]:
print(os.getcwd())

artifact_filename_lm = 'lr_model.pkl'
artifact_filename_xgb = 'xgb_model.json'

# joblib.dump(lr, artifact_filename_lm)
# xgbm.save_model(artifact_filename_xgb)

# os.chdir('/home/jupyter/project_repos/pg_natality/pg_natality/natality-app')
joblib.dump(lr, artifact_filename_lm)
xgbm.save_model(artifact_filename_xgb)

# os.chdir('/home/jupyter/project_repos/pg_natality/pg_natality')

# artifact_filename_xgb = 'xgb_model.json'
# xgbm.save_model(artifact_filename_xgb)

/home/jupyter/project_repos/natality/natality-app


In [10]:
storage_path = os.path.join(model_bucket, artifact_filename_lm)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_lm)
storage_path = os.path.join(model_bucket, artifact_filename_xgb)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_xgb)

## 4.

In [11]:
file = open(artifact_filename_lm, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[0,:])])
print('lm', prediction)

lm [7.8563137]


  "X does not have valid feature names, but"
