### This Dev notebook builds simple OLS model to predict baby weight

### Outline:
1. Ingest data using BigQuery API, clean it.
2. Build a model.
3. Save model artifacts.
4. Test model artifacts.

## 1.

In [42]:
project_name = 'My First Project'
project_id = 'valid-heuristic-369117'
regionn = 'us-west1'
app_folder = '/home/jupyter/test_projects/test_natality/natality-app'

In [43]:
import pandas as pd
import xgboost as xgb
import numpy as np
import time
import pickle
import os
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from xgboost import XGBRegressor
from google.cloud import bigquery, storage

# xgb.__version__
os.chdir(app_folder)

In [44]:
time0 = time.time()

query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""
df = bigquery.Client().query(query).to_dataframe()
display(df.shape, df.head())

(10000, 5)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,3.875727,True,34,1,35
1,7.063611,False,30,1,37
2,7.749249,True,31,1,41
3,6.999677,False,40,1,39
4,7.936641,True,24,1,38


In [45]:
display(df.describe())
display(df.is_male.value_counts())

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
count,9991.0,10000,10000.0,10000.0,9939.0
unique,,2,,,
top,,True,,,
freq,,5098,,,
mean,7.195418,,27.2973,1.0352,38.52792
std,1.339473,,6.132917,0.193299,2.60939
min,0.500449,,13.0,1.0,17.0
25%,6.560957,,22.0,1.0,38.0
50%,7.312733,,27.0,1.0,39.0
75%,8.000575,,32.0,1.0,40.0


True     5098
False    4902
Name: is_male, dtype: Int64

In [46]:
df['is_male'] = df['is_male'].astype(int)
df = df.dropna()
df = shuffle(df, random_state=2)

labels = df['weight_pounds']
data = df.drop(columns=['weight_pounds'])
X,y = data,labels
X_train, X_test, y_train, y_test = train_test_split(X, y)

display(X_train.shape, X_train.head(), X_test.shape, y_train.shape)

(7450, 4)

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
608,1,27,1,40
803,1,24,2,36
7607,1,28,1,40
4132,0,28,1,41
579,1,25,1,38


(2484, 4)

(7450,)

## 2.

In [47]:
time1 = time.time()
lr = LinearRegression()
xgbm = XGBRegressor(n_estimators=100,
                  eta=0.1,
                  max_depth=4,
                  subsample=0.6)

lr.fit(X_train, y_train)
xgbm.fit(X_train, y_train)
model_copied = lr

print(time.time()-time1)

0.3740370273590088


In [48]:
y_pred = lr.predict(X_test)

for i in range(1):
    print('Predicted weight: ', y_pred[i])
    print('Actual weight: ', y_test.iloc[i])
    print()
    
print(f'train lr rmse: {np.sqrt(mean_squared_error(y_train, lr.predict(X_train)))}')
print(f'train xgb rmse: {np.sqrt(mean_squared_error(y_train, xgbm.predict(X_train)))}')
print(f'test lr rmse: {np.sqrt(mean_squared_error(y_test, lr.predict(X_test)))}')
print(f'test xgb rmse: {np.sqrt(mean_squared_error(y_test, xgbm.predict(X_test)))}')

Predicted weight:  6.507847814934973
Actual weight:  6.8122838958

train lr rmse: 1.0484199700629748
train xgb rmse: 0.9822201376204437
test lr rmse: 1.0566489303565303
test xgb rmse: 1.0268381173376158


## 3.

In [49]:
print(os.getcwd())

artifact_filename_lm = 'lr_model.pkl'
artifact_filename_xgb = 'xgb_model.json'

# joblib.dump(lr, artifact_filename_lm)
# xgbm.save_model(artifact_filename_xgb)

# os.chdir('/home/jupyter/project_repos/pg_natality/pg_natality/natality-app')
joblib.dump(lr, artifact_filename_lm)
xgbm.save_model(artifact_filename_xgb)

# os.chdir('/home/jupyter/project_repos/pg_natality/pg_natality')

# artifact_filename_xgb = 'xgb_model.json'
# xgbm.save_model(artifact_filename_xgb)

/home/jupyter/test_projects/test_natality/natality-app


In [50]:
model_bucket = 'gs://pmykola-testprojects/natality'
storage_path = os.path.join(model_bucket, artifact_filename_lm)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_lm)
storage_path = os.path.join(model_bucket, artifact_filename_xgb)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_xgb)

## 4.

In [51]:
file = open(artifact_filename_lm, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[0,:])])
print(f'''lm prediction: {prediction}. 
Total time is {time.time()-time0} sec''')

lm prediction: [6.50784781]. 
Total time is 3.472719192504883 sec


  "X does not have valid feature names, but"
