### This Dev notebook builds simple OLS model to predict baby weight

### Outline:
1. Ingest data using BigQuery API.
2. Clean the data.
3. Build model.

In [1]:
project_name = 'My First Project'
project_id = 'valid-heuristic-369117'
regionn = 'us-west1'

In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
import time
import pickle
import os
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from google.cloud import bigquery, storage

# xgb.__version__
os.chdir('/home/jupyter/project_repos/pg_natality/pg_natality')

In [3]:
time0 = time.time()

query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""
df = bigquery.Client().query(query).to_dataframe()
display(df.shape, df.head())

(10000, 5)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,8.348906,True,22,1,42.0
1,6.481591,False,20,1,41.0
2,9.792934,True,34,1,42.0
3,7.687519,True,28,1,39.0
4,7.687519,False,33,1,36.0


In [4]:
display(df.describe())
display(df.is_male.value_counts())

Unnamed: 0,weight_pounds,mother_age,plurality,gestation_weeks
count,9992.0,10000.0,10000.0,9916.0
mean,7.225465,27.4245,1.0321,38.581989
std,1.325989,6.238541,0.192024,2.518561
min,0.500449,12.0,1.0,17.0
25%,6.554343,22.0,1.0,38.0
50%,7.312733,27.0,1.0,39.0
75%,8.062305,32.0,1.0,40.0
max,13.944238,50.0,5.0,47.0


True     5100
False    4900
Name: is_male, dtype: int64

In [5]:
df['is_male'] = df['is_male'].astype(int)
df = df.dropna()
df = shuffle(df, random_state=2)

labels = df['weight_pounds']
data = df.drop(columns=['weight_pounds'])
X,y = data,labels
X_train, X_test, y_train, y_test = train_test_split(X, y)

display(X_train.shape, X_train.head(), X_test.shape, y_train.shape)

(7431, 4)

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
4504,1,21,1,38.0
2821,1,34,1,41.0
7196,0,32,1,40.0
7926,0,29,1,39.0
1046,0,18,1,41.0


(2477, 4)

(7431,)

In [6]:
time1 = time.time()
lr = LinearRegression()

lr.fit(X_train, y_train)
model_copied = lr

print(time.time()-time1)

0.02408909797668457


In [7]:
y_pred = lr.predict(X_test)

for i in range(5):
    print('Predicted weight: ', y_pred[i])
    print('Actual weight: ', y_test.iloc[i])
    print()
    
print('train rmse: ', np.sqrt(mean_squared_error(y_train, lr.predict(X_train))))
print('test rmse: ', np.sqrt(mean_squared_error(y_test, lr.predict(X_test))))

Predicted weight:  6.450807835011462
Actual weight:  6.0009827716399995

Predicted weight:  7.486949770282759
Actual weight:  8.1681268071

Predicted weight:  5.722665230881027
Actual weight:  5.3131405142

Predicted weight:  7.63261928202005
Actual weight:  7.698542189039999

Predicted weight:  6.361568083658476
Actual weight:  6.5807985207

train rmse:  1.0667677306568524
test rmse:  1.065659582005254


In [8]:
os.getcwd()

'/home/jupyter/project_repos/pg_natality/pg_natality'

In [9]:
artifact_filename_lm = 'lr_model.pkl'
joblib.dump(lr, artifact_filename_lm)

os.chdir('/home/jupyter/project_repos/pg_natality/pg_natality/natality-app')
joblib.dump(lr, artifact_filename_lm)

os.chdir('/home/jupyter/project_repos/pg_natality/pg_natality')

# artifact_filename_xgb = 'xgb_model.json'
# xgbm.save_model(artifact_filename_xgb)

In [10]:
model_bucket = 'gs://pmykola-projectsgcp-artifacts/natality'
storage_path = os.path.join(model_bucket, artifact_filename_lm)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_lm)

In [11]:
file = open(artifact_filename_lm, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[0,:])])
print('lm', prediction)

lm [6.45080784]


  "X does not have valid feature names, but"
