### This dev notebook builds a simple XGBoost model to predict baby weight

### Outline:
1. Ingest data using BigQuery API.
2. Clean the data.
3. Build the model.

In [1]:
# Project-level settings for the Google Cloud environment this notebook runs in.
project_name = 'My First Project'
project_id = 'quantum-keep-360100'
region = 'us-central1'
# Backward-compatible alias: the setting was originally spelled `regionn`.
# Keep it so any cell that still references the old name continues to work.
regionn = region

In [2]:
import copy
import os
import pickle
import time

import numpy as np
import pandas as pd
from google.cloud import bigquery, storage
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from xgboost import XGBRegressor

# xgb.__version__   # NOTE(review): `xgb` is not imported in this cell, so this would fail if uncommented here
# Make the project folder the working directory.
# NOTE(review): hard-coded absolute path; it must exist on the machine running the notebook.
os.chdir('/home/jupyter/projects_gcp')

In [3]:
# NOTE(review): this cell repeats the previous cell's imports and chdir call.
# The only difference is that xgboost is imported here as the module alias
# `xgb`, whereas the previous cell imports `XGBRegressor` directly.
# Consider merging the two cells into a single import cell.
import pandas as pd
import xgboost as xgb
import numpy as np
import time
import pickle
import os

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from google.cloud import bigquery, storage

# xgb.__version__
# Make the project folder the working directory (hard-coded absolute path).
os.chdir('/home/jupyter/projects_gcp')

In [4]:
# Wall-clock mark taken just before the BigQuery request is issued.
time0 = time.time()

# Pull the label (weight_pounds) and four candidate features from the public
# natality sample, restricted to births after 2000 and capped at 10000 rows.
# NOTE(review): there is no ORDER BY, so which 10000 rows come back is
# presumably not guaranteed to be stable between runs -- confirm.
query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""

# Run the query with a default-configured client and load the result into pandas.
bq_client = bigquery.Client()
query_job = bq_client.query(query)
df = query_job.to_dataframe()

# Show the frame's dimensions followed by its first rows.
display(df.shape)
display(df.head())

(10000, 5)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,9.312326,False,28,1,40.0
1,7.749249,True,30,1,40.0
2,7.394304,True,27,1,39.0
3,6.750554,False,40,1,41.0
4,8.377566,True,24,1,38.0


In [5]:
# Summary statistics of the frame, then the True/False balance of `is_male`.
summary_stats = df.describe()
is_male_counts = df['is_male'].value_counts()

display(summary_stats)
display(is_male_counts)

Unnamed: 0,weight_pounds,mother_age,plurality,gestation_weeks
count,9984.0,10000.0,10000.0,9935.0
mean,7.207753,27.3949,1.0329,38.545848
std,1.325683,6.142551,0.187673,2.561521
min,0.639341,13.0,1.0,17.0
25%,6.563162,23.0,1.0,38.0
50%,7.312733,27.0,1.0,39.0
75%,8.035849,32.0,1.0,40.0
max,12.50021,47.0,4.0,47.0


True     5129
False    4871
Name: is_male, dtype: int64

In [6]:
# Clean the raw frame.
# Rows with missing values are dropped BEFORE the boolean flag is cast to 0/1:
# casting first would raise if `is_male` ever contained a null. `.assign`
# builds a new frame instead of writing into the result of `dropna()`.
df = (
    df.dropna()
      .assign(is_male=lambda frame: frame['is_male'].astype(int))
)
# Seeded shuffle of the rows.
# NOTE: `df` is overwritten here, so re-running this cell shuffles the already
# shuffled frame again; re-run from the query cell to get the same row order.
df = shuffle(df, random_state=2)

# Target column vs. feature columns.
labels = df['weight_pounds']
data = df.drop(columns=['weight_pounds'])
x,y = data,labels

# train_test_split shuffles again internally, so it needs its own seed;
# without `random_state` every run produced a different train/test partition.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=2)

display(X_train.shape, X_train.head(), X_test.shape, y_train.shape)

(7442, 4)

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
3037,1,34,1,34.0
3383,0,29,1,37.0
8685,1,30,1,36.0
3650,0,26,1,40.0
294,1,31,1,40.0


(2481, 4)

(7442,)

In [7]:
# Train an XGBoost regressor on the training split and report how long it took.
time1 = time.time()
# 'gpu_hist' selects the GPU histogram tree builder, so this cell needs a GPU runtime.
model = XGBRegressor(tree_method = 'gpu_hist')

model.fit(X_train, y_train)
# Capture the elapsed time right after fitting so the snapshot below is not timed.
fit_seconds = time.time() - time1

# Keep an independent snapshot of the fitted estimator. A plain
# `model_copied = model` only binds a second name to the same object, so any
# later refit or mutation of `model` would silently change `model_copied` too.
model_copied = copy.deepcopy(model)

print(fit_seconds)

0.8024663925170898


In [10]:
# Score the fitted model: predict once per split and reuse the results below,
# instead of calling model.predict(X_test) a second time for the RMSE.
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Print up to five predicted/actual pairs; the bound guards against a test
# split with fewer than five rows, which would otherwise raise an IndexError.
for i in range(min(5, len(y_pred))):
    print('Predicted weight: ', y_pred[i])
    print('Actual weight: ', y_test.iloc[i])
    print()

# RMSE is the square root of the mean squared error, in the label's own units.
print('train rmse: ', np.sqrt(mean_squared_error(y_train, y_train_pred)))
print('test rmse: ', np.sqrt(mean_squared_error(y_test, y_pred)))

Predicted weight:  7.53581
Actual weight:  5.06181353552

Predicted weight:  7.613289
Actual weight:  6.4992274837599995

Predicted weight:  7.918761
Actual weight:  7.936641432

Predicted weight:  4.895279
Actual weight:  4.87442061282

Predicted weight:  7.4103384
Actual weight:  8.437090766739999

train rmse:  0.9519739646864473
test rmse:  1.0780612860897323
