In [None]:
# Import packages
## Warnings
import warnings
## Prepare and explore data
import pandas as pd # Data manipulation
import numpy as np # Numeric computations
import matplotlib.pyplot as plt # Data visualisation
import seaborn as sns # Data visualisation

## Machine learning packages
from sklearn.model_selection import train_test_split # Split training and test data
from sklearn.metrics import mean_absolute_error, mean_squared_error # Model evaluation
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet #Linear models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor # Ensemble models
from sklearn.svm import SVC # Support vector machine
from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.tree import DecisionTreeRegressor # Decision Tree
from xgboost import XGBRegressor # Extreme gradient boosting
from category_encoders import OneHotEncoder # One hot encoder
from sklearn.pipeline import Pipeline, make_pipeline # Pipeline

warnings.simplefilter(action="ignore", category=FutureWarning)

# Prepare Data
## Import 

In [None]:
# Read the height-prediction CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="data/height_prediction.csv")

## Explore

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Dataset structure: print the DataFrame summary (columns, non-null counts, dtypes)
df.info()

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

## Split

In [None]:
# Column names: predictor columns and the response column
features = ["age", "gender", "mean_ulna"]
target = "mean_height"

# Feature matrix and target vector
X, y = df[features], df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Inspect split: one shape per line, in the order X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

In [None]:
# Preview the first five training targets
y_train.head(n=5)

# Build model
## Baseline

In [None]:
# Baseline model: always predict the mean of the training target
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]

# Score the baseline against the training targets
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
mse_baseline = mean_squared_error(y_train, y_pred_baseline)
rmse_baseline = np.sqrt(mse_baseline)  # RMSE is the square root of the MSE above

# Report the baseline metrics
print("Baseline mean absolute error", mae_baseline)
print("Baseline mean squared error", mse_baseline)
print("Baseline root mean squared error", rmse_baseline)

## Iterate

### Linear regression

In [None]:
# Pipeline: one-hot encoding followed by ordinary least-squares regression
lr_model = make_pipeline(OneHotEncoder(), LinearRegression())

# Fit on the training data, then predict on that same training data
y_pred_linear = lr_model.fit(X_train, y_train).predict(X_train)

# Training-set error metrics for the fitted model
mae_linear = mean_absolute_error(y_train, y_pred_linear)
mse_linear = mean_squared_error(y_train, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)  # RMSE is the square root of the MSE above

# Report the training metrics
print("Training mean absolute error (Linear)", mae_linear)
print("Training mean squared error (Linear)", mse_linear)
print("Training root mean squared error (Linear)", rmse_linear)

### Ridge regression


In [None]:
# Pipeline: one-hot encoding followed by Ridge regression (default hyperparameters)
ridge = make_pipeline(OneHotEncoder(), Ridge())

# Fit on the training data, then predict on that same training data
y_pred_ridge = ridge.fit(X_train, y_train).predict(X_train)

# Training-set error metrics for the fitted model
mae_ridge = mean_absolute_error(y_train, y_pred_ridge)
mse_ridge = mean_squared_error(y_train, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)  # RMSE is the square root of the MSE above

# Report the training metrics
print("Training mean absolute error (Ridge)", mae_ridge)
print("Training mean squared error (Ridge)", mse_ridge)
print("Training root mean squared error (Ridge)", rmse_ridge)

### Lasso

In [None]:
# Pipeline: one-hot encoding followed by Lasso regression (default hyperparameters)
lasso = make_pipeline(OneHotEncoder(), Lasso())

# Fit on the training data, then predict on that same training data
y_pred_lasso = lasso.fit(X_train, y_train).predict(X_train)

# Training-set error metrics for the fitted model
mae_lasso = mean_absolute_error(y_train, y_pred_lasso)
mse_lasso = mean_squared_error(y_train, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)  # RMSE is the square root of the MSE above

# Report the training metrics
print("Training mean absolute error (Lasso)", mae_lasso)
print("Training mean squared error (Lasso)", mse_lasso)
print("Training root mean squared error (Lasso)", rmse_lasso)

In [None]:
# Pipeline: one-hot encoding followed by ElasticNet regression (default hyperparameters)
elastic = make_pipeline(OneHotEncoder(), ElasticNet())

# Fit on the training data, then predict on that same training data
y_pred_elastic = elastic.fit(X_train, y_train).predict(X_train)

# Training-set error metrics for the fitted model
mae_elastic = mean_absolute_error(y_train, y_pred_elastic)
mse_elastic = mean_squared_error(y_train, y_pred_elastic)
rmse_elastic = np.sqrt(mse_elastic)  # RMSE is the square root of the MSE above

# Report the training metrics
print("Training mean absolute error (ElasticNet)", mae_elastic)
print("Training mean squared error (ElasticNet)", mse_elastic)
print("Training root mean squared error (ElasticNet)", rmse_elastic)

### Random Forest


In [None]:
# Pipeline: one-hot encoding followed by a random forest regressor (seeded)
rf_model = make_pipeline(OneHotEncoder(), RandomForestRegressor(random_state=42))

# Fit on the training data, then predict on that same training data
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_train)

# Training-set error metrics for the fitted model
mae_rf = mean_absolute_error(y_train, y_pred_rf)
mse_rf = mean_squared_error(y_train, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # RMSE is the square root of the MSE above

# Report the training metrics
print("Training mean absolute error (Random Forest)", mae_rf)
print("Training mean squared error (Random Forest)", mse_rf)
print("Training root mean squared error (Random Forest)", rmse_rf)

### Gradient Boosting

In [None]:
# Pipeline: one-hot encoding followed by a gradient boosting regressor (seeded)
gb_model = make_pipeline(OneHotEncoder(), GradientBoostingRegressor(random_state=42))

# Fit on the training data, then predict on that same training data
y_pred_gb = gb_model.fit(X_train, y_train).predict(X_train)

# Training-set error metrics for the fitted model
mae_gb = mean_absolute_error(y_train, y_pred_gb)
mse_gb = mean_squared_error(y_train, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)  # RMSE is the square root of the MSE above

# Report the training metrics
print("Training mean absolute error (Gradient Boosting)", mae_gb)
print("Training mean squared error (Gradient Boosting)", mse_gb)
print("Training root mean squared error (Gradient Boosting)", rmse_gb)

### Extreme Gradient Boosting

In [None]:
# Build model: one-hot encoding followed by an XGBoost regressor.
# The seed is pinned to 42, matching the other seeded estimators in this
# notebook, so results stay reproducible if sampling parameters are tuned later.
xgb_model = make_pipeline(
    OneHotEncoder(),
    XGBRegressor(random_state=42)
)

# Fit model
xgb_model.fit(X_train, y_train)

# Predict for training data
y_pred_xgb = xgb_model.predict(X_train)

# Evaluate training predictions (not the baseline)
# Mean absolute error
mae_xgb = mean_absolute_error(y_train, y_pred_xgb)
print("Training mean absolute error (Extreme Gradient)", mae_xgb)

# Mean squared error
mse_xgb = mean_squared_error(y_train, y_pred_xgb)
print("Training mean squared error (Extreme Gradient)", mse_xgb)

# Root mean squared error, derived from the MSE computed above
rmse_xgb = np.sqrt(mse_xgb)
print("Training root mean squared error (Extreme Gradient)", rmse_xgb)

### Decision tree

In [None]:
# Pipeline: one-hot encoding followed by a decision tree regressor (seeded)
tree_model = make_pipeline(OneHotEncoder(), DecisionTreeRegressor(random_state=42))

# Fit on the training data, then predict on that same training data
y_pred_tree = tree_model.fit(X_train, y_train).predict(X_train)

# Training-set error metrics for the fitted model
mae_tree = mean_absolute_error(y_train, y_pred_tree)
mse_tree = mean_squared_error(y_train, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)  # RMSE is the square root of the MSE above

# Report the training metrics
print("Training mean absolute error (Decision Tree)", mae_tree)
print("Training mean squared error (Decision Tree)", mse_tree)
print("Training root mean squared error (Decision Tree)", rmse_tree)

# Evaluate

# Communicate