In [1]:
# Import packages
## Warnings
import warnings
## Prepare and explore data
import pandas as pd # Data manipulation
import numpy as np # Numeric computations
import matplotlib.pyplot as plt # Data visualisation
import seaborn as sns # Data visualisation

## Machine learning packages
from sklearn.model_selection import train_test_split, GridSearchCV # Train/test split, cross-validated grid search
from sklearn.metrics import mean_absolute_error, mean_squared_error # Model evaluation
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet # Linear models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor # Ensemble models
from sklearn.svm import SVC # Support vector classifier -- NOTE(review): a classifier, not used in the regression cells visible here
from sklearn.naive_bayes import GaussianNB # Gaussian naive Bayes -- NOTE(review): a classifier, not used in the regression cells visible here
from sklearn.tree import DecisionTreeRegressor # Decision tree
from xgboost import XGBRegressor # Extreme gradient boosting
from category_encoders import OneHotEncoder, OrdinalEncoder # Categorical encoders
from sklearn.pipeline import Pipeline, make_pipeline # Pipeline

# Silence FutureWarning messages so they do not clutter the cell outputs
warnings.simplefilter(action="ignore", category=FutureWarning)

# Prepare Data
## Import 

In [2]:
# Load the dataset from a CSV file (path is relative to the notebook's
# working directory) into the DataFrame `df` used by every later cell
df = pd.read_csv("data/height_prediction.csv")

## Explore

In [3]:
# Dimensions of the dataset as (number of rows, number of columns)
df.shape

(371, 4)

In [4]:
# Dataset structure: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          371 non-null    int64  
 1   gender       371 non-null    object 
 2   mean_ulna    371 non-null    float64
 3   mean_height  371 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 11.7+ KB


In [5]:
# View first five rows (DataFrame.head() defaults to n=5)
df.head()

Unnamed: 0,age,gender,mean_ulna,mean_height
0,51,male,29.0,164.3
1,53,male,29.0,168.1
2,69,female,24.0,161.0
3,57,male,29.0,164.1
4,66,male,31.0,171.0


## Split

In [6]:
# Column roles: one response variable and three predictors
target = "mean_height"
features = ["age", "gender", "mean_ulna"]

# Feature matrix (DataFrame) and target vector (Series)
X = df[features]
y = df[target]

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Sanity-check the shapes of the four pieces, in the order they were returned
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(296, 3)
(75, 3)
(296,)
(75,)


In [7]:
# Preview the first five training targets; the index labels are the original
# row positions, which are no longer in order after the shuffled split
y_train.head()

192    163.0
75     157.8
84     152.1
359    156.0
16     164.0
Name: mean_height, dtype: float64

# Build model
## Baseline

In [8]:
# Naive benchmark: always predict the mean height of the training set
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)

# Error metrics of the benchmark on the training data -- the numbers any
# fitted model below has to beat
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
mse_baseline = mean_squared_error(y_train, y_pred_baseline)
rmse_baseline = np.sqrt(mse_baseline)  # root of the MSE computed just above

print("Baseline mean absolute error", mae_baseline)
print("Baseline mean squared error", mse_baseline)
print("Baseline root mean squared error", rmse_baseline)

Baseline mean absolute error 5.956830944119795
Baseline mean squared error 54.23890199849341
Baseline root mean squared error 7.3647065113617


## Iterate

### Linear regression

In [9]:
# Pipeline: one-hot encode the categorical feature, then ordinary least squares
lr_model = make_pipeline(OneHotEncoder(), LinearRegression())
lr_model.fit(X_train, y_train)

# In-sample predictions (the same rows the model was fitted on)
y_pred_linear = lr_model.predict(X_train)

# Training-set error metrics for the linear regression model
mae_linear = mean_absolute_error(y_train, y_pred_linear)
mse_linear = mean_squared_error(y_train, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)  # root of the MSE computed just above

print("Training mean absolute error (Linear)", mae_linear)
print("Training mean squared error (Linear)", mse_linear)
print("Training root mean squared error (Linear)", rmse_linear)

Training mean absolute error (Linear) 3.3598260198110053
Training mean squared error (Linear) 17.72220391653138
Training root mean squared error (Linear) 4.209774805916746


### Ridge regression


In [10]:
# Pipeline: one-hot encode the categorical feature, then ridge regression
# with its default hyperparameters; the seed is fixed for reproducibility
ridge = make_pipeline(OneHotEncoder(), Ridge(random_state=42))
ridge.fit(X_train, y_train)

# In-sample predictions (the same rows the model was fitted on)
y_pred_ridge = ridge.predict(X_train)

# Training-set error metrics for the untuned ridge model
mae_ridge = mean_absolute_error(y_train, y_pred_ridge)
mse_ridge = mean_squared_error(y_train, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)  # root of the MSE computed just above

print("Training mean absolute error (Ridge)", mae_ridge)
print("Training mean squared error (Ridge)", mse_ridge)
print("Training root mean squared error (Ridge)", rmse_ridge)

Training mean absolute error (Ridge) 3.360799155092066
Training mean squared error (Ridge) 17.722732322520113
Training root mean squared error (Ridge) 4.2098375648616315


In [17]:
# Grid search for Ridge regression.
# Keys use the "<step>__<param>" form; "ridge" is the step name that
# make_pipeline derives from the estimator class.
ridge_parameters = {
    "ridge__alpha": [0.1, 1, 10, 100, 1000],
    "ridge__solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"],
}

# 5-fold cross-validated search over every alpha/solver combination.
# No `scoring` is given, so candidates are ranked by the pipeline's own
# score method (R^2 for scikit-learn regressors), not by MAE/MSE.
ridge_cv = GridSearchCV(estimator=ridge, param_grid=ridge_parameters, cv=5)
ridge_cv.fit(X_train, y_train)

# predict() delegates to the best pipeline, refitted on all of X_train
y_pred_ridge_cv = ridge_cv.predict(X_train)

# Winning hyperparameters and their mean cross-validated score (R^2, higher is better)
print("Best Parameters:", ridge_cv.best_params_)
print("Best Score:", ridge_cv.best_score_)

# Training-set error metrics for the tuned ridge model.
# NOTE(review): the printed labels are identical to the untuned Ridge cell's,
# so the two outputs can only be told apart by their position in the notebook.
mae_ridge_cv = mean_absolute_error(y_train, y_pred_ridge_cv)
mse_ridge_cv = mean_squared_error(y_train, y_pred_ridge_cv)
rmse_ridge_cv = np.sqrt(mse_ridge_cv)  # root of the MSE computed just above

print("Training mean absolute error (Ridge)", mae_ridge_cv)
print("Training mean squared error (Ridge)", mse_ridge_cv)
print("Training root mean squared error (Ridge)", rmse_ridge_cv)

Best Parameters: {'ridge__alpha': 1, 'ridge__solver': 'saga'}
Best Score: 0.6509778110352706
Training mean absolute error (Ridge) 3.3611494891012974
Training mean squared error (Ridge) 17.72310069381754
Training root mean squared error (Ridge) 4.209881315882615


### Lasso

In [19]:
# Pipeline: one-hot encode the categorical feature, then lasso regression
# with its default hyperparameters; the seed is fixed for reproducibility
lasso = make_pipeline(OneHotEncoder(), Lasso(random_state=42))
lasso.fit(X_train, y_train)

# In-sample predictions (the same rows the model was fitted on)
y_pred_lasso = lasso.predict(X_train)

# Training-set error metrics for the untuned lasso model
mae_lasso = mean_absolute_error(y_train, y_pred_lasso)
mse_lasso = mean_squared_error(y_train, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)  # root of the MSE computed just above

print("Training mean absolute error (Lasso)", mae_lasso)
print("Training mean squared error (Lasso)", mse_lasso)
print("Training root mean squared error (Lasso)", rmse_lasso)

Training mean absolute error (Lasso) 3.8676133080167543
Training mean squared error (Lasso) 22.908404470964527
Training root mean squared error (Lasso) 4.786272502790092


In [20]:
# Grid search for Lasso model.
# Only the regularisation strength of the "lasso" pipeline step is tuned.
lasso_parameters = {"lasso__alpha": [0.001, 0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated search. No `scoring` is given, so candidates are
# ranked by the pipeline's own score method (R^2 for scikit-learn regressors).
lasso_cv = GridSearchCV(estimator=lasso, param_grid=lasso_parameters, cv=5)
lasso_cv.fit(X_train, y_train)

# predict() delegates to the best pipeline, refitted on all of X_train
y_pred_lasso_cv = lasso_cv.predict(X_train)

# Winning hyperparameters and their mean cross-validated score (R^2, higher is better)
print("Best Parameters:", lasso_cv.best_params_)
print("Best Score:", lasso_cv.best_score_)

# Training-set error metrics for the tuned lasso model.
# NOTE(review): the printed labels are identical to the untuned Lasso cell's.
mae_lasso_cv = mean_absolute_error(y_train, y_pred_lasso_cv)
mse_lasso_cv = mean_squared_error(y_train, y_pred_lasso_cv)
rmse_lasso_cv = np.sqrt(mse_lasso_cv)  # root of the MSE computed just above

print("Training mean absolute error (Lasso)", mae_lasso_cv)
print("Training mean squared error (Lasso)", mse_lasso_cv)
print("Training root mean squared error (Lasso)", rmse_lasso_cv)

Best Parameters: {'lasso__alpha': 0.01}
Best Score: 0.6508809302002576
Training mean absolute error (Lasso) 3.3607827795791265
Training mean squared error (Lasso) 17.72275252317157
Training root mean squared error (Lasso) 4.2098399640807695


### Elastic net

In [None]:
# Build model: one-hot encode the categorical feature, then elastic net
# with its default hyperparameters.
# random_state is pinned to match the Ridge and Lasso cells. It is only
# consulted when selection="random", so the default (cyclic) fit is
# unchanged, but a later search over `selection` stays reproducible.
elastic = make_pipeline(
    OneHotEncoder(),
    ElasticNet(random_state=42)
)

# Fit model
elastic.fit(X_train, y_train)

# Predict for training data (in-sample predictions)
y_pred_elastic = elastic.predict(X_train)

# Evaluate the elastic net predictions on the training data
# Mean absolute error
mae_elastic = mean_absolute_error(y_train, y_pred_elastic)
print("Training mean absolute error (ElasticNet)", mae_elastic)

# Mean squared error
mse_elastic = mean_squared_error(y_train, y_pred_elastic)
print("Training mean squared error (ElasticNet)", mse_elastic)

# Root mean squared error (root of the MSE computed just above)
rmse_elastic = np.sqrt(mse_elastic)
print("Training root mean squared error (ElasticNet)", rmse_elastic)

### Random Forest


In [None]:
# Pipeline: one-hot encode the categorical feature, then a random forest
# with default hyperparameters; the seed makes the bootstrapping repeatable
rf_model = make_pipeline(OneHotEncoder(), RandomForestRegressor(random_state=42))
rf_model.fit(X_train, y_train)

# In-sample predictions (the same rows the model was fitted on)
y_pred_rf = rf_model.predict(X_train)

# Training-set error metrics for the random forest
mae_rf = mean_absolute_error(y_train, y_pred_rf)
mse_rf = mean_squared_error(y_train, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # root of the MSE computed just above

print("Training mean absolute error (Random Forest)", mae_rf)
print("Training mean squared error (Random Forest)", mse_rf)
print("Training root mean squared error (Random Forest)", rmse_rf)

### Gradient Boosting

In [None]:
# Pipeline: one-hot encode the categorical feature, then gradient boosting
# with default hyperparameters and a fixed seed
gb_model = make_pipeline(OneHotEncoder(), GradientBoostingRegressor(random_state=42))
gb_model.fit(X_train, y_train)

# In-sample predictions (the same rows the model was fitted on)
y_pred_gb = gb_model.predict(X_train)

# Training-set error metrics for the gradient boosting model
mae_gb = mean_absolute_error(y_train, y_pred_gb)
mse_gb = mean_squared_error(y_train, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)  # root of the MSE computed just above

print("Training mean absolute error (Gradient Boosting)", mae_gb)
print("Training mean squared error (Gradient Boosting)", mse_gb)
print("Training root mean squared error (Gradient Boosting)", rmse_gb)

### Extreme Gradient Boosting

In [None]:
# Build model: one-hot encode the categorical feature, then XGBoost with
# default hyperparameters.
# The seed is pinned like in the other ensemble cells so the fit stays
# reproducible if row/column subsampling is enabled later.
xgb_model = make_pipeline(
    OneHotEncoder(),
    XGBRegressor(random_state=42)
)

# Fit model
xgb_model.fit(X_train, y_train)

# Predict for training data (in-sample predictions)
y_pred_xgb = xgb_model.predict(X_train)

# Evaluate the XGBoost predictions on the training data
# Mean absolute error
mae_xgb = mean_absolute_error(y_train, y_pred_xgb)
print("Training mean absolute error (Extreme Gradient)", mae_xgb)

# Mean squared error
mse_xgb = mean_squared_error(y_train, y_pred_xgb)
print("Training mean squared error (Extreme Gradient)", mse_xgb)

# Root mean squared error (root of the MSE computed just above)
rmse_xgb = np.sqrt(mse_xgb)
print("Training root mean squared error (Extreme Gradient)", rmse_xgb)

### Decision tree

In [None]:
# Pipeline: ordinal-encode the categorical feature (integer codes rather
# than dummy columns), then a decision tree with default hyperparameters
tree_model = make_pipeline(OrdinalEncoder(), DecisionTreeRegressor(random_state=42))
tree_model.fit(X_train, y_train)

# In-sample predictions (the same rows the model was fitted on)
y_pred_tree = tree_model.predict(X_train)

# Training-set error metrics for the decision tree
mae_tree = mean_absolute_error(y_train, y_pred_tree)
mse_tree = mean_squared_error(y_train, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)  # root of the MSE computed just above

print("Training mean absolute error (Decision Tree)", mae_tree)
print("Training mean squared error (Decision Tree)", mse_tree)
print("Training root mean squared error (Decision Tree)", rmse_tree)

In [None]:
# Depth of the fitted decision tree. make_pipeline names each step after the
# lowercased class name, hence the "decisiontreeregressor" key.
print(tree_model.named_steps["decisiontreeregressor"].get_depth())

# Evaluate

# Communicate