In [None]:
# importing libraries, etc...

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

path = "https://raw.githubusercontent.com/LennardVaarten/ML-Workshops/main/data/"

The [Gapminder](https://www.gapminder.org/) dataset contains historical data (from the mid-19th century onwards) on hundreds of indicators, such as life expectancy and GDP, for countries around the world.
For our purpose, we will try to predict the life expectancy of countries based on several of these indicators.

To make experimenting with Cross-Validation and Grid Search on the life_expectancy dataset a bit more feasible, I have only included data from the year 2018. I have titled this subset of the life_expectancy dataset life_expectancy.csv.

In [None]:
# read life_expectancy.csv from the remote data folder stored in `path`

life_expectancy = pd.read_csv(f"{path}life_expectancy.csv")

In [None]:
# viewing the data: a bare expression on the last line of a cell is rendered
# by the notebook as the cell's output

life_expectancy

In [None]:
# count the missing values in every column (isnull is pandas' alias for isna)

life_expectancy.isnull().sum()

In [None]:
# fill in missing values with k-NN imputation (n_neighbors=3)
#
# Every column except the last one is imputed. The slice starts at column 0,
# which later cells use as the prediction target, so the target column takes
# part in the imputation as well.

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
life_expectancy.iloc[:, :-1] = imputer.fit_transform(life_expectancy.iloc[:, :-1])

In [None]:
# re-run the missing-value count after imputation: voila, no more missing values!

life_expectancy.isnull().sum()

In [None]:
# display the frame again, now that the imputation cell has filled it in
life_expectancy

In [None]:
# min-max scaling of the feature columns
#
# Column 0 and the last column are left untouched; everything in between is
# rescaled. NOTE: the scaler is fitted on all rows, i.e. before the train/test
# split in the next cell, so the test rows also contribute to each column's
# min and max.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
life_expectancy.iloc[:, 1:-1] = scaler.fit_transform(life_expectancy.iloc[:, 1:-1])

In [None]:
# hold out 35% of the rows as a test set; the fixed random_state makes the
# split reproducible

from sklearn.model_selection import train_test_split

features_train, features_test, target_train, target_test = train_test_split(
    life_expectancy.iloc[:, 1:-1],  # features: everything between the first and last column
    life_expectancy.iloc[:, 0],     # target: the first column
    random_state=99,
    test_size=0.35,
)

In [None]:
# scatter every training feature against the target on a 3x3 grid of axes

train = pd.concat([target_train, features_train], axis=1)

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 16))

# the target is the first column of `train`; each remaining column gets its own
# panel, filled row by row
for panel, feature_name in enumerate(train.columns[1:]):
    sns.scatterplot(data=train, x=feature_name, y=train.columns[0], ax=axes.flat[panel])

fig.tight_layout(pad=2)

In [None]:
# tuning k-NN regression with Grid Search + Cross Validation. Here, I have used 10 folds, but feel free to use
# more or fewer in the model(s) you make below!

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# candidate settings: odd neighbour counts from 1 to 11, with both weighting schemes
params = {
    "n_neighbors": list(range(1, 12, 2)),
    "weights": ["uniform", "distance"],
}

knn = GridSearchCV(KNeighborsRegressor(), params, cv=10)
knn.fit(features_train, target_train)

print(f"Training set score: {knn.score(features_train, target_train):.4f}")
print(f"Test set score: {knn.score(features_test, target_test):.4f}")
print(knn.best_params_)

Now, it's your turn to use any of the models we've discussed to see how well they perform on this task. Since this dataset is significantly smaller than the mnist (handwritten digits) dataset, it is very feasible - and practically a requirement - to use Grid Search and Cross Validation to build and test your models. Note that this is a regression problem, so classification models will not work on it. Perhaps even more important than choosing a model is trying out different parameter settings (e.g. n_neighbors for k-Nearest Neighbors, alpha for Ridge Regression, n_estimators for the Random Forest Regressor, etc...). 

Below are the regression models we've discussed, along with the import statement and the parameters that we've covered during the sessions.

- **k-Nearest Neighbors Regression** (already imported in the cell above)
    - n_neighbors (any number above 0)
    - weights ("uniform", "distance")
- **Linear Regression** (from sklearn.linear_model import LinearRegression)
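    - (no parameters to tune: plain linear regression has no regularization strength; `C` belongs to Logistic Regression, which is a classifier)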
- **Ridge Regression** (from sklearn.linear_model import Ridge)
    - alpha (any number above 0)
- **Lasso Regression** (from sklearn.linear_model import Lasso)
    - alpha (any number above 0)
- **Decision Tree Regression** (from sklearn.tree import DecisionTreeRegressor)
    - max_depth (a whole number above 0)
    - min_samples_split (a whole number above 1)
- **Random Forest Regression** (from sklearn.ensemble import RandomForestRegressor)
    - n_estimators (a whole number above 0)
    - max_depth (a whole number above 0)
    - min_samples_split (a whole number above 1)
- **Gradient Boosting Regressor** (from sklearn.ensemble import GradientBoostingRegressor)
    - n_estimators (a whole number above 0)
    - max_depth (a whole number above 0)
    - min_samples_split (a whole number above 1)
    - learning_rate (a number between 0 and 1)
    - subsample (a number between 0 and 1)
    
If you want to access even more parameter settings than we've discussed in class (models tend to have a lot), you can also access the sklearn documentation. For example, [here](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html), you can find all possible parameters to tune for the KNeighborsRegressor.

Good luck and feel free to share your model(s) (and the results you obtain with it) on the Canvas discussion page!

In [None]:
# Example without grid search: Linear Regression

from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(features_train, target_train)

print(f"Training set score: {lr.score(features_train, target_train):.4f}")
print(f"Test set score: {lr.score(features_test, target_test):.4f}")

In [None]:
# Now let's go one step further and try Gradient Boosting, with parameters optimized using Grid Search + Cross-Validation

from sklearn.ensemble import GradientBoostingRegressor

# the number of trees is fixed at 500; the grid varies learning rate and tree depth
params = {
    "n_estimators": [500],
    "learning_rate": [0.01, 0.05, 0.1, 0.15],
    "max_depth": [3, 6, None],
}

gbr = GridSearchCV(estimator=GradientBoostingRegressor(),
                   param_grid=params, cv=5, verbose=1)

gbr.fit(features_train, target_train)

print(gbr.best_params_)
# best_score_ is the mean cross-validated score of the best parameter
# combination, not a score on the full training set, so it is labelled as
# the CV score (same label as the Lasso cell uses).
print("CV score: {:.4f}".format(gbr.best_score_))
print("Test set score: {:.4f}".format(gbr.score(features_test, target_test)))

In [None]:
# To do even better, we might have to get a little more creative.
# Let's go back to using linear models, but after adding polynomial features and interaction terms.

# start again from the raw file: the earlier scaling cell overwrote the feature
# columns of life_expectancy in place, and the new features are built from the
# imputed but unscaled values
life_expectancy = pd.read_csv(f"{path}life_expectancy.csv")
imputer = KNNImputer(n_neighbors=3)
life_expectancy.iloc[:, :-1] = imputer.fit_transform(life_expectancy.iloc[:, :-1])

le_poly = life_expectancy[["country"]].copy()
feature_names = list(life_expectancy.columns[1:-1])

# every new column is inserted at position 0, so "country" stays the last column
for position, name in enumerate(feature_names):
    le_poly.insert(0, name, life_expectancy[name])
    le_poly.insert(0, f"{name}**2", life_expectancy[name] ** 2)
    # pair each feature only with the features that come after it, so that
    # every interaction term appears exactly once
    for partner in feature_names[position + 1:]:
        le_poly.insert(0, f"{name} * {partner}", life_expectancy[name] * life_expectancy[partner])

# the target goes in front, giving the layout: target | features ... | country
le_poly.insert(0, "life_expectancy_years", life_expectancy["life_expectancy_years"])

scaler = MinMaxScaler()
le_poly.iloc[:, 1:-1] = scaler.fit_transform(le_poly.iloc[:, 1:-1])

In [None]:
# display the frame holding the target, the polynomial/interaction features and the country
le_poly

In [None]:
# splitting the polynomial-feature frame, with the same test_size and
# random_state as the earlier split

poly_features_train, poly_features_test, poly_target_train, poly_target_test = train_test_split(
    le_poly.iloc[:, 1:-1],  # features: everything between the first and last column
    le_poly.iloc[:, 0],     # target: the first column
    random_state=99,
    test_size=0.35,
)

In [None]:
# linear regression with polynomial features

from sklearn.linear_model import LinearRegression

# Fit and score against the targets that came out of the polynomial split
# (poly_target_train / poly_target_test), so features and targets are
# guaranteed to be the same rows of le_poly, rather than relying on the
# targets from the earlier, separate split.
lr = LinearRegression().fit(poly_features_train, poly_target_train)

print("{:.4f}".format(lr.score(poly_features_test, poly_target_test)))

In [None]:
# Since we have so many features (and few data points), Lasso might be a good idea...

from sklearn.linear_model import Lasso

# candidate regularization strengths to try
params = {"alpha": [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]}

lasso = GridSearchCV(Lasso(), params, cv=10, n_jobs=-1, verbose=1)
lasso.fit(poly_features_train, poly_target_train)

print(lasso.best_params_)
print(f"CV score: {lasso.best_score_:.4f}")
print(f"Test set score: {lasso.score(poly_features_test, poly_target_test):.4f}")

In [None]:
# how many features were used?
# a feature counts as "used" when its coefficient in the best Lasso model is non-zero

lasso_coefs = lasso.best_estimator_.coef_
used_columns = poly_features_train.columns[(lasso_coefs != 0).ravel()]
discarded_columns = poly_features_train.columns[(lasso_coefs == 0).ravel()]

print("Total features used: {}".format(len(used_columns)))
print("Total features discarded: {}".format(len(discarded_columns)))

In [None]:
# what features were used?

print("Features used:")

# keep only the columns whose coefficient in the best Lasso model is non-zero
nonzero_mask = (lasso.best_estimator_.coef_ != 0).ravel()
for feature_name in poly_features_train.columns[nonzero_mask]:
    print(feature_name)

In [None]:
# what did our model predict?
# one row per test-set entry: country, actual target (y) and Lasso prediction (y_pred)

y_pred_y = poly_target_test.to_frame(name="y")
y_pred_y["y_pred"] = lasso.predict(poly_features_test)
# look the country names up by row label so they line up with the test rows
y_pred_y.insert(0, "country", life_expectancy.loc[y_pred_y.index, "country"])

y_pred_y

In [None]:
# just for fun, let's do a manual calculation of R2
# first ingredient: the residual (actual minus predicted) for every row, and its square

y_pred_y["residuals"] = y_pred_y["y"].sub(y_pred_y["y_pred"])
y_pred_y["squared_residuals"] = y_pred_y["residuals"].pow(2)

y_pred_y

In [None]:
# second ingredient: the mean of the actual test targets, repeated on every row
y_pred_y["mean_target_test"] = y_pred_y["y"].mean()

y_pred_y

In [None]:
# third ingredient: the squared distance of every actual value from that mean
y_pred_y["squared_residuals_from_mean"] = (y_pred_y["y"] - y_pred_y["mean_target_test"]).pow(2)

y_pred_y

In [None]:
# R2 = 1 - (sum of squared residuals) / (sum of squared deviations from the mean).
# The denominator column is the one created in the previous cell,
# "squared_residuals_from_mean"; there is no "total_squared_residuals" column,
# so looking that name up raised a KeyError.
print(1 - (sum(y_pred_y["squared_residuals"]) / sum(y_pred_y["squared_residuals_from_mean"])))