In [None]:
# importing libraries, etc...

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# base URL of the workshop's data folder on GitHub; file names are appended to it with `+`
path = "https://raw.githubusercontent.com/LennardVaarten/ML-Workshops/main/data/"

The [Gapminder](https://www.gapminder.org/) dataset contains historical data (from the mid-19th century onwards) on hundreds of indicators, such as life expectancy and GDP, for countries around the world.
For our purposes, we will try to predict the life expectancy of countries based on several of these indicators. I have only included data from the year 2018.

In [None]:
# loading the life-expectancy CSV from the workshop's data folder into a DataFrame

life_expectancy = pd.read_csv(f"{path}life_expectancy.csv")

In [None]:
# viewing the freshly loaded DataFrame (a bare name as the last line of a cell is displayed)

life_expectancy

In [None]:
# counting the missing values in each column

life_expectancy.isnull().sum()

In [None]:
# filling in missing values with k-NN imputation (n_neighbors=3);
# every column except the last one is passed to the imputer
# NOTE(review): that slice includes column 0, which the later split uses as the target,
# and the imputer is fit on all rows before the train/test split -- confirm that this
# leakage is acceptable for the workshop

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
imputer.fit(life_expectancy.iloc[:, :-1])

# write the imputed values back into the same columns of the DataFrame
life_expectancy.iloc[:, :-1] = imputer.transform(life_expectancy.iloc[:, :-1])

In [None]:
# voila: re-counting the missing values per column after imputation

life_expectancy.isnull().sum()

In [None]:
# viewing the DataFrame again, now with the imputed values filled in
life_expectancy

In [None]:
# min-max scaling of the feature columns (everything except the first and the last column)
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split below,
# so the test rows influence the scaling -- confirm this is intended for the workshop

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(life_expectancy.iloc[:, 1:-1])

# overwrite the feature columns with their scaled values
life_expectancy.iloc[:, 1:-1] = scaler.transform(life_expectancy.iloc[:, 1:-1])

In [None]:
# splitting into training set and test set

from sklearn.model_selection import train_test_split

features_train, features_test, target_train, target_test = train_test_split(life_expectancy.iloc[:,1:-1],
                                                                            life_expectancy.iloc[:,0],
                                                                            test_size=0.35,
                                                                            random_state=99)

In [None]:
# scatter plot of every feature against the target, using the training rows only;
# target first, then the features, so column 0 is always the y-axis variable
train = pd.concat([target_train, features_train], axis=1)

target_name = train.columns[0]
feature_names = train.columns[1:]

# derive the grid from the number of features instead of assuming exactly 3x3:
# more than 9 features no longer raises an IndexError, fewer no longer leaves empty rows
n_cols = 3
n_rows = max(1, -(-len(feature_names) // n_cols))  # ceiling division, at least one row

# squeeze=False keeps `axes` two-dimensional even when there is only one row;
# the height scales with the row count so a 3-row grid keeps the original (18, 16) size
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 16 * n_rows / 3), squeeze=False)

# ravel() walks the grid row by row, matching the original axes[i//3, i%3] order
flat_axes = axes.ravel()
for ax, feature in zip(flat_axes, feature_names):
    sns.scatterplot(data=train, ax=ax, x=feature, y=target_name)

# hide the panels that did not receive a feature
for ax in flat_axes[len(feature_names):]:
    ax.set_visible(False)

fig.tight_layout(pad=2)

Let's get to work: try changing the parameters `n_neighbors` and `weights` in the *k*-NN model below. What happens to the scores on the training set and the test set as you tweak these parameters? (For a regressor, `.score()` returns $R^2$, so higher is better and 1.0 is a perfect fit.) What's the best model you can find? Share it on the discussion board!

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-NN regressor; n_neighbors and weights are the two parameters to experiment with
knn = KNeighborsRegressor(n_neighbors=5, weights="uniform")
knn.fit(features_train, target_train)

# score the fitted model on the data it was trained on and on the held-out test data
knn_train_score = knn.score(features_train, target_train)
knn_test_score = knn.score(features_test, target_test)

print(f"Training set score: {knn_train_score:.4f}")
print(f"Test set score: {knn_test_score:.4f}")

Now, run a linear regression model. You can simply copy some of my code from the 'schools' notebook. Does it do better than *k*-NN?

In [None]:
from sklearn.linear_model import LinearRegression

# linear regression on the same training data, for comparison with the k-NN model
lr = LinearRegression()
lr.fit(features_train, target_train)

# score the fitted model on the data it was trained on and on the held-out test data
lr_train_score = lr.score(features_train, target_train)
lr_test_score = lr.score(features_test, target_test)

print(f"Training set score: {lr_train_score:.4f}")
print(f"Test set score: {lr_test_score:.4f}")