Jairo El Yazidi Rios

Tea Shkurti
# Tutorial 1 - Recommender Systems

Bootstrap is a statistical resampling technique that is used to estimate the quality of a model. It involves creating a large number of random samples with replacement from a dataset and using each of these samples to build and evaluate a model. By aggregating the results from each of these models, we can get a more accurate estimate of the model's performance.

In this tutorial, we will explore the steps involved in using bootstrap to estimate the quality of a model. We will use the Python programming language and several popular libraries for data manipulation and modeling, including numpy, pandas, and scikit-learn.

Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

**Loading the data**

We are analyzing a bank dataset that includes demographic features of clients, such as their education level and whether they have a loan, as well as information about whether they accepted or declined a product offered to them during a contact with the bank. The dataset provides valuable insights into client behavior and can help the bank optimize its sales strategies.

In [None]:
# Read the bank marketing data; fields in the file are separated by ';'.
df = pd.read_csv(
    'bank-additional-full.csv',
    sep=";",
)
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


Columns with real-valued variables

In [None]:
# Positional indices of the columns that are treated as real-valued features.
realColumns = [
    0,                   # age
    10, 11,              # duration, campaign
    13,                  # previous
    15, 16, 17, 18, 19,  # emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed
]
df_real = df.iloc[:, realColumns]
df_real

Unnamed: 0,age,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,261,1,0,1.1,93.994,-36.4,4.857,5191.0
1,57,149,1,0,1.1,93.994,-36.4,4.857,5191.0
2,37,226,1,0,1.1,93.994,-36.4,4.857,5191.0
3,40,151,1,0,1.1,93.994,-36.4,4.857,5191.0
4,56,307,1,0,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...
41183,73,334,1,0,-1.1,94.767,-50.8,1.028,4963.6
41184,46,383,1,0,-1.1,94.767,-50.8,1.028,4963.6
41185,56,189,2,0,-1.1,94.767,-50.8,1.028,4963.6
41186,44,442,1,0,-1.1,94.767,-50.8,1.028,4963.6


Columns with categorical variables

In [None]:
# Every column position in 0..20 that is not listed in realColumns is
# treated as categorical (this includes the last column of the range).
all_positions = np.arange(0, 21)
is_real = np.isin(all_positions, np.array(realColumns))
categoricalColumns = all_positions[~is_real]

df_categorical = df.iloc[:, categoricalColumns]
df_categorical

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,pdays,poutcome,y
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,999,nonexistent,no
1,services,married,high.school,unknown,no,no,telephone,may,mon,999,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,may,mon,999,nonexistent,no
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,999,nonexistent,no
4,services,married,high.school,no,no,yes,telephone,may,mon,999,nonexistent,no
...,...,...,...,...,...,...,...,...,...,...,...,...
41183,retired,married,professional.course,no,yes,no,cellular,nov,fri,999,nonexistent,yes
41184,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,999,nonexistent,no
41185,retired,married,university.degree,no,yes,no,cellular,nov,fri,999,nonexistent,no
41186,technician,married,professional.course,no,no,no,cellular,nov,fri,999,nonexistent,yes


Converting categorical variables to dummy (one-hot) variables

In [None]:
# Preview the one-hot encoding on the first categorical column.
first_categorical = df_categorical.iloc[:, 0]
pd.get_dummies(first_categorical)

Unnamed: 0,admin.,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed,unknown
0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
41183,0,0,0,0,0,1,0,0,0,0,0,0
41184,0,1,0,0,0,0,0,0,0,0,0,0
41185,0,0,0,0,0,1,0,0,0,0,0,0
41186,0,0,0,0,0,0,0,0,0,1,0,0


**Final dataset**

In [None]:
# Real-valued features, rescaled column by column with MinMaxScaler.
scaler = MinMaxScaler()
X = scaler.fit_transform(np.array(df_real))

# One-hot encode every categorical column (in column order) and append all
# of the resulting blocks to the right of the scaled real features.
dummy_blocks = [
    np.array(pd.get_dummies(df_categorical.iloc[:, col]))
    for col in range(df_categorical.shape[1])
]
X = np.hstack([X] + dummy_blocks)

print(X.shape)
X

(41188, 91)


array([[0.48148148, 0.05307035, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.49382716, 0.03029687, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.24691358, 0.04595364, 0.        , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.48148148, 0.03843026, 0.01818182, ..., 0.        , 1.        ,
        0.        ],
       [0.33333333, 0.08987393, 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.7037037 , 0.04859699, 0.03636364, ..., 0.        , 1.        ,
        0.        ]])

Extracting the target variable

In [None]:
# Split the assembled matrix into target and features.
# The last categorical column that was one-hot encoded is `y`, so X ends with
# its dummy columns (two of them, presumably "no" then "yes" -- confirm the
# category order). The last column becomes the binary label, and BOTH dummy
# columns are dropped from the features so the label is not left in X.
y = X[:, -1]
X = X[:, :-2]

print(X.shape)
print(y.shape)

print(f"# of positive classes: {y.sum()}")
print(f"# of negative classes: {y.shape[0]-y.sum()}")

(41188, 89)
(41188,)
# of positive classes: 4640.0
# of negatime classes: 36548.0


Training and test split

In [None]:
# Hold out 5% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=123,
)
print(f"Training {X_train.shape} {y_train.shape}")
print(f"Test {X_test.shape} {y_test.shape}")

Training (39128, 89) (39128,)
Test (2060, 89) (2060,)


Classifying and measuring accuracy with the KNN algorithm

In [None]:
# Fit a 100-nearest-neighbours classifier on the training split and show the
# fraction of test rows whose predicted label matches the true one.
model = KNeighborsClassifier(n_neighbors=100).fit(X_train, y_train)
prediction = model.predict(X_test)
(prediction == y_test).mean()

0.9033980582524271

# Exercise

In this exercise, we will use bootstrapping to estimate the generalization error of the K-nearest neighbors (KNN) algorithm using various metrics including f1 score, accuracy, recall, and precision. Follow the instructions below and report the results:

For each metric, repeat the following process 100 times:

* Randomly select 10% of the data as the test set.
* Split the remaining 90% of the data into a training set and a validation set (in a 90/10 ratio).
* Fit the KNN model for different values of K and evaluate the validation performance according to the selected metric.
* Determine the best value of K based on the validation performance.
* Estimate the performance of the KNN model with the best K using the test set.

Finally, for each metric, calculate the average performance across the 100 executions.

In [None]:
# Exercise setup: rebuild the full matrix X from the raw CSV file.
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

df = pd.read_csv('bank-additional-full.csv', sep=";")

# Real-valued columns are selected by position; every other position in
# 0..20 is treated as categorical.
realColumns = [0, 10, 11, 13, 15, 16, 17, 18, 19]
column_positions = np.arange(0, 21)
categoricalColumns = column_positions[~np.isin(column_positions, np.array(realColumns))]
df_real = df.iloc[:, realColumns]
df_categorical = df.iloc[:, categoricalColumns]

# X = min-max scaled real features followed by the one-hot block of each
# categorical column. Position 20 is among the categorical columns, so its
# dummy columns are the rightmost columns of X and are NOT removed here.
scaler = MinMaxScaler()
X = np.hstack(
    [scaler.fit_transform(np.array(df_real))]
    + [
        np.array(pd.get_dummies(df_categorical.iloc[:, j]))
        for j in range(df_categorical.shape[1])
    ]
)

In [None]:
# Accuracy
# Repeated hold-out estimate of the KNN test accuracy, with K selected on a
# validation split in every repetition.
# Assumes X ends with the two one-hot columns of the target `y`, the last one
# being the positive-class indicator (this is how the setup cell builds X).
neighbors = [1, 2, 3, 4, 5, 10, 18, 25]  # candidate values of K
scores = []  # test accuracy of the selected model, one entry per repetition
for i in range(100):
    # 10% of the rows become the test set; the seed changes with the repetition.
    data_train, data_test = train_test_split(X, test_size=0.1, random_state=i)
    # The remaining 90% is split 90/10 into training and validation sets.
    data_train, data_val = train_test_split(data_train, test_size=0.1, random_state=i)

    # Remove BOTH target dummy columns from the features. Dropping only the
    # last column would keep the complementary dummy (1 - y) as a feature,
    # which leaks the label into the model.
    X_train, y_train = data_train[:, :-2], data_train[:, -1]
    X_val, y_val = data_val[:, :-2], data_val[:, -1]
    X_test, y_test = data_test[:, :-2], data_test[:, -1]

    # Model selection: keep the fitted model with the best validation accuracy.
    best_acc = -np.inf
    for k in neighbors:
        candidate = KNeighborsClassifier(n_neighbors=k)
        candidate.fit(X_train, y_train)
        acc = accuracy_score(y_val, candidate.predict(X_val))
        if acc > best_acc:  # strict comparison: ties keep the smaller K
            best_acc, K, model = acc, k, candidate
    print(K)

    # The selected model is already fitted on X_train, so it is reused
    # instead of being trained a second time.
    prediction = model.predict(X_test)
    acc = accuracy_score(y_test, prediction)
    scores.append(acc)
    print(scores)
print(np.mean(scores))

3
[0.9417334304442826]
5
[0.9417334304442826, 0.9380917698470502]
3
[0.9417334304442826, 0.9380917698470502, 0.9470745326535567]
5
[0.9417334304442826, 0.9380917698470502, 0.9470745326535567, 0.943918426802622]
5
[0.9417334304442826, 0.9380917698470502, 0.9470745326535567, 0.943918426802622, 0.9373634377276038]
3
[0.9417334304442826, 0.9380917698470502, 0.9470745326535567, 0.943918426802622, 0.9373634377276038, 0.9393056567127943]
3
[0.9417334304442826, 0.9380917698470502, 0.9470745326535567, 0.943918426802622, 0.9373634377276038, 0.9393056567127943, 0.9446467589220685]
3
[0.9417334304442826, 0.9380917698470502, 0.9470745326535567, 0.943918426802622, 0.9373634377276038, 0.9393056567127943, 0.9446467589220685, 0.9380917698470502]
5
[0.9417334304442826, 0.9380917698470502, 0.9470745326535567, 0.943918426802622, 0.9373634377276038, 0.9393056567127943, 0.9446467589220685, 0.9380917698470502, 0.9373634377276038]
5
[0.9417334304442826, 0.9380917698470502, 0.9470745326535567, 0.94391842680262

KeyboardInterrupt: ignored

In [None]:
# Recall
# Repeated hold-out estimate of the KNN test recall, with K selected on a
# validation split in every repetition.
# Assumes X ends with the two one-hot columns of the target `y`, the last one
# being the positive-class indicator (this is how the setup cell builds X).
neighbors = [1, 2, 3, 4, 5, 10, 18, 25]  # candidate values of K
scores = []  # test recall of the selected model, one entry per repetition
for i in range(100):
    # 10% of the rows become the test set; the seed changes with the repetition.
    data_train, data_test = train_test_split(X, test_size=0.1, random_state=i)
    # The remaining 90% is split 90/10 into training and validation sets.
    data_train, data_val = train_test_split(data_train, test_size=0.1, random_state=i)

    # Remove BOTH target dummy columns from the features. Dropping only the
    # last column would keep the complementary dummy (1 - y) as a feature,
    # which leaks the label into the model.
    X_train, y_train = data_train[:, :-2], data_train[:, -1]
    X_val, y_val = data_val[:, :-2], data_val[:, -1]
    X_test, y_test = data_test[:, :-2], data_test[:, -1]

    # Model selection: keep the fitted model with the best validation recall.
    best_rec = -np.inf
    for k in neighbors:
        candidate = KNeighborsClassifier(n_neighbors=k)
        candidate.fit(X_train, y_train)
        rec = recall_score(y_val, candidate.predict(X_val))
        if rec > best_rec:  # strict comparison: ties keep the smaller K
            best_rec, K, model = rec, k, candidate
    print(K)

    # The selected model is already fitted on X_train, so it is reused
    # instead of being trained a second time.
    prediction = model.predict(X_test)
    rec = recall_score(y_test, prediction)
    scores.append(rec)
    print(scores)
print(np.mean(scores))

In [None]:
# Precision
# Repeated hold-out estimate of the KNN test precision, with K selected on a
# validation split in every repetition.
# Assumes X ends with the two one-hot columns of the target `y`, the last one
# being the positive-class indicator (this is how the setup cell builds X).
neighbors = [1, 2, 3, 4, 5, 10, 18, 25]  # candidate values of K
scores = []  # test precision of the selected model, one entry per repetition
for i in range(100):
    # 10% of the rows become the test set; the seed changes with the repetition.
    data_train, data_test = train_test_split(X, test_size=0.1, random_state=i)
    # The remaining 90% is split 90/10 into training and validation sets.
    data_train, data_val = train_test_split(data_train, test_size=0.1, random_state=i)

    # Remove BOTH target dummy columns from the features. Dropping only the
    # last column would keep the complementary dummy (1 - y) as a feature,
    # which leaks the label into the model.
    X_train, y_train = data_train[:, :-2], data_train[:, -1]
    X_val, y_val = data_val[:, :-2], data_val[:, -1]
    X_test, y_test = data_test[:, :-2], data_test[:, -1]

    # Model selection: keep the fitted model with the best validation precision.
    best_pre = -np.inf
    for k in neighbors:
        candidate = KNeighborsClassifier(n_neighbors=k)
        candidate.fit(X_train, y_train)
        pre = precision_score(y_val, candidate.predict(X_val))
        if pre > best_pre:  # strict comparison: ties keep the smaller K
            best_pre, K, model = pre, k, candidate
    print(K)

    # The selected model is already fitted on X_train, so it is reused
    # instead of being trained a second time.
    prediction = model.predict(X_test)
    pre = precision_score(y_test, prediction)
    scores.append(pre)
    print(scores)
print(np.mean(scores))

In [None]:
# F1 score
# Repeated hold-out estimate of the KNN test F1 score, with K selected on a
# validation split in every repetition.
# Assumes X ends with the two one-hot columns of the target `y`, the last one
# being the positive-class indicator (this is how the setup cell builds X).
neighbors = [1, 2, 3, 4, 5, 10, 18, 25]  # candidate values of K
scores = []  # test F1 score of the selected model, one entry per repetition
for i in range(100):
    # 10% of the rows become the test set; the seed changes with the repetition.
    data_train, data_test = train_test_split(X, test_size=0.1, random_state=i)
    # The remaining 90% is split 90/10 into training and validation sets.
    data_train, data_val = train_test_split(data_train, test_size=0.1, random_state=i)

    # Remove BOTH target dummy columns from the features. Dropping only the
    # last column would keep the complementary dummy (1 - y) as a feature,
    # which leaks the label into the model.
    X_train, y_train = data_train[:, :-2], data_train[:, -1]
    X_val, y_val = data_val[:, :-2], data_val[:, -1]
    X_test, y_test = data_test[:, :-2], data_test[:, -1]

    # Model selection: keep the fitted model with the best validation F1 score.
    best_f1 = -np.inf
    for k in neighbors:
        candidate = KNeighborsClassifier(n_neighbors=k)
        candidate.fit(X_train, y_train)
        f1 = f1_score(y_val, candidate.predict(X_val))
        if f1 > best_f1:  # strict comparison: ties keep the smaller K
            best_f1, K, model = f1, k, candidate
    print(K)

    # The selected model is already fitted on X_train, so it is reused
    # instead of being trained a second time.
    prediction = model.predict(X_test)
    f1 = f1_score(y_test, prediction)
    scores.append(f1)
    print(scores)
print(np.mean(scores))