# Imports

In [None]:
import warnings

import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from scipy.spatial import distance
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler



In [None]:
warnings.filterwarnings("ignore")


# Question 1:

### Reading the dataset

In [None]:
df_raw = pd.read_csv('Volley_Ball_Score.csv')
df_raw


In [None]:
df_raw.shape


### Checking and changing types

In [None]:
# Store the two text columns as pandas categoricals; every other column keeps its dtype.
data_types_dict = dict.fromkeys(['Performance', 'Country'], 'category')
df_raw = df_raw.astype(data_types_dict)
df_raw.dtypes


There are 9 Numerical features and 2 Categorical features

In [None]:
df_raw


### Checking duplicates

In [None]:
# Count rows that are exact duplicates of another row.
# 'Unnamed: 0' is left out of the comparison: it is presumably the row id saved
# with the CSV (TODO confirm), and a unique id would make every row look distinct.
df_raw.drop(columns='Unnamed: 0').duplicated(keep=False).sum()


### Checking nulls

In [None]:
df_raw.isnull().mean()


In [None]:
df_raw.isnull().sum().sum()


3 columns have missing values, but none of them has more than 70% of its rows missing, so I will impute the missing values instead of dropping the columns. 

In [None]:
missingno.bar(df_raw.iloc[:, :], color="green", figsize=(10, 5), fontsize=12)


In [None]:
missingno.matrix(df_raw.iloc[:, :], figsize=(
    10, 5), fontsize=12, sparkline=False)


In [None]:
missingno.heatmap(df_raw.iloc[:, :8], figsize=(10, 5), fontsize=12)


Making a deep copy of the dataset and comparing different methods for filling in the missing values.

## Missing values with kNN

In [None]:
# Work on the numeric columns only: the two categorical columns and the
# 'Unnamed: 0' column are removed before imputation (drop returns a new frame,
# so df_raw itself is untouched).
df = df_raw.drop(columns=['Country', 'Performance', 'Unnamed: 0'])

# Fill every missing entry from its 3 nearest neighbours and wrap the returned
# array back into a DataFrame with the original column names.
knn_imp = KNNImputer(n_neighbors=3)
df_knn = pd.DataFrame(knn_imp.fit_transform(df), columns=df.columns)


In [None]:
df_knn.isnull().sum()


### Plots of before and after

In [None]:
# Density of Player_Score_3 before (red) and after (yellow) kNN imputation, on one set of axes.
ax = df['Player_Score_3'].plot.kde(color='r')
df_knn["Player_Score_3"].plot.kde(color='y', ax=ax)


In [None]:
# Density of Player_Score_4 before (red) and after (yellow) kNN imputation, on one set of axes.
ax = df['Player_Score_4'].plot.kde(color='r')
df_knn["Player_Score_4"].plot.kde(color='y', ax=ax)


In [None]:
# Density of Player_Score_5 before (red) and after (yellow) kNN imputation, on one set of axes.
ax = df['Player_Score_5'].plot.kde(color='r')
df_knn["Player_Score_5"].plot.kde(color='y', ax=ax)


In [None]:
# Jensen-Shannon distance between the binned Player_Score_3 distributions
# before and after kNN imputation.

# histogram of the imputed column, normalised so the bin counts sum to 1
counts_imputed, nins_imputed, values_imputed = plt.hist(df_knn["Player_Score_3"])
counts_imputed_probabilities = counts_imputed / np.sum(counts_imputed)

# histogram of the original column, normalised the same way
counts, nins, values = plt.hist(df["Player_Score_3"])
counts_probabilities = counts / np.sum(counts)

# distance between the two discrete distributions
distance.jensenshannon(counts_imputed_probabilities, counts_probabilities)


In [None]:
# Jensen-Shannon distance between the binned Player_Score_4 distributions
# before and after kNN imputation.

# histogram of the imputed column, normalised so the bin counts sum to 1
counts_imputed, nins_imputed, values_imputed = plt.hist(df_knn["Player_Score_4"])
counts_imputed_probabilities = counts_imputed / np.sum(counts_imputed)

# histogram of the original column, normalised the same way
counts, nins, values = plt.hist(df["Player_Score_4"])
counts_probabilities = counts / np.sum(counts)

# distance between the two discrete distributions
distance.jensenshannon(counts_imputed_probabilities, counts_probabilities)


In [None]:
# Jensen-Shannon distance between the binned Player_Score_5 distributions
# before and after kNN imputation.

# histogram of the imputed column, normalised so the bin counts sum to 1
counts_imputed, nins_imputed, values_imputed = plt.hist(df_knn["Player_Score_5"])
counts_imputed_probabilities = counts_imputed / np.sum(counts_imputed)

# histogram of the original column, normalised the same way
counts, nins, values = plt.hist(df["Player_Score_5"])
counts_probabilities = counts / np.sum(counts)

# distance between the two discrete distributions
distance.jensenshannon(counts_imputed_probabilities, counts_probabilities)


## Missing values with Regression

In [None]:
# Impute Player_Score_5 with a linear regression on the selected predictor columns
# (chosen by the author as the columns without NaN values).
df_regr5 = df_raw[['Player_Score_0', 'Player_Score_1',
                   'Player_Score_2', 'Player_Score_5', 'Player_Score_6']]

lr_model = LinearRegression()

# Rows with an observed target train the model; rows with a missing target are
# the ones to fill. .copy() makes both frames independent of df_regr5, so the
# column changes below are not chained assignments on a slice.
train_data = df_regr5[df_regr5['Player_Score_5'].notnull()].copy()
test_data = df_regr5[df_regr5['Player_Score_5'].isnull()].copy()

ps_5_before_imp = train_data['Player_Score_5']

y = train_data['Player_Score_5']  # target is "Player_Score_5"
# features are all other columns except "Player_Score_5"
train_data = train_data.drop(columns="Player_Score_5")

lr_model.fit(train_data, y)

test_data = test_data.drop(columns="Player_Score_5")

# infer the missing values with the learned model
pred = lr_model.predict(test_data)
test_data['Player_Score_5'] = pred

# Observed values followed by the predicted ones. pd.concat replaces
# Series.append, which was removed in pandas 2.0.
ps_5_lr = pd.concat([ps_5_before_imp, test_data['Player_Score_5']])


In [None]:
# Impute Player_Score_4 with a linear regression on the selected predictor columns
# (chosen by the author as the columns without NaN values).
df_regr4 = df_raw[['Player_Score_0', 'Player_Score_1',
                   'Player_Score_2', 'Player_Score_4', 'Player_Score_6']]

lr_model = LinearRegression()

# Rows with an observed target train the model; rows with a missing target are
# the ones to fill. .copy() makes both frames independent of df_regr4, so the
# column changes below are not chained assignments on a slice.
train_data = df_regr4[df_regr4['Player_Score_4'].notnull()].copy()
test_data = df_regr4[df_regr4['Player_Score_4'].isnull()].copy()

ps_4_before_imp = train_data['Player_Score_4']

y = train_data['Player_Score_4']  # target is "Player_Score_4"
# features are all other columns except "Player_Score_4"
train_data = train_data.drop(columns="Player_Score_4")

lr_model.fit(train_data, y)

test_data = test_data.drop(columns="Player_Score_4")

# infer the missing values with the learned model
pred = lr_model.predict(test_data)
test_data['Player_Score_4'] = pred

# Observed values followed by the predicted ones. pd.concat replaces
# Series.append, which was removed in pandas 2.0.
ps_4_lr = pd.concat([ps_4_before_imp, test_data['Player_Score_4']])


In [None]:
# Impute Player_Score_3 with a linear regression on the selected predictor columns
# (chosen by the author as the columns without NaN values).
df_regr3 = df_raw[['Player_Score_0', 'Player_Score_1',
                   'Player_Score_2', 'Player_Score_3', 'Player_Score_6']]

lr_model = LinearRegression()

# Rows with an observed target train the model; rows with a missing target are
# the ones to fill. .copy() makes both frames independent of df_regr3, so the
# column changes below are not chained assignments on a slice.
train_data = df_regr3[df_regr3['Player_Score_3'].notnull()].copy()
test_data = df_regr3[df_regr3['Player_Score_3'].isnull()].copy()

ps_3_before_imp = train_data['Player_Score_3']

y = train_data['Player_Score_3']  # target is "Player_Score_3"
# features are all other columns except "Player_Score_3"
train_data = train_data.drop(columns="Player_Score_3")

lr_model.fit(train_data, y)

test_data = test_data.drop(columns="Player_Score_3")

# infer the missing values with the learned model
pred = lr_model.predict(test_data)
test_data['Player_Score_3'] = pred

# Observed values followed by the predicted ones. pd.concat replaces
# Series.append, which was removed in pandas 2.0.
ps_3_lr = pd.concat([ps_3_before_imp, test_data['Player_Score_3']])


In [None]:
# visualising the distribution before and after imputation

# plotting the pdf before imputation (raw column, missing values still NaN) in red
df_regr5['Player_Score_5'].plot.kde(color='r')

# plotting the pdf after regression imputation (observed + predicted values) in yellow
ps_5_lr.plot.kde(color='y')


In [None]:
# visualising the distribution before and after imputation

# plotting the pdf before imputation (raw column, missing values still NaN) in red
df_regr4['Player_Score_4'].plot.kde(color='r')

# plotting the pdf after regression imputation (observed + predicted values) in yellow
ps_4_lr.plot.kde(color='y')


In [None]:
# visualising the distribution before and after imputation

# plotting the pdf before imputation (raw column, missing values still NaN) in red
df_regr3['Player_Score_3'].plot.kde(color='r')

# plotting the pdf after regression imputation (observed + predicted values) in yellow
ps_3_lr.plot.kde(color='y')


Checking both algorithms and choosing the best one

In [None]:
# Jensen-Shannon distance between the Player_Score_3 distributions before and
# after regression imputation.
# Both histograms share one set of bin edges. Regression predictions are not
# bounded by the observed min/max, so with independent default bins, bin i could
# cover a different value range in each histogram and the distance would compare
# unrelated bins. ps_3_lr contains every observed value plus the predictions,
# so its range covers both series.
bin_edges = np.histogram_bin_edges(ps_3_lr, bins=10)

# probability distribution after imputation
counts_imputed, nins_imputed, values_imputed = plt.hist(ps_3_lr, bins=bin_edges)
counts_imputed_probabilities = counts_imputed / counts_imputed.sum()

# probability distribution before imputation
counts, nins, values = plt.hist(ps_3_before_imp, bins=bin_edges)
counts_probabilities = counts / counts.sum()

# pdf distance calculation
distance.jensenshannon(counts_imputed_probabilities, counts_probabilities)


In [None]:
# Jensen-Shannon distance between the Player_Score_4 distributions before and
# after regression imputation.
# Both histograms share one set of bin edges. Regression predictions are not
# bounded by the observed min/max, so with independent default bins, bin i could
# cover a different value range in each histogram and the distance would compare
# unrelated bins. ps_4_lr contains every observed value plus the predictions,
# so its range covers both series.
bin_edges = np.histogram_bin_edges(ps_4_lr, bins=10)

# probability distribution after imputation
counts_imputed, nins_imputed, values_imputed = plt.hist(ps_4_lr, bins=bin_edges)
counts_imputed_probabilities = counts_imputed / counts_imputed.sum()

# probability distribution before imputation
counts, nins, values = plt.hist(ps_4_before_imp, bins=bin_edges)
counts_probabilities = counts / counts.sum()

# pdf distance calculation
distance.jensenshannon(counts_imputed_probabilities, counts_probabilities)


In [None]:
# Jensen-Shannon distance between the Player_Score_5 distributions before and
# after regression imputation.
# Both histograms share one set of bin edges. Regression predictions are not
# bounded by the observed min/max, so with independent default bins, bin i could
# cover a different value range in each histogram and the distance would compare
# unrelated bins. ps_5_lr contains every observed value plus the predictions,
# so its range covers both series.
bin_edges = np.histogram_bin_edges(ps_5_lr, bins=10)

# probability distribution after imputation
counts_imputed, nins_imputed, values_imputed = plt.hist(ps_5_lr, bins=bin_edges)
counts_imputed_probabilities = counts_imputed / counts_imputed.sum()

# probability distribution before imputation
counts, nins, values = plt.hist(ps_5_before_imp, bins=bin_edges)
counts_probabilities = counts / counts.sum()

# pdf distance calculation
distance.jensenshannon(counts_imputed_probabilities, counts_probabilities)


In [None]:
# plotting the pdf after imputation using knn (red)
df_knn['Player_Score_3'].plot.kde(color='r')

# plotting the pdf after imputation using linear regression (yellow)
ps_3_lr.plot.kde(color='y')

# plotting the pdf of the observed values only, i.e. without imputation (blue)
ps_3_before_imp.plot.kde(color='b')


In [None]:
# plotting the pdf after imputation using knn (red)
df_knn['Player_Score_4'].plot.kde(color='r')

# plotting the pdf after imputation using linear regression (yellow)
ps_4_lr.plot.kde(color='y')

# plotting the pdf of the observed values only, i.e. without imputation (blue)
ps_4_before_imp.plot.kde(color='b')


In [None]:
# plotting the pdf after imputation using knn (red)
df_knn['Player_Score_5'].plot.kde(color='r')

# plotting the pdf after imputation using linear regression (yellow)
ps_5_lr.plot.kde(color='y')

# plotting the pdf of the observed values only, i.e. without imputation (blue)
ps_5_before_imp.plot.kde(color='b')


## 2. Linear Regression

### (a) Verify the features values distribution of the numerical variables

In [None]:
df_knn.describe()


In [None]:
df_knn.iloc[:, :].hist(figsize=(15, 15))


In [None]:
def hist_df(data, rows_max, cols_max):
    """Show one plotly histogram per column of ``data`` in a subplot grid.

    The grid has ``rows_max`` rows and ``cols_max`` columns and is filled
    column by column: the first ``rows_max`` features go down the first grid
    column, the next ``rows_max`` down the second one, and so on.

    Args:
        data: DataFrame whose columns are plotted.
        rows_max: number of rows of the subplot grid.
        cols_max: number of columns of the subplot grid.
    """
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    fig = make_subplots(rows=rows_max, cols=cols_max)
    for position, feature in enumerate(data.columns):
        # 0-based grid coordinates of the position-th feature
        grid_col, grid_row = divmod(position, rows_max)
        fig.append_trace(go.Histogram(x=data[feature], name=feature),
                         row=grid_row + 1, col=grid_col + 1)

    fig.show()


hist_df(df_knn, rows_max=2, cols_max=6)


### (b) Is features transformation necessary for the numerical variables? Let’s take into account that we are preparing the dataset for a Linear Regression task, with the goal of building a "Score" predictive model. If transformation is necessary, after justifying your choices, do proceed as described.

In [None]:
# Scale the numerical predictors with RobustScaler from sklearn.preprocessing.
from sklearn.preprocessing import MinMaxScaler  # kept from the original cell; not used below

numerical_features = ['Player_Score_0', 'Player_Score_1', 'Player_Score_2',
                      'Player_Score_3', 'Player_Score_4', 'Player_Score_5', 'Player_Score_6', 'Score']

# The target 'Score' is carried along unscaled, so model errors stay in Score units.
feature_columns = [col for col in numerical_features if col != 'Score']

# Independent copy, so that later column assignments do not act on a slice of df_knn.
df_scaled = df_knn[numerical_features].copy()
transformer = RobustScaler().fit(df_scaled[feature_columns])
# transform() returns a new array and leaves its input untouched: the result
# has to be written back, otherwise df_scaled silently stays unscaled.
df_scaled[feature_columns] = transformer.transform(df_scaled[feature_columns])
df_scaled


In [None]:
hist_df(df_scaled, 2, 6)


### (c) Verify the presence of outliers and eventually handle them. Justify your choices.

In [None]:
for i in df_scaled.columns:
    # Outlier bounds for column i. NOTE: despite the Q1/Q3/IQR names, these are
    # the 5th and 95th percentiles, so "IQR" here is the 5%-95% range and not
    # the interquartile (25%-75%) range.
    Q1 = df_scaled[i].quantile(0.05)
    Q3 = df_scaled[i].quantile(0.95)
    IQR = Q3 - Q1
    print(IQR)

    # DROP: keep only the rows whose value in column i lies strictly inside
    # (Q1 - 1.5*IQR, Q3 + 1.5*IQR); this removes whole rows from df_scaled, so
    # later columns are evaluated on the already-filtered frame.
    logical_index_not_outliers = (df_scaled[i] > (
        Q1 - 1.5 * IQR)) & (df_scaled[i] < (Q3 + 1.5 * IQR))
    df_scaled = df_scaled[logical_index_not_outliers]
    # CAP: clip the remaining values of column i to [Q1, Q3], i.e. every value
    # below the 5th or above the 95th percentile is overwritten with that bound.
    df_scaled.loc[(df_scaled[i] < Q1), i] = Q1
    df_scaled.loc[(df_scaled[i] > Q3), i] = Q3
# (rows, columns) left after the loop
df_scaled.shape


There are no outliers to be handled since the robust scaling handled them, hence nothing was dropped

### (d) Is encoding necessary for the categorical variables? If yes, which kind of encoding? Specify your choices, justify them and perform categorical data encoding, if necessary.

In [None]:
# Attach the two categorical columns from the raw frame; the assignment aligns
# on the row index. No encoding happens here: the values are mapped to numbers
# in the next cell (with DataFrame.replace, not with sklearn).
df_scaled['Country'] = df_raw['Country']
df_scaled['Performance'] = df_raw['Performance']


In [None]:
# Ordinal encoding of Performance: the nested dict restricts the replacement to
# the 'Performance' column.
dictionary = {"Performance": {'below_average': 0, 'neutral': 1,
                              'average': 2, 'above_average': 3, 'extraordinary': 4}}
df_scaled.replace(dictionary, inplace=True)
# Integer codes for the countries. This dict is flat (not keyed by column), so
# the replacement is applied to matching values in every column of df_scaled.
dictionary = {'France': 0, 'Finland': 1, 'Germany': 2,
              'Norway': 3, 'Switzerland': 4, 'The_Netherlands': 5, 'Italy': 6}
df_scaled.replace(dictionary, inplace=True)

display(df_scaled)


### (e) Increase the dimensionality of the dataset introducing Polynomial Features – degree = 3 (continuous variables)

In [None]:
# The seven continuous predictors Player_Score_0 ... Player_Score_6 (Score is the target and is left out).
numerical_features = ['Player_Score_%d' % idx for idx in range(7)]

# Independent copy that serves as input of the polynomial expansion.
df_dimensionality = df_scaled[numerical_features].copy(deep=True)
df_dimensionality


In [None]:

# Degree-3 polynomial expansion of the continuous predictors, without the constant (bias) column.
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(df_dimensionality)


In [None]:
# Store the encoded Performance column as float64, like the other numeric predictors.
df_scaled = df_scaled.astype({'Performance': 'float64'})
df_scaled.dtypes


### (f) Eventually include any other transformation which might be necessary/appropriate and justify your choices.

## 3. Features Selection

### (a) Perform One Way ANOVA and test the relationship between variable Country and Score. Eventually, consider the possibility to remove the feature. Justify your choice.

In [None]:
groups = pd.unique(df_scaled.Country.values)
groups


In [None]:
# One Score sample per country code, keyed by that code.
groups = pd.unique(df_scaled['Country'].values)
d_data = {grp: df_scaled.loc[df_scaled['Country'] == grp, 'Score'] for grp in groups}


95% confidence test

In [None]:
# One-way ANOVA of Score across the countries, using scipy's f_oneway.
# 'France':0, 'Finland':1, 'Germany':2, 'Norway':3, 'Switzerland':4, 'The_Netherlands':5, 'Italy':6
from scipy import stats

# One Score sample per country actually present in the data (no fixed keys 0..6).
samples = list(d_data.values())
F, p = stats.f_oneway(*samples)

# Critical value of the 95% confidence test, taken from the F(k - 1, N - k)
# distribution with k groups and N observations. It depends on the data's
# degrees of freedom, so it cannot be a fixed constant.
dfn = len(samples) - 1
dfd = sum(len(sample) for sample in samples) - len(samples)
Fcritical = stats.f.ppf(0.95, dfn, dfd)


In [None]:
# 95% confidence test: reject H0 (equal mean Score in every country) when the
# ANOVA p-value is below 0.05. The p-value already accounts for the degrees of
# freedom, so the decision does not rely on a tabulated critical F value.
if p < 0.05:
    print("reject null hypothesis H0")
else:
    print("accept null hypothesis H0")


We fail to reject the null hypothesis, meaning there is no significant difference in mean Score between the countries, so I won't include this feature for model training.

In [None]:
# Modelling frame: everything in df_scaled except Country (drop returns a new frame).
df_model = df_scaled.drop(columns='Country')
df_model


### (b) Perform Features Selection and visualize the features which have been selected. Select one appropriate methodology for features selection and justify your choice.

In [None]:
# Separate the target from the predictors, then hold out half of the rows for testing.
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns='Score')
X_train, X_test, y_train, y_test = train_test_split(
    df_train, y, test_size=.5, random_state=23102002)

In [None]:
# Univariate selection: keep the 50% of features with the highest F-score.
# Score is a continuous regression target, so the regression F-test
# (f_regression) is used; the default f_classif is meant for class labels.
# No integer cast of y_train is needed any more (the original cast ran only
# after the fit and therefore had no effect on the selection).
select = SelectPercentile(score_func=f_regression, percentile=50)
select.fit(X_train, y_train)
# transform train set
X_train_selected = select.transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))


In [None]:
# Boolean mask with one entry per feature (True = feature kept by the selector).
mask = select.get_support()

print(mask)

# visualize the mask -- black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# the mask is indexed by feature, not by sample
plt.xlabel("Feature index")
plt.yticks(())


In [None]:
# Separate the target from the predictors, then hold out half of the rows for testing.
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns='Score')
X_train, X_test, y_train, y_test = train_test_split(
    df_train, y, test_size=.5, random_state=23102002)



In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

# Select the k best features by their univariate F-score against the target.
# Score is a continuous regression target, so f_regression is used instead of
# the default f_classif (ANOVA F-value for classification labels).
k_best = SelectKBest(score_func=f_regression, k=6)  # keep the 6 highest-scoring features
fit = k_best.fit(X_train, y_train)

# transform training set
X_train_selected = k_best.transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))


In [None]:
# Boolean mask with one entry per feature (True = feature kept by the selector).
mask = k_best.get_support()
print(mask)
# visualize the mask -- black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# the mask is indexed by feature, not by sample
plt.xlabel("Feature index")
plt.yticks(())


Won't use this feature selection method because it's not really useful for the amount of features we have and I think we will need most of the features present in the dataset

In [None]:
# Separate the target from the predictors, then hold out half of the rows for testing.
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns='Score')
X_train, X_test, y_train, y_test = train_test_split(
    df_train, y, test_size=.5, random_state=23102002)


In [None]:
# Model-based selection driven by random-forest feature importances.
# Score is a continuous target, so the forest has to be a regressor; a
# classifier would treat every distinct Score value as a separate class
# (or reject a non-integer target outright).
select = SelectFromModel(RandomForestRegressor(
    n_estimators=100, random_state=24))


In [None]:
# Fit the selector on the training split and keep only the columns it selects.
X_train_l1 = select.fit(X_train, y_train).transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_l1.shape: {}".format(X_train_l1.shape))


In [None]:
# Boolean mask with one entry per feature (True = feature kept by the selector).
mask = select.get_support()
# visualize the mask -- black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# the mask is indexed by feature, not by sample
plt.xlabel("Feature index")
plt.yticks(())


In [None]:
# Separate the target from the predictors, then hold out half of the rows for testing.
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns='Score')
X_train, X_test, y_train, y_test = train_test_split(
    df_train, y, test_size=.5, random_state=23102002)


In [None]:
# RFE=Feature ranking with recursive feature elimination.
# Score is a continuous target, so the model that ranks the features is a
# RandomForestRegressor (a classifier would treat each Score value as a class).

select = RFE(RandomForestRegressor(n_estimators=100,
             random_state=24), n_features_to_select=5)

select.fit(X_train, y_train)
# visualize the selected features:
mask = select.get_support()  # one boolean per feature, True = selected
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# the mask is indexed by feature, not by sample
plt.xlabel("Feature index")
plt.yticks(())


## 4. Linear Regression

In [None]:
from scipy import stats

def gradient_descent_2(eta, X, Y, numIterations, plot=True):
    """Fit linear-regression parameters with batch gradient descent.

    Every iteration uses all rows of ``X`` to compute the gradient of the
    mean squared error and moves ``theta`` one step of size ``eta`` against it.

    Args:
        eta: learning rate (step size).
        X: 2-D design matrix, one row per sample. The intercept is only
            learned if the caller has added a column of ones.
        Y: target values, one per row of ``X``.
        numIterations: number of gradient steps to perform.
        plot: when True (default), draw after each step the line
            ``theta[0] + theta[1] * X[:, 1]`` on the current matplotlib axes,
            i.e. the current fit projected on the second column of ``X``.

    Returns:
        1-D NumPy array with one parameter per column of ``X``.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    s, n_params = X.shape
    # One parameter per column of X (was hard-coded to 9 entries), all starting at 1.
    theta = np.ones(n_params)
    X_transpose = X.transpose()
    for _ in range(numIterations):
        hypothesis = np.dot(X, theta)
        loss = hypothesis - Y
        gradient = np.dot(X_transpose, loss) / s
        theta = theta - eta * gradient

        if plot and n_params > 1:
            # Use the X that was passed in, not a notebook-level X_train, and
            # a single feature column so that exactly one line is drawn per step.
            x_first = X[:, 1]
            plt.plot(x_first, theta[0] + theta[1] * x_first, 'r')

    return theta


In [None]:
final_rmse = []

### (a) Train a Multiple Linear Regression model, using the Sklearn implementation of Linear Regression to find the best 𝜽 vector. Use all the transformed features, excluding the derived polynomial features. Evaluate the model with the learned 𝜽 on the test set.


In [None]:
# Predictors = every column of df_model except the target; default 75/25 train/test split.
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns='Score')

X_train, X_test, y_train, y_test = train_test_split(df_train, y, random_state=24)




In [None]:
# Ordinary least squares on the training split, then predictions for the test split.
lin_reg = LinearRegression().fit(X_train, y_train)
print("Intercept={}, Slope={}".format(lin_reg.intercept_, lin_reg.coef_))
y_predict = lin_reg.predict(X_test)


In [None]:
# Actual vs predicted Score for every test sample.
# Both series are drawn against the same positional x-axis: passing the y_test
# Series on its own would place it on its DataFrame index (shuffled by the
# split), while the prediction array sits on 0..n-1, so the two curves would
# not line up sample by sample.
sample_pos = np.arange(len(y_test))
sns.lineplot(x=sample_pos, y=np.asarray(y_test), label="actual")
sns.lineplot(x=sample_pos, y=y_predict, color='r', label="predictions")
plt.xlabel('Test sample')
plt.ylabel('Score')
plt.legend()


In [None]:
# Compute each metric once, print it and record the RMSE pair for the summary table.
y_train_predict = lin_reg.predict(X_train)
test_rmse = np.sqrt(mean_squared_error(y_test, y_predict))
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predict))

print("test RMSE={}".format(test_rmse))
print("test R2={}".format(r2_score(y_test, y_predict)))

print("train RMSE={}".format(train_rmse))
print("train R2={}".format(r2_score(y_train, y_train_predict)))

final_rmse.append({'Linear Regression test': test_rmse})
final_rmse.append({'Linear Regression train': train_rmse})



### (b) Use all the transformed features, excluding the derived polynomial features, to identify the best values of 𝜽 by means of a Batch Gradient Descent procedure. Identify the best values of 𝜼 (starting with an initial value of 𝜼 = 0.1 ). Evaluate the model with the trained 𝜽 on the test set. Plot the train and the test error for increasing number of iterations of the Gradient Descent procedure (with the best value of 𝜼). Provide a comment of the plot.


In [None]:
# Predictors = every column of df_model except the target; default 75/25 train/test split.
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns='Score')

X_train, X_test, y_train, y_test = train_test_split(df_train, y, random_state=24)


In [None]:
# Prepend a column of ones (intercept term) to the training matrix; the result is a NumPy array.
# NOTE: X_train is rebuilt from itself, so re-running this cell adds another
# column of ones each time; X_test is not extended here.
X_train = np.c_[np.ones(X_train.shape[0]), X_train]  # insert column


In [None]:
# Batch gradient descent, 100 iterations, learning rate 0.01.
s, q = X_train.shape
eta = 0.01  # learning rate
theta = gradient_descent_2(eta=eta, X=X_train, Y=y_train, numIterations=100)
print(theta)
# training targets against the first feature (column 0 is the column of ones)
sns.scatterplot(x=X_train[:, 1], y=y_train)


In [None]:
# Batch gradient descent, 100 iterations, learning rate 0.1.
s, q = X_train.shape
eta = 0.1  # learning rate
theta = gradient_descent_2(eta=eta, X=X_train, Y=y_train, numIterations=100)
print(theta)
# training targets against the first feature (column 0 is the column of ones)
sns.scatterplot(x=X_train[:, 1], y=y_train)


In [None]:
from scipy import stats

# Batch gradient descent, 100 iterations, learning rate 0.2.
s, q = X_train.shape
eta = 0.2  # learning rate
theta = gradient_descent_2(eta=eta, X=X_train, Y=y_train, numIterations=100)
print(theta)
# training targets against the first feature (column 0 is the column of ones)
sns.scatterplot(x=X_train[:, 1], y=y_train)


In [None]:
from scipy import stats

# Batch gradient descent, 100 iterations, learning rate 0.3.
s, q = X_train.shape
eta = 0.3  # learning rate
theta = gradient_descent_2(eta=eta, X=X_train, Y=y_train, numIterations=100)
print(theta)
# training targets against the first feature (column 0 is the column of ones)
sns.scatterplot(x=X_train[:, 1], y=y_train)


In [None]:
from scipy import stats

# Batch gradient descent, 100 iterations, learning rate 0.4.
s, q = X_train.shape
eta = 0.4  # learning rate
theta = gradient_descent_2(eta=eta, X=X_train, Y=y_train, numIterations=100)
print(theta)
# training targets against the first feature (column 0 is the column of ones)
sns.scatterplot(x=X_train[:, 1], y=y_train)


The sweet spot is eta = 0.3

### (c) Use the complete set of features, including the derived polynomial features. Train a Multiple Linear Regression model, using the Sklearn implementation of Linear Regression to find the best 𝜽 vector. Evaluate the model with the learned 𝜽 on the test set. Plot the train and the test error for increasing the size of the train-set (with the best value of 𝜼). Provide a comment of the plot.


In [None]:
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns=['Score'])

# X_poly is a bare NumPy array computed from the same rows as df_model, so it
# is rebuilt with df_train's index: the column-wise concat aligns on row labels
# and would otherwise create NaN-filled rows whenever that index is not 0..n-1.
# String column names are used because scikit-learn rejects frames whose
# column labels mix int and str.
df_poly = pd.DataFrame(
    X_poly,
    index=df_train.index,
    columns=["poly_{}".format(i) for i in range(X_poly.shape[1])],
)

df_new_train = pd.concat([df_poly, df_train], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    df_new_train, y, random_state=24)


In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the polynomial + original features; the fitted estimator is the cell output.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg


In [None]:
# Actual vs predicted Score on the test split. The predictions are made for
# X_test, so they are compared with y_test (not y_train), and both series share
# one positional x-axis so that they line up sample by sample.
sample_pos = np.arange(len(y_test))
sns.lineplot(x=sample_pos, y=np.asarray(y_test), label='actual')
sns.lineplot(x=sample_pos, y=lin_reg.predict(X_test), label='prediction', color='r')


### (d) Use the complete set of features, including the derived polynomial features. Train a Ridge Regression model identifying the best value of the learning rate 𝜶 that allows the model to achieve the best generalization performances. Evaluate the model.


In [None]:
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns=['Score'])

# X_poly is a bare NumPy array computed from the same rows as df_model, so it
# is rebuilt with df_train's index: the column-wise concat aligns on row labels
# and would otherwise create NaN-filled rows whenever that index is not 0..n-1.
# String column names are used because scikit-learn rejects frames whose
# column labels mix int and str.
df_poly = pd.DataFrame(
    X_poly,
    index=df_train.index,
    columns=["poly_{}".format(i) for i in range(X_poly.shape[1])],
)

df_new_train = pd.concat([df_poly, df_train], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    df_new_train, y, random_state=24)


In [None]:
# Ridge regression for a range of regularisation strengths alpha; test and
# train RMSE are recorded for each one.
alphas = [0.5, 1, 2, 3, 4, 5, 6, 7, 10, 11,
          20, 30, 40, 50, 60, 70, 80, 90, 100,150,200]
rmse_values = []
rmse_values_train =[]

for alpha in alphas:
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    y_predict = ridge_model.predict(X_test)
    rmse_values.append(np.sqrt(mean_squared_error(y_test, y_predict)))
    rmse_values_train.append(np.sqrt(mean_squared_error(y_train, ridge_model.predict(X_train))))

plt.plot(alphas, rmse_values)
plt.xlabel('alpha')
plt.ylabel("RMSE")

for i, j in zip(alphas, rmse_values):
    print('Alpha = {}, RMSE = {}'.format(i, j))

# The reported model is the one with the lowest test RMSE. Its train RMSE is
# taken at the same alpha; the minimum of the train errors on its own would
# belong to a different (less regularised) model.
best_idx = int(np.argmin(rmse_values))
print("Minimum test-RMSE = {}".format(rmse_values[best_idx]))
print("Best alpha = {}".format(alphas[best_idx]))


final_rmse.append({'Ridge Regression test': rmse_values[best_idx]})
final_rmse.append({'Ridge Regression train': rmse_values_train[best_idx]})

### (e) Use the complete set of features, including the derived polynomial features. Train a Linear Regression model with Lasso regularization. Comment on the importance of each feature given the related trained parameter value of the trained model. Also, verify the number of features selected (related coefficient 𝜽 different from zero) with different values of 𝛼.


In [None]:
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns=['Score'])

# X_poly is a bare NumPy array computed from the same rows as df_model, so it
# is rebuilt with df_train's index: the column-wise concat aligns on row labels
# and would otherwise create NaN-filled rows whenever that index is not 0..n-1.
# String column names are used because scikit-learn rejects frames whose
# column labels mix int and str.
df_poly = pd.DataFrame(
    X_poly,
    index=df_train.index,
    columns=["poly_{}".format(i) for i in range(X_poly.shape[1])],
)

df_new_train = pd.concat([df_poly, df_train], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    df_new_train, y, random_state=24)


In [None]:
# Lasso regression for a range of regularisation strengths alpha; test and
# train RMSE are recorded for each one.
alphas = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006,
          0.008, 0.1, 0.2, 1, 1.4, 1.45, 1.5, 1.6, 2, 3]
rmse_values = []
rmse_values_train = []

for alpha in alphas:
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_predict = lasso_model.predict(X_test)
    rmse_values.append(np.sqrt(mean_squared_error(y_test, y_predict)))
    # Train error of the Lasso model fitted in this iteration (the original
    # evaluated the leftover ridge_model from the Ridge cell here).
    rmse_values_train.append(np.sqrt(mean_squared_error(y_train, lasso_model.predict(X_train))))

plt.plot(alphas, rmse_values)
plt.xlabel('alpha')
plt.ylabel("RMSE")

for i, j in zip(alphas, rmse_values):
    print('Alpha = {}, RMSE = {}'.format(i, j))

# The reported model is the one with the lowest test RMSE; its train RMSE is
# taken at the same alpha so that both table entries describe one model.
best_idx = int(np.argmin(rmse_values))
print("Minimum test-RMSE = {}".format(rmse_values[best_idx]))
print("Best alpha = {}".format(alphas[best_idx]))

final_rmse.append({'Lasso Regression test': rmse_values[best_idx]})
final_rmse.append({'Lasso Regression train': rmse_values_train[best_idx]})

### (f) Use the subset of features selected in the Feature Selection task (question 3b). Train a Multiple Linear Regression model using the Sklearn implementation of Linear Regression to find the best 𝜽 vector. Evaluate the model.


In [None]:
# Predictors = df_model without the target and without 'Performance'; default 75/25 split.
y = df_model['Score'].copy(deep=True)
df_train_features = df_model.drop(columns=['Score', 'Performance'])

X_train, X_test, y_train, y_test = train_test_split(df_train_features, y, random_state=24)


In [None]:
# Ordinary least squares on the reduced feature set, then predictions for the test split.
lin_reg = LinearRegression().fit(X_train, y_train)
print("Intercept={}, Slope={}".format(lin_reg.intercept_, lin_reg.coef_))

y_predict = lin_reg.predict(X_test)


In [None]:
# Actual vs predicted Score for every test sample.
# Both series are drawn against the same positional x-axis: passing the y_test
# Series on its own would place it on its DataFrame index (shuffled by the
# split), while the prediction array sits on 0..n-1, so the two curves would
# not line up sample by sample.
sample_pos = np.arange(len(y_test))
sns.lineplot(x=sample_pos, y=np.asarray(y_test), label="actual")
sns.lineplot(x=sample_pos, y=y_predict, color='r', label="predictions")
plt.xlabel('Test sample')
plt.ylabel('Score')
plt.legend()


In [None]:
# Compute each metric once, print it and record the RMSE pair for the summary table.
y_train_predict = lin_reg.predict(X_train)
test_rmse = np.sqrt(mean_squared_error(y_test, y_predict))
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predict))

print("test RMSE={}".format(test_rmse))
print("test R2={}".format(r2_score(y_test, y_predict)))

print("train RMSE={}".format(train_rmse))
print("train R2={}".format(r2_score(y_train, y_train_predict)))

final_rmse.append({'Linear Regression features test': test_rmse})
final_rmse.append({'Linear Regression features train': train_rmse})

### (g) Create a table with the evaluation results obtained from all the models above on both the train and test sets.

In [None]:
def plot_learning_curves(model, X, y, model_type, c):
    """Plot train and validation RMSE of ``model`` for growing training-set sizes.

    ``X``/``y`` are split 70/30 (fixed random_state=42). For every size ``m``
    from 1 to the full training split, the model is refitted on the first
    ``m`` training rows and evaluated on those rows and on the held-out 30%.
    Both RMSE curves are drawn on the current matplotlib axes.

    Args:
        model: estimator with ``fit(X, y)`` and ``predict(X)``; it is refitted
            in place for every size.
        X: feature matrix (DataFrame or array), one row per sample.
        y: target values aligned with ``X``.
        model_type: text appended to the "train_" / "test_" legend labels.
        c: unused; kept so that existing calls keep working.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
    train_errors, test_errors = [], []

    # up to and including the full training split
    for m in range(1, len(X_train) + 1):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        # Evaluate on the validation rows split off above. The notebook-level
        # X_test / y_test may come from a different feature set than X and
        # would then not even have the right number of columns.
        y_val_predict = model.predict(X_val)

        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        test_errors.append(mean_squared_error(y_val, y_val_predict))

    plt.plot(np.sqrt(train_errors), 'b', linewidth=2, label="train_"+model_type)
    plt.plot(np.sqrt(test_errors), 'g' , linewidth=3, label="test_"+model_type)

In [None]:
# Target and plain (non-polynomial) predictors.
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns=['Score'])

# Polynomial + plain features; built here, but the split below uses df_train only.
df_poly = pd.DataFrame(X_poly)
df_new_train = pd.concat([df_poly, df_train], axis=1).copy(deep=True)

X_train, X_test, y_train, y_test = train_test_split(df_train, y, random_state=24)


In [None]:
# Learning curve of plain linear regression on the non-polynomial features.
fig, ax = plt.subplots(figsize=(10, 10))
lin_reg = LinearRegression()
plot_learning_curves(lin_reg, df_train, y, "LinearRegression", 'g')
ax.set_title('Linear Regression, learning curve')
ax.set_xlabel('Training set size')
ax.set_ylabel('RMSE')
ax.legend()

In [None]:
y = df_model['Score'].copy(deep=True)
df_train = df_model.drop(columns=['Score'])

# X_poly is a bare NumPy array computed from the same rows as df_model, so it
# is rebuilt with df_train's index: the column-wise concat aligns on row labels
# and would otherwise create NaN-filled rows whenever that index is not 0..n-1.
# String column names are used because scikit-learn rejects frames whose
# column labels mix int and str.
df_poly = pd.DataFrame(
    X_poly,
    index=df_train.index,
    columns=["poly_{}".format(i) for i in range(X_poly.shape[1])],
)

df_new_train = pd.concat([df_poly, df_train], axis=1).copy(deep=True)

X_train, X_test, y_train, y_test = train_test_split(
    df_new_train, y, random_state=24)


In [None]:
# Learning curve of Lasso (alpha=3) on the polynomial + plain features.
fig, ax = plt.subplots(figsize=(10, 10))
lin_reg = Lasso(alpha=3)
plot_learning_curves(lin_reg, df_new_train, y, "Lasso", 'g')
ax.set_title('Lasso Regression, learning curve (alpha=3)')
ax.set_xlabel('Training set size')
ax.set_ylabel('RMSE')
ax.legend()

In [None]:
y = df_model['Score'].copy(deep=True)
df_train_features = df_model.drop(columns=['Score', 'Performance'])

# X_poly is a bare NumPy array computed from the same rows as df_model, so it
# is rebuilt with df_train_features' index: the column-wise concat aligns on
# row labels and would otherwise create NaN-filled rows whenever that index is
# not 0..n-1. String column names are used because scikit-learn rejects frames
# whose column labels mix int and str.
df_poly = pd.DataFrame(
    X_poly,
    index=df_train_features.index,
    columns=["poly_{}".format(i) for i in range(X_poly.shape[1])],
)

df_new_train = pd.concat([df_poly, df_train_features], axis=1).copy(deep=True)

# Split the frame that the Ridge learning curve below is run on, so that the
# notebook-level X_test / y_test have the same columns as the data the model
# is fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    df_new_train, y, random_state=24)

In [None]:
# Learning curve of Ridge (alpha=60, Cholesky solver) on the polynomial + selected features.
fig, ax = plt.subplots(figsize=(10, 10))
lin_reg = Ridge(alpha=60, solver="cholesky")
plot_learning_curves(lin_reg, df_new_train, y, "Ridge", 'g')
ax.set_title('Ridge Regression, learning curve (alpha=60)')
ax.set_xlabel('Training set size')
ax.set_ylabel('RMSE')
ax.legend()

In [None]:
# Predictors = df_model without the target and without 'Performance'; default 75/25 split.
y = df_model['Score'].copy(deep=True)
df_train_features = df_model.drop(columns=['Score', 'Performance'])

X_train, X_test, y_train, y_test = train_test_split(df_train_features, y, random_state=24)


In [None]:
# Learning curve of linear regression on the selected features only.
fig, ax = plt.subplots(figsize=(10, 10))
lin_reg = LinearRegression()
plot_learning_curves(lin_reg, df_train_features, y, "LinearRegression", 'g')
ax.set_title('Linear Regression, Features, learning curve')
ax.set_xlabel('Training set size')
ax.set_ylabel('RMSE')
ax.legend()

In [None]:
final_rmse

### (h) Compare and discuss the results obtained above.