In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
  
# reading the csv file
dataset = "https://raw.githubusercontent.com/Negatix092/Machine-Learning/main/ML_Class_Spring_2023/Proyecto_Final/StudentsPerformance.csv"
df = pd.read_csv(dataset)

df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# Dataset

Este dataset tiene un total de datos correspondiente a 1000 alumnos, con 5 características de los mismos. Los outputs que nos interesan son 3 y representan las últimas 3 filas del dataset: calificación de matemáticas, calificación de lectura y calificación de escritura.

#Detalle del Dataset

* ```gender``` -- ['male', 'female']
* ```race/ethnicity``` -- ['group A', 'group B', 'group C', 'group D', 'group E']
* ```parental level of education``` -- ['some high school', 'high school', "bachelor's degree", 'some college', "master's degree", "associate's degree"]
* ```lunch``` -- ['standard', 'free/reduced']
* ```test preparation course``` -- ['none', 'completed']
* ```math score``` - Student unique math score
* ```reading score``` - Student unique reading score
* ```writing score``` - Student unique writin score

In [10]:
#preparacion del dataset
from sklearn.metrics import classification_report  
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

#outputs
score_output_Math = 'math score'
score_output_Reading = 'reading score'
score_output_Writing = 'writing score'

#en la lista binary_feat se almacenan los nombres de las columnas (del dataset) que solamente tienen dos valores diferentes.
binary_feat = df.nunique()[df.nunique() == 2].keys().tolist()
#en la lista numeric_feat se almacenan los nombres de las columnas (del dataset) que solamente contienen datos numericos, sean tipo int o float, y que no tienen solo dos valores diferentes
numeric_feat = [col for col in df.select_dtypes(['float','int']).columns.tolist() if col not in binary_feat]
#en la lista categorical_feat se almacenan los nombres de las columnas (del dataset) que tienen datos no numericos y que no tienen solo dos valores diferentes.
categorical_feat = [ col for col in df.select_dtypes('object').columns.to_list() if col not in binary_feat + numeric_feat ]
#una copa del dataframe (dataset) se almacena en un nuevo dataframe para trabajarlo, df_proc
df_proc = df.copy()

#Etiquetas para características binarias
#aqui se transforman los binary features (solo dos valores diferentes) a valores numericos usando LabelEncoder
le = LabelEncoder()
for i in binary_feat:
  df_proc[i] = le.fit_transform(df_proc[i])
  print(i, '\n', np.unique(df_proc[i].values))

def get_df_size(df, header='Dataset dimensions'):
  print(header,
        '\n# Attributes: ', df.shape[1], 
        '\n# Entries: ', df.shape[0],'\n')
  
#Dummy variables
#se crean variables dummy para las categorical features
#a cada categorical feature se la convierte en multiples binary features, con un valor posible (0 o 1) para cada categorical feature original
# es decir, one-hot encoding se aplica a las categorical features
get_df_size(df_proc, header='Processed dataset before dummies:')
df_proc = pd.get_dummies(df_proc, columns=categorical_feat)
get_df_size(df, header='Original dataset:')
get_df_size(df_proc, header='Processed dataset:')
df_proc.head()

gender 
 [0 1]
lunch 
 [0 1]
test preparation course 
 [0 1]
Processed dataset before dummies: 
# Attributes:  8 
# Entries:  1000 

Original dataset: 
# Attributes:  8 
# Entries:  1000 

Processed dataset: 
# Attributes:  17 
# Entries:  1000 



Unnamed: 0,gender,lunch,test preparation course,math score,reading score,writing score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school
0,0,1,1,72,72,74,False,True,False,False,False,False,True,False,False,False,False
1,0,1,0,69,90,88,False,False,True,False,False,False,False,False,False,True,False
2,0,1,1,90,95,93,False,True,False,False,False,False,False,False,True,False,False
3,1,0,1,47,57,44,True,False,False,False,False,True,False,False,False,False,False
4,1,1,1,76,78,75,False,False,True,False,False,False,False,False,False,True,False


In [11]:
# inputs y outputs
math_score_Y = df_proc['math score']
reading_score_Y = df_proc['reading score']
writing_score_Y = df_proc['writing score']

all_X=df_proc.drop(columns =['math score','reading score','writing score'], axis=1)

#all_Y=df_proc['math score','reading score', 'writing score']

# Dividimos el conjunto de entrenamiento y test
all_X_values_train = all_X[:-200] 
all_X_values_test = all_X[-200:] #last 200 for test

#split the targets into training/testing sets
math_score_Y_train = math_score_Y[:-200]
math_score_Y_test = math_score_Y[-200:] 

In [16]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from scipy.stats import ttest_rel


#ref: https://scikit-learn.org/stable/modules/model_evaluation.html

#tenemos que usar pipelines, entonces cada tecnica de ML tendra su pipeline
#como uno de los requisitos es que al menos una tecnica de ML sea Regresion Lineal Regularizada
#hacemos el pipe para Lasso
# define pipelines

pipeline_L = Pipeline([
    
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(score_func=f_regression)),
    ('poly', PolynomialFeatures()),
    #('feature_selection', SelectFromModel(Lasso())),
    ('model', Lasso())
])

#hacemos el pipe para Ridge, con el fin de compararlo con Lasso
pipeline_R = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(score_func=f_regression)),
    ('poly', PolynomialFeatures()),
    #('feature_sel', SelectFromModel(Lasso())),
    ('model', Ridge())
])

#estos parametros son necesarios para realizar la optimizacion del modelo(encontrar los mejores hiperparametros)
p_L = {'selector__k': range(1, 6),
       'poly__degree':[0,1,2], 
       'model__alpha': [0.01, 0.1, 1, 10]}
       #'model__max_iter': [1000, 5000],
       #'model__tol': [1e-3, 1e-4]}

p_R = {'selector__k': range(1, 6),
       'poly__degree':[0,1,2], 
       'model__alpha': [0.01, 0.1, 1, 10]}
       #'model__max_iter': [1000, 5000],
       #'model__tol': [1e-3, 1e-4]}

#repeated k-fold cross validation
#Consulte qué quiere decir estratificado y por qué usted debe o no usarlo en su problema
#
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=26)

#se usa grid search para optimizar los parametros, el conjunto de validacion se utiliza repeated stratified k fold
lasso_search = GridSearchCV(pipeline_L, p_L, cv=10, scoring='r2')
ridge_search = GridSearchCV(pipeline_R, p_R, cv=10, scoring='r2')

#se hace el fit de cada regresion
result_lasso = lasso_search.fit(all_X_values_train, math_score_Y_train)
result_ridge = ridge_search.fit(all_X_values_train, math_score_Y_train)

#se quiere encontrar los mejores hiperparametros
best_hp_lasso = result_lasso.best_params_
best_hp_ridge = result_ridge.best_params_
print("Los mejores hiperparámetros para Lasso son:", best_hp_lasso)
print("Los mejores hiperparámetros para Ridge son:", best_hp_ridge) 

Los mejores hiperparámetros para Lasso son: {'model__alpha': 0.1, 'poly__degree': 1, 'selector__k': 4}
Los mejores hiperparámetros para Ridge son: {'model__alpha': 10, 'poly__degree': 1, 'selector__k': 4}


In [21]:
from sklearn.model_selection import cross_val_score
#conociendo ya los modelos con sus mejores parametros, se vuelve a definir sus pipelines
pipeline_L_best = Pipeline([
    
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(score_func=f_regression, k=best_hp_lasso['selector__k'])),
    ('poly', PolynomialFeatures(degree=best_hp_lasso['poly__degree'])),
    #('model', Lasso(alpha=best_hp_lasso['model__alpha'], max_iter=best_hp_lasso['model__max_iter'], tol=best_hp_lasso['model__tol']))
    ('model', Lasso(alpha=best_hp_lasso['model__alpha']))
])

#hacemos el pipe para Ridge, con el fin de compararlo con Lasso
pipeline_R_best = Pipeline([
    
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(score_func=f_regression, k=best_hp_ridge['selector__k'])),
    ('poly', PolynomialFeatures(degree=best_hp_ridge['poly__degree'])),
    #('model', Ridge(alpha=best_hp_ridge['model__alpha'], max_iter=best_hp_ridge['model__max_iter'], tol=best_hp_ridge['model__tol']))
    ('model', Ridge(alpha=best_hp_ridge['model__alpha']))
])

#se desea usar RepeatedStratifiedKfold crossvalidation
rkf = RepeatedStratifiedKFold(n_splits=10,n_repeats=10, random_state=26)

lasso_scores = cross_val_score(pipeline_L_best, all_X_values_train, math_score_Y_train, cv=rkf)
ridge_scores = cross_val_score(pipeline_R_best, all_X_values_train, math_score_Y_train, cv=rkf)

print("Lasso: %0.2f (+/- %0.2f)" % (lasso_scores.mean(), lasso_scores.std() * 2))
print("Ridge: %0.2f (+/- %0.2f)" % (ridge_scores.mean(), ridge_scores.std() * 2))   



Lasso: 0.19 (+/- 0.15)
Ridge: 0.19 (+/- 0.15)


In [28]:
#para cada fold realizado en el cross validation, se calcula el MSE, el R2 y el RMSE
#esto para los mejores modelos de lasso y de ridge
for i in enumerate(rkf.split(all_X_values_train, math_score_Y_train)):
    X_fold_train, Y_fold_train = all_X_values_train.iloc[i[1][0]], math_score_Y_train.iloc[i[1][0]]
    X_fold_test, Y_fold_test = all_X_values_train.iloc[i[1][1]], math_score_Y_train.iloc[i[1][1]]

    pipeline_L_best.fit(X_fold_train, Y_fold_train)
    pipeline_R_best.fit(X_fold_train, Y_fold_train)

    Y_pred_L = pipeline_L_best.predict(X_fold_test)
    Y_pred_R = pipeline_R_best.predict(X_fold_test)

    lasso_mse = mean_squared_error(Y_fold_test, Y_pred_L)
    ridge_mse = mean_squared_error(Y_fold_test, Y_pred_R)

    lasso_rmse = np.sqrt(lasso_mse)
    ridge_rmse = np.sqrt(ridge_mse)

    lasso_r2 = r2_score(Y_fold_test, Y_pred_L)
    ridge_r2 = r2_score(Y_fold_test, Y_pred_R)

    print(f"Fold {i[0]}")

    print("Metricas de Lasso:")
    print(f"MSE: {lasso_mse}")
    print(f"R2 score: {lasso_r2}")
    print(f"RMSE: {lasso_rmse}")

    print("Metricas de Ridge:")
    print(f"MSE: {ridge_mse}")
    print(f"R2 score: {ridge_r2}")
    print(f"RMSE: {ridge_rmse}")
    print("---------------------------------")



Fold 0
Metricas de Lasso:
MSE: 175.56010296517795
R2 score: 0.1563438321426136
RMSE: 13.249909545547016
Metricas de Ridge:
MSE: 176.04220765963618
R2 score: 0.15402707228565815
RMSE: 13.268089827086497
---------------------------------
Fold 1
Metricas de Lasso:
MSE: 181.3955177108645
R2 score: 0.2214868179059688
RMSE: 13.468315325639821
Metricas de Ridge:
MSE: 181.55702970085886
R2 score: 0.22079364083707753
RMSE: 13.474309989786448
---------------------------------
Fold 2
Metricas de Lasso:
MSE: 169.6615843381738
R2 score: 0.26676376681395797
RMSE: 13.025420697166515
Metricas de Ridge:
MSE: 169.5457658693089
R2 score: 0.2672643061562915
RMSE: 13.020974075287489
---------------------------------
Fold 3
Metricas de Lasso:
MSE: 167.29587172693982
R2 score: 0.18479494576503885
RMSE: 12.934290538214295
Metricas de Ridge:
MSE: 167.3251481367741
R2 score: 0.18465228667237643
RMSE: 12.935422224913035
---------------------------------
Fold 4
Metricas de Lasso:
MSE: 175.12427149710362
R2 score:



Fold 51
Metricas de Lasso:
MSE: 182.39808195668925
R2 score: 0.21718401323295145
RMSE: 13.505483403295464
Metricas de Ridge:
MSE: 182.36051442273774
R2 score: 0.21734524555428492
RMSE: 13.504092506449211
---------------------------------
Fold 52
Metricas de Lasso:
MSE: 167.46003307304917
R2 score: 0.2762783376173782
RMSE: 12.940634956332289
Metricas de Ridge:
MSE: 167.1517801695376
R2 score: 0.27761053193067065
RMSE: 12.928719200660892
---------------------------------
Fold 53
Metricas de Lasso:
MSE: 181.25777394573618
R2 score: 0.11676091038803627
RMSE: 13.463200731836995
Metricas de Ridge:
MSE: 181.73776112616022
R2 score: 0.11442201241398275
RMSE: 13.481014840365699
---------------------------------
Fold 54
Metricas de Lasso:
MSE: 162.10617805704686
R2 score: 0.27128911002445
RMSE: 12.73209244613967
Metricas de Ridge:
MSE: 161.75262284059738
R2 score: 0.27287843585719984
RMSE: 12.718200456062855
---------------------------------
Fold 55
Metricas de Lasso:
MSE: 179.314446260779
R2 sc



Fold 77
Metricas de Lasso:
MSE: 148.3520847557296
R2 score: 0.25403829659648014
RMSE: 12.179987058931122
Metricas de Ridge:
MSE: 148.42075361387577
R2 score: 0.2536930076274856
RMSE: 12.182805654440843
---------------------------------
Fold 78
Metricas de Lasso:
MSE: 240.3508724173186
R2 score: 0.09794655308696443
RMSE: 15.503253607463131
Metricas de Ridge:
MSE: 240.73426344872513
R2 score: 0.09650766003066913
RMSE: 15.515613537618329
---------------------------------
Fold 79
Metricas de Lasso:
MSE: 207.23492416970777
R2 score: 0.13902727497880207
RMSE: 14.395656434136923
Metricas de Ridge:
MSE: 207.21323058913504
R2 score: 0.1391174025538524
RMSE: 14.394902937815699
---------------------------------
Fold 80
Metricas de Lasso:
MSE: 160.9776529992834
R2 score: 0.22641996930823616
RMSE: 12.687696914699822
Metricas de Ridge:
MSE: 160.76707914826937
R2 score: 0.22743188445978235
RMSE: 12.679395851075451
---------------------------------
Fold 81
Metricas de Lasso:
MSE: 188.34732611519107
R2



In [29]:
from scipy.stats import wilcoxon

#diferencia entre cross_val_scores de lasso y ridge
score_diff = lasso_scores - ridge_scores

#con la diferencia se hace el test de wilcoxon
statistic, pvalue = wilcoxon(score_diff)

print("Test de Wilcoxon:")
print(f"Statistic: {statistic}")
print(f"P-value: {pvalue}")

Test de Wilcoxon:
Statistic: 1975.0
P-value: 0.058613844187400045


In [31]:
from scipy.stats import ttest_rel

#Es posible usar ttest_rel en lugar de wilcoxon
#se obtiene el t_value y el p_value
#t_value es la diferencia entre las medias de los scores dividida por la desviacion estandar de la diferencia
#p_value es la probabilidad de que la diferencia entre las medias sea 0.05
t_value, p_value = ttest_rel(lasso_scores, ridge_scores)

print("t-value:", t_value)
print("p-value:", p_value)

t-value: -1.5992350674988696
p-value: 0.11295509743115753


In [32]:
#se importan las metricas de sklearn
from sklearn.metrics import mean_squared_error, r2_score

#se reentrena con los mejores hiperparametros encontrados
pipeline_L_best.fit(all_X_values_train, math_score_Y_train)
pipeline_R_best.fit(all_X_values_train, math_score_Y_train)

# Make predictions on the test set for both models
lasso_preds = pipeline_L_best.predict(all_X_values_test)
ridge_preds = pipeline_R_best.predict(all_X_values_test)

# Calculate the MSE for both models
lasso_mse = mean_squared_error(math_score_Y_test, lasso_preds)
ridge_mse = mean_squared_error(math_score_Y_test, ridge_preds)

# Calculate the R2 score for both models
lasso_r2 = r2_score(math_score_Y_test, lasso_preds)
ridge_r2 = r2_score(math_score_Y_test, ridge_preds)

# Calculate the RMSE for both models
lasso_rmse = mean_squared_error(math_score_Y_test, lasso_preds, squared=False)
ridge_rmse = mean_squared_error(math_score_Y_test, ridge_preds, squared=False)

print("Lasso metrics:")
print(f"MSE: {lasso_mse}")
print(f"R2 score: {lasso_r2}")
print(f"RMSE: {lasso_rmse}")

print("Ridge metrics:")
print(f"MSE: {ridge_mse}")
print(f"R2 score: {ridge_r2}")
print(f"RMSE: {ridge_rmse}")

Lasso metrics:
MSE: 187.77807999329013
R2 score: 0.2302915232280287
RMSE: 13.7032142212435
Ridge metrics:
MSE: 187.73858926165792
R2 score: 0.2304533970255046
RMSE: 13.701773215962156
