In [30]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import sklearn 
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.model_selection import cross_val_score
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# dataset: https://www.kaggle.com/datasets/barkhaverma/student-performance

In [31]:
# importing and understanding of data.
data_names = ['school', 'sex', 'age', 'address', 'family_size', 'parent_cohabition_status', 'mother_education', 'father_education','mother_job', 'father_job', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'school_support', 'family_support', 'paid', 'extra_curricular_activities', 'nursery', 'higher', 'internet_access', 'romantic', 'famrel', 'freetime', 'going_out', 'workday_alcohol', 'weekend_alcohol', 'health', 'absences', 'grade_period1', 'grade_period2', 'final_grade' ]
data = pd.read_csv('/content/student-por.csv', sep=",")
data.columns = data_names
dataframe = pd.DataFrame(data, columns=data_names)
dataframe.head(10)
# data.describe(['age'])
# data['grade_period1'].describe()
# data = data.drop_duplicates()

# print('Maximum grade period one:', data['grade_period1'].max())
# print('Minimum age:', data['age'].min())

Unnamed: 0,school,sex,age,address,family_size,parent_cohabition_status,mother_education,father_education,mother_job,father_job,...,famrel,freetime,going_out,workday_alcohol,weekend_alcohol,health,absences,grade_period1,grade_period2,final_grade
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
5,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,6,12,12,13
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,13,12,13
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,2,10,13,13
8,GP,M,15,U,LE3,A,3,2,services,other,...,4,2,2,1,1,1,0,15,16,17
9,GP,M,15,U,GT3,T,3,4,other,other,...,5,5,1,1,1,5,0,12,12,13


In [32]:
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)

# the final grade seems to be most negatively correlated with number of failures in past classes and also weekday alcohol consumption
# it seems to be most positively correlated with studytime, mother's education (almost as much as studytime!), but mainly grades from previos periods. 

Unnamed: 0,age,mother_education,father_education,traveltime,studytime,failures,famrel,freetime,going_out,workday_alcohol,weekend_alcohol,health,absences,grade_period1,grade_period2,final_grade
age,1.0,-0.11,-0.12,0.03,-0.01,0.32,-0.02,-0.0,0.11,0.13,0.09,-0.01,0.15,-0.17,-0.11,-0.11
mother_education,-0.11,1.0,0.65,-0.27,0.1,-0.17,0.02,-0.02,0.01,-0.01,-0.02,0.0,-0.01,0.26,0.26,0.24
father_education,-0.12,0.65,1.0,-0.21,0.05,-0.17,0.02,0.01,0.03,0.0,0.04,0.04,0.03,0.22,0.23,0.21
traveltime,0.03,-0.27,-0.21,1.0,-0.06,0.1,-0.01,0.0,0.06,0.09,0.06,-0.05,-0.01,-0.15,-0.15,-0.13
studytime,-0.01,0.1,0.05,-0.06,1.0,-0.15,-0.0,-0.07,-0.08,-0.14,-0.21,-0.06,-0.12,0.26,0.24,0.25
failures,0.32,-0.17,-0.17,0.1,-0.15,1.0,-0.06,0.11,0.05,0.11,0.08,0.04,0.12,-0.38,-0.39,-0.39
famrel,-0.02,0.02,0.02,-0.01,-0.0,-0.06,1.0,0.13,0.09,-0.08,-0.09,0.11,-0.09,0.05,0.09,0.06
freetime,-0.0,-0.02,0.01,0.0,-0.07,0.11,0.13,1.0,0.35,0.11,0.12,0.08,-0.02,-0.09,-0.11,-0.12
going_out,0.11,0.01,0.03,0.06,-0.08,0.05,0.09,0.35,1.0,0.25,0.39,-0.02,0.09,-0.07,-0.08,-0.09
workday_alcohol,0.13,-0.01,0.0,0.09,-0.14,0.11,-0.08,0.11,0.25,1.0,0.62,0.06,0.17,-0.2,-0.19,-0.2


In [33]:
# Lasso Regression is a popular type of regularized linear regression that includes an L1 penalty.
# This penalty allows some coefficient values to go to the value of zero, allowing input variables to be effectively removed from the model, providing a type of automatic feature selection.

# We start with the numerical features: 

numerical_columns = ['age', 'mother_education', 'father_education', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'going_out', 'workday_alcohol', 'weekend_alcohol', 'health', 'absences', 'grade_period1', 'grade_period2', 'final_grade']
# print(len(numerical_columns))
X = data[numerical_columns]   # size: 10384
y = data['final_grade']       # size: 649


# 80/20 split, random_state: Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)  

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

lasso_model = Lasso(copy_X = True)

lasso_model.fit(X_train, y_train)
yhat_lasso = lasso_model.predict(X_test)
# print(yhat_lasso)

mse = mean_squared_error(y_test, yhat_lasso)   # the average squared difference between the estimated value and the actual value

print("Mean squared error on test set: %.3f" % mse)



Mean squared error on test set: 0.144


In [34]:
# Now we make the same thing with all features.

categorical_columns = ['school', 'sex', 'family_size', 'parent_cohabition_status', 'mother_job', 'father_job', 'reason', 'guardian', 'school_support', 'family_support', 'paid', 'extra_curricular_activities', 'nursery', 'higher', 'internet_access', 'romantic']
# print(len(categorical_columns))
encoded_data = pd.DataFrame()
for column in categorical_columns:
  LE = LabelEncoder()
  dt = LE.fit_transform(data[column])
  encoded_data[column] = dt

for column in numerical_columns:
  encoded_data[column] = data[column]

cat_and_num = categorical_columns + numerical_columns
# print(len(cat_and_num))

scaler = StandardScaler()
scaled_data = scaler.fit_transform(encoded_data)
scaled_dataframe = pd.DataFrame(scaled_data, columns = cat_and_num)
scaled_dataframe.head()


Unnamed: 0,school,sex,family_size,parent_cohabition_status,mother_job,father_job,reason,guardian,school_support,family_support,...,famrel,freetime,going_out,workday_alcohol,weekend_alcohol,health,absences,grade_period1,grade_period2,final_grade
0,-0.730944,-0.833377,-0.648175,-2.666927,-1.556453,2.057248,-0.933974,0.332648,2.923032,-1.259229,...,0.072606,-0.171647,0.693785,-0.543555,-0.997695,-0.371042,0.073433,-4.15547,-0.19582,-0.280658
1,-0.730944,-0.833377,-0.648175,0.374963,-1.556453,-0.260728,-0.933974,-1.594926,-0.34211,0.794137,...,1.119748,-0.171647,-0.15738,-0.543555,-0.997695,-0.371042,-0.357863,-0.87457,-0.19582,-0.280658
2,-0.730944,-0.833377,1.542792,0.374963,-1.556453,-0.260728,0.745109,0.332648,2.923032,-1.259229,...,0.072606,-0.171647,-1.008546,0.538553,0.560678,-0.371042,0.50473,0.219064,0.491137,0.029116
3,-0.730944,-0.833377,-0.648175,0.374963,-0.754756,0.89826,-0.094432,0.332648,-0.34211,0.794137,...,-0.974536,-1.123771,-1.008546,-0.543555,-0.997695,1.012903,-0.789159,0.948153,0.834615,0.648663
4,-0.730944,-0.833377,-0.648175,0.374963,0.046941,-0.260728,-0.094432,-1.594926,-0.34211,0.794137,...,0.072606,-0.171647,-1.008546,-0.543555,-0.218508,1.012903,-0.789159,-0.145481,0.491137,0.338889


In [35]:
corr = encoded_data.corr()
corr.style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)

Unnamed: 0,school,sex,family_size,parent_cohabition_status,mother_job,father_job,reason,guardian,school_support,family_support,paid,extra_curricular_activities,nursery,higher,internet_access,romantic,age,mother_education,father_education,traveltime,studytime,failures,famrel,freetime,going_out,workday_alcohol,weekend_alcohol,health,absences,grade_period1,grade_period2,final_grade
school,1.0,-0.08,0.02,0.03,-0.21,-0.08,-0.11,-0.06,-0.12,-0.06,-0.01,-0.09,0.0,-0.14,-0.24,0.07,0.09,-0.25,-0.21,0.25,-0.14,0.11,-0.03,0.03,0.04,0.05,0.01,-0.06,-0.16,-0.29,-0.27,-0.28
sex,-0.08,1.0,0.1,0.06,0.15,0.08,0.01,-0.04,-0.11,-0.13,0.08,0.12,-0.04,-0.06,0.07,-0.11,-0.04,0.12,0.08,0.04,-0.21,0.07,0.08,0.15,0.06,0.28,0.32,0.14,0.02,-0.1,-0.1,-0.13
family_size,0.02,0.1,1.0,-0.24,0.02,-0.06,0.03,0.0,-0.06,-0.04,-0.05,-0.01,0.1,0.0,0.01,-0.03,-0.0,-0.01,-0.04,0.01,-0.01,-0.07,0.0,-0.02,-0.0,0.06,0.08,0.0,0.0,0.05,0.04,0.05
parent_cohabition_status,0.03,0.06,-0.24,1.0,-0.03,0.05,-0.03,-0.17,-0.01,0.01,0.02,0.1,-0.03,0.02,0.06,-0.05,-0.01,-0.06,-0.03,0.04,-0.01,-0.01,0.05,0.04,0.03,0.04,0.07,0.01,-0.12,0.02,0.02,-0.0
mother_job,-0.21,0.15,0.02,-0.03,1.0,0.2,0.06,0.01,0.01,0.04,0.01,0.1,0.04,0.15,0.26,-0.07,-0.07,0.46,0.29,-0.16,0.06,-0.12,0.03,0.05,0.0,0.05,0.03,0.08,0.03,0.18,0.15,0.15
father_job,-0.08,0.08,-0.06,0.05,0.2,1.0,0.04,-0.08,-0.01,-0.04,-0.02,0.02,-0.05,0.09,0.09,-0.0,-0.05,0.15,0.21,0.0,-0.02,-0.06,0.04,-0.04,-0.03,0.06,0.04,-0.03,-0.05,0.11,0.09,0.05
reason,-0.11,0.01,0.03,-0.03,0.06,0.04,1.0,-0.07,0.05,0.01,-0.05,0.08,0.04,0.09,0.11,-0.05,-0.03,0.13,0.08,-0.09,0.14,-0.14,0.04,-0.05,-0.01,-0.01,0.01,-0.12,0.02,0.16,0.16,0.12
guardian,-0.06,-0.04,0.0,-0.17,0.01,-0.08,-0.07,1.0,-0.04,0.02,0.07,0.01,-0.02,-0.11,-0.0,0.11,0.27,-0.01,-0.1,0.03,-0.01,0.17,-0.05,0.05,0.05,0.02,-0.01,0.02,0.15,-0.12,-0.1,-0.08
school_support,-0.12,-0.11,-0.06,-0.01,0.01,-0.01,0.05,-0.04,1.0,0.08,0.04,-0.03,0.02,0.09,-0.03,-0.09,-0.17,-0.02,0.02,-0.04,0.09,-0.0,-0.01,-0.02,-0.06,-0.03,-0.1,0.02,-0.06,-0.07,-0.06,-0.07
family_support,-0.06,-0.13,-0.04,0.01,0.04,-0.04,0.01,0.02,0.08,1.0,0.09,-0.01,0.03,0.09,0.07,-0.02,-0.1,0.12,0.14,-0.04,0.14,-0.01,0.02,0.0,0.02,-0.02,-0.07,0.02,0.04,0.04,0.04,0.06


In [36]:
X_encoded = encoded_data       # all attributes, size: 20768
y_encoded = encoded_data["final_grade"]

X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=1)  

lasso_model_total = Lasso(copy_X = True, alpha = 0.1)   # A LASSO model with all the features

lasso_model_total.fit(X_train_encoded, y_train_encoded)
yhat_tot_lasso = lasso_model_total.predict(X_test_encoded)
# print(yhat_tot_lasso)

In [37]:
mse_tot = mean_squared_error(y_test_encoded, yhat_tot_lasso)   

print("Mean squared error on test set: %.10f" % mse_tot)

rmse_tot = mean_squared_error(y_test_encoded, yhat_tot_lasso, squared = False)  

print("Root mean squared error on test set: %.10f" % rmse_tot)  # RMSE measures the average difference between values predicted by a model and the actual values.
# The lower the RMSE, the better a given model is able to “fit” a dataset.

r2_tot_score = r2_score(y_test_encoded, yhat_tot_lasso)   # Coefficient of determination. Best possible r2 score is 1.0
print("R^2 score on the test set_ %.10f" % r2_tot_score)

cross_val = cross_val_score(lasso_model_total, X, y, cv=10,scoring=None).mean()

print("Cross-validation accuracy score:", cross_val) 
# Splits the data into 10 subsets, trains the model on 9 subsets and tests its performance on the remaining subset, and repeats this process 10 times, each time using a different subset for testing. The resulting cross-validation score is the average accuracy of the model across all 10 folds.

Mean squared error on test set: 0.0014372667
Root mean squared error on test set: 0.0379113009
R^2 score on the test set_ 0.9998929913
Cross-validation accuracy score: 0.9998886713755042


In [38]:
coefficients = lasso_model.coef_
# print(coefficients)
important_features = X.columns[coefficients != 0]
print(data[important_features])

"""feature_coefficients = pd.DataFrame(list(zip(important_features, coefficients[coefficients != 0])), columns=['feature', 'coefficient'])
print(feature_coefficients)

print(feature_coefficients)"""

     final_grade
0             11
1             11
2             12
3             14
4             13
..           ...
644           10
645           16
646            9
647           10
648           11

[649 rows x 1 columns]


"feature_coefficients = pd.DataFrame(list(zip(important_features, coefficients[coefficients != 0])), columns=['feature', 'coefficient'])\nprint(feature_coefficients)\n\nprint(feature_coefficients)"