In [1]:
import warnings

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score

import factor_analyzer as fact

import funciones_auxiliares_hito_3 as aux3
from funciones_auxiliares_hito_2 import preprocesar_y_recodificar_enunciado_dos

# Warnings are currently left enabled; uncomment the next line to silence them.
# warnings.filterwarnings('ignore')

# Activate seaborn's theme first, then set the default figure size so the
# size assignment is the last thing written to rcParams.
sns.set_theme()
plt.rcParams.update({'figure.figsize': (12, 6)})

In [2]:
# Read the pipe-delimited students file and pass it straight through the
# project's preprocessing/recoding helper; `df` is the processed frame that
# every later cell works from.
df = (
    pd.read_csv('students.csv', sep='|')
    .pipe(preprocesar_y_recodificar_enunciado_dos)
)

# Análisis de factores latentes en las variables famrel:health

In [3]:
# Candidate columns for the latent-factor analysis: the label slice from
# 'famrel' through 'health' (pandas label slices include both endpoints).
posibles_factores_latentes = df.loc[:, 'famrel':'health']

# Bartlett's sphericity test on those columns; the result is printed as-is.
resultado_bartlett = fact.calculate_bartlett_sphericity(posibles_factores_latentes)
print(resultado_bartlett)

(264.4978629665067, 6.03816710126852e-48)


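Como el p-valor del test de esfericidad de Bartlett es menor a 0.05, se rechaza la hipótesis nula de que la matriz de correlaciones es la identidad. Por lo tanto, existe al menos una correlación entre dos variables que es distinta de 0.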

In [4]:
# Kaiser-Meyer-Olkin test on the same candidate columns; the result is
# displayed as the cell's output (per-variable values and the overall value).
resultado_kmo = fact.calculate_kmo(posibles_factores_latentes)
resultado_kmo

(array([0.50173343, 0.57692329, 0.61683967, 0.58664172, 0.5554092 ,
        0.59265861]),
 0.5767240731164592)

Como el valor global del test de Kaiser-Meyer-Olkin (KMO = 0.577) es menor a 0.7, los datos NO son adecuados
para un análisis factorial; por lo tanto, no se extraerán factores latentes de estas variables.

# Modelo para G1

In [7]:
# Predictors for G1: every column except the other two grades, so G2 and G3
# cannot leak into the model. G1 itself stays in the frame as the target.
datos_g1 = df.drop(columns=['G2', 'G3'])

# Fit the model through the project helper (presumably a statsmodels OLS,
# judging by the summary it produces) and show its summary.
modelo_g1 = aux3.crear_modelo_lineal(datos_g1, 'G1')
modelo_g1.summary()

0,1,2,3
Dep. Variable:,G1,R-squared:,0.331
Model:,OLS,Adj. R-squared:,0.224
Method:,Least Squares,F-statistic:,3.09
Date:,"Sat, 04 Feb 2023",Prob (F-statistic):,5.75e-08
Time:,23:08:48,Log-Likelihood:,-683.63
No. Observations:,284,AIC:,1447.0
Df Residuals:,244,BIC:,1593.0
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9.6531,3.452,2.797,0.006,2.855,16.452
school,-0.3886,0.666,-0.583,0.560,-1.701,0.924
sex,0.7631,0.432,1.768,0.078,-0.087,1.613
age,0.0772,0.185,0.418,0.677,-0.287,0.441
address,-0.0205,0.487,-0.042,0.966,-0.980,0.939
famsize,0.3125,0.440,0.710,0.478,-0.555,1.180
Pstatus,0.1872,0.650,0.288,0.774,-1.094,1.468
Medu,0.1809,0.282,0.642,0.521,-0.374,0.736
Fedu,0.1527,0.236,0.646,0.519,-0.313,0.618

0,1,2,3
Omnibus:,6.431,Durbin-Watson:,2.003
Prob(Omnibus):,0.04,Jarque-Bera (JB):,4.178
Skew:,0.128,Prob(JB):,0.124
Kurtosis:,2.464,Cond. No.,406.0


Al realizar un modelo de regresion lineal con todas las variables y G1 como vector objetivo se
obtuvo lo siguiente:

- El modelo presenta un $R^2$ = 0.331. Por lo tanto, las variables explican un 33.1% de la variacion
de ```G1```
- Solo el intercepto, ```studytime```, ```failures```, ```schoolsup```, ```famsup```, ```goout``` y ```health``` afectan significativamente a la variable ```G1```
- Obtuvo un Log-Likelihood de -683.63

# Modelo para G2

In [9]:
# Predictors for G2: every column except the other two grades, so G1 and G3
# cannot leak into the model. G2 itself stays in the frame as the target.
datos_g2 = df.drop(columns=['G1', 'G3'])

# Fit the model through the project helper (presumably a statsmodels OLS,
# judging by the summary it produces) and show its summary.
modelo_g2 = aux3.crear_modelo_lineal(datos_g2, 'G2')
modelo_g2.summary()

0,1,2,3
Dep. Variable:,G2,R-squared:,0.301
Model:,OLS,Adj. R-squared:,0.189
Method:,Least Squares,F-statistic:,2.693
Date:,"Sat, 04 Feb 2023",Prob (F-statistic):,2.14e-06
Time:,23:14:57,Log-Likelihood:,-729.98
No. Observations:,284,AIC:,1540.0
Df Residuals:,244,BIC:,1686.0
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,12.9915,4.063,3.197,0.002,4.988,20.995
school,0.2397,0.785,0.305,0.760,-1.306,1.785
sex,0.8885,0.508,1.748,0.082,-0.112,1.889
age,-0.0696,0.218,-0.320,0.749,-0.498,0.359
address,-0.7540,0.573,-1.315,0.190,-1.883,0.375
famsize,0.4639,0.518,0.895,0.372,-0.557,1.485
Pstatus,0.5952,0.766,0.777,0.438,-0.913,2.103
Medu,0.3221,0.332,0.971,0.332,-0.331,0.975
Fedu,-0.1232,0.278,-0.443,0.658,-0.671,0.425

0,1,2,3
Omnibus:,12.83,Durbin-Watson:,1.773
Prob(Omnibus):,0.002,Jarque-Bera (JB):,14.472
Skew:,-0.426,Prob(JB):,0.00072
Kurtosis:,3.705,Cond. No.,406.0


Al realizar un modelo de regresion lineal con todas las variables y G2 como vector objetivo se
obtuvo lo siguiente:

- El modelo presenta un $R^2$ = 0.301. Por lo tanto, las variables explican un 30.1% de la variacion
de ```G2```
- Solo el intercepto, ```failures```, ```famsup```, ```goout``` y ```health``` afectan significativamente a la variable ```G2```
- Obtuvo un Log-Likelihood de -730

Por lo tanto, al comparar este modelo con el que predice G1, el modelo para predecir G1 es mejor: tiene un mayor $R^2$ (0.331 frente a 0.301) y un mayor Log-Likelihood (-683.63 frente a -729.98).

# Modelo para G3

In [10]:
# Predictors for G3: every column except the other two grades, so G1 and G2
# cannot leak into the model. G3 itself stays in the frame as the target.
datos_g3 = df.drop(columns=['G1', 'G2'])

# Fit the model through the project helper (presumably a statsmodels OLS,
# judging by the summary it produces) and show its summary.
modelo_g3 = aux3.crear_modelo_lineal(datos_g3, 'G3')
modelo_g3.summary()

0,1,2,3
Dep. Variable:,G3,R-squared:,0.312
Model:,OLS,Adj. R-squared:,0.201
Method:,Least Squares,F-statistic:,2.831
Date:,"Sat, 04 Feb 2023",Prob (F-statistic):,6.14e-07
Time:,23:21:29,Log-Likelihood:,-780.76
No. Observations:,284,AIC:,1642.0
Df Residuals:,244,BIC:,1787.0
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,12.3799,4.859,2.548,0.011,2.809,21.951
school,0.9206,0.938,0.981,0.327,-0.927,2.769
sex,1.0108,0.608,1.663,0.098,-0.186,2.208
age,-0.1608,0.260,-0.618,0.537,-0.673,0.352
address,-0.8873,0.686,-1.294,0.197,-2.238,0.463
famsize,0.3597,0.620,0.580,0.562,-0.861,1.580
Pstatus,1.0103,0.916,1.103,0.271,-0.793,2.814
Medu,0.5017,0.397,1.265,0.207,-0.279,1.283
Fedu,-0.3060,0.333,-0.920,0.358,-0.961,0.349

0,1,2,3
Omnibus:,12.84,Durbin-Watson:,1.821
Prob(Omnibus):,0.002,Jarque-Bera (JB):,13.547
Skew:,-0.533,Prob(JB):,0.00114
Kurtosis:,3.085,Cond. No.,406.0


Al realizar un modelo de regresion lineal con todas las variables y G3 como vector objetivo se
obtuvo lo siguiente:

- El modelo presenta un $R^2$ = 0.312. Por lo tanto, las variables explican un 31.2% de la variacion
de ```G3```
- Solo el intercepto, ```failures```, ```goout```, ```Walc``` y ```health``` afectan significativamente a la variable ```G3```
- Obtuvo un Log-Likelihood de -780.76

Por lo tanto, al comparar este modelo con los previos se obtienen las siguientes conclusiones:

- El modelo para predecir G3 es el que obtuvo el peor valor de Log-Likelihood (-780.76)
- El modelo para predecir G3 obtuvo un poder explicativo ($R^2$) mejor que el modelo para predecir G2, pero peor que el modelo para predecir G1
- Solamente se utilizarán las variables ```failures```, ```goout```, ```Walc``` y ```health``` para el modelamiento predictivo