In [1]:
import warnings

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score

import factor_analyzer as fact

import funciones_auxiliares_hito_3 as aux3
from funciones_auxiliares_hito_2 import preprocesar_y_recodificar_enunciado_dos

# Warning suppression is currently disabled; uncomment the next line to silence
# library warnings in the rendered notebook.
# warnings.filterwarnings('ignore')
# Apply seaborn's default theme to every figure produced below.
sns.set_theme()
# Default figure size (width, height) in inches for all matplotlib plots.
plt.rcParams['figure.figsize'] = (12, 6)

In [2]:
# Load the pipe-delimited student data, discard the 'Unnamed: 0' column, and
# hand the frame to the project's preprocessing/recoding helper in one chain.
# NOTE(review): 'Unnamed: 0' is presumably a leftover index column from the
# CSV export -- confirm against the raw file.
df = (
    pd.read_csv('students.csv', delimiter='|')
    .drop(columns='Unnamed: 0')
    .pipe(preprocesar_y_recodificar_enunciado_dos)
)

# Análisis de factores latentes en las variables famrel:health

In [3]:
# Candidate variables for a latent factor: the contiguous block of columns
# from 'famrel' through 'health' (label slices with .loc include both ends).
posibles_factores_latentes = df.loc[:, 'famrel':'health']

# Bartlett's sphericity test; the result is printed as a
# (chi-square statistic, p-value) tuple.
resultado_bartlett = fact.calculate_bartlett_sphericity(posibles_factores_latentes)
print(resultado_bartlett)

(264.4978629665067, 6.03816710126852e-48)


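Como el p-valor del test de esfericidad de Bartlett (≈ 6.04e-48) es menor a 0.05, se rechaza la hipótesis nula de que la matriz de correlaciones es la identidad. Por lo tanto, existe al menos una correlación entre dos variables que es distinta de 0.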

In [4]:
# Kaiser-Meyer-Olkin sampling-adequacy measure for the candidate variables.
# The displayed result is a tuple: (per-variable KMO array, overall KMO value).
fact.calculate_kmo(posibles_factores_latentes)

(array([0.50173343, 0.57692329, 0.61683967, 0.58664172, 0.5554092 ,
        0.59265861]),
 0.5767240731164592)

Como el valor global del test de Kaiser-Meyer-Olkin (≈ 0.577) es menor a 0.7, los datos NO son adecuados para un análisis factorial; por lo tanto, no se asume la existencia de un factor latente
en nuestros datos.

# Modelo para G1

In [7]:
# Build the G1 model from every column except the other two grade columns.
# NOTE(review): G2 and G3 are presumably dropped because they are later grades
# that would leak information about G1 -- confirm with the assignment statement.
datos_para_g1 = df.drop(columns=['G2', 'G3'])

# NOTE(review): crear_modelo_lineal is a project helper; judging by the summary
# shown below it returns a fitted statsmodels OLS result -- verify in aux3.
modelo_g1 = aux3.crear_modelo_lineal(datos_para_g1, 'G1')
modelo_g1.summary()

0,1,2,3
Dep. Variable:,G1,R-squared:,0.331
Model:,OLS,Adj. R-squared:,0.224
Method:,Least Squares,F-statistic:,3.09
Date:,"Sat, 04 Feb 2023",Prob (F-statistic):,5.75e-08
Time:,23:08:48,Log-Likelihood:,-683.63
No. Observations:,284,AIC:,1447.0
Df Residuals:,244,BIC:,1593.0
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9.6531,3.452,2.797,0.006,2.855,16.452
school,-0.3886,0.666,-0.583,0.560,-1.701,0.924
sex,0.7631,0.432,1.768,0.078,-0.087,1.613
age,0.0772,0.185,0.418,0.677,-0.287,0.441
address,-0.0205,0.487,-0.042,0.966,-0.980,0.939
famsize,0.3125,0.440,0.710,0.478,-0.555,1.180
Pstatus,0.1872,0.650,0.288,0.774,-1.094,1.468
Medu,0.1809,0.282,0.642,0.521,-0.374,0.736
Fedu,0.1527,0.236,0.646,0.519,-0.313,0.618

0,1,2,3
Omnibus:,6.431,Durbin-Watson:,2.003
Prob(Omnibus):,0.04,Jarque-Bera (JB):,4.178
Skew:,0.128,Prob(JB):,0.124
Kurtosis:,2.464,Cond. No.,406.0


Al ajustar un modelo de regresión lineal con todas las variables (excepto ```G2``` y ```G3```) y ```G1``` como vector objetivo, se
obtuvo lo siguiente:

- El modelo presenta un $R^2 = 0.331$ (con $R^2$ ajustado $= 0.224$). Por lo tanto, las variables explican un 33.1% de la variación
de ```G1```
- Solo el intercepto, ```studytime```, ```failures```, ```schoolsup```, ```famsup```, ```goout``` y ```health``` afectan significativamente a la variable ```G1``` 