In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
# Load the cleaned dataset.
cleaned_data_path = 'datasets/cleaned_data.csv'
data_all = pd.read_csv(cleaned_data_path)

# Coerce the age column to numeric; unparseable values become NaN ...
data_all['EDAD_ALU'] = pd.to_numeric(data_all['EDAD_ALU'], errors='coerce')

# ... and rows without a valid age are discarded.
data_all = data_all.dropna(subset=['EDAD_ALU'])

# Keep only rows whose 'COD_SEC' equals 0 (original note: primary and
# secondary school students only).
data_all = data_all[data_all['COD_SEC'] == 0]

# Remove the 'COD_ENSE' column.
data_all = data_all.drop(columns=['COD_ENSE'])

# Correlation matrix over the numeric columns, used to pick features.
numeric_columns = data_all.select_dtypes(include=np.number).columns
correlacion_all = data_all[numeric_columns].corr()

# Correlation of every numeric column with the target, strongest first.
correlacion_prom_gral = correlacion_all["PROM_GRAL"].sort_values(
    ascending=False)

# Selection thresholds. The variable names below still say "0_1" for
# backward compatibility, but the cut-offs actually applied are these.
CORR_THRESHOLD_POS = 0.057
CORR_THRESHOLD_NEG = -0.05

# Columns whose correlation with PROM_GRAL is above the positive threshold
# (this includes PROM_GRAL itself, whose self-correlation is 1).
columnas_mayor_0_1 = correlacion_prom_gral[
    correlacion_prom_gral > CORR_THRESHOLD_POS].index

# Columns whose correlation with PROM_GRAL is below the negative threshold.
columnas_menor_neg_0_1 = correlacion_prom_gral[
    correlacion_prom_gral < CORR_THRESHOLD_NEG].index

columnas_seleccionadas = columnas_mayor_0_1.tolist() + \
    columnas_menor_neg_0_1.tolist()

# Always include the school's commune code, but only append it when the
# correlation filter did not already select it (avoids a duplicated column).
if 'COD_COM_RBD' not in columnas_seleccionadas:
    columnas_seleccionadas.append('COD_COM_RBD')
# columnas_seleccionadas.append('DGV_RBD')

# Build the modelling frame from the selected columns.
data_clear = data_all[columnas_seleccionadas]

# Exclude the birth-date and grade columns from the modelling frame. They are
# only present when they passed the correlation filter, so a missing column
# must not raise.
data = data_clear.drop(columns=['FEC_NAC_ALU', 'COD_GRADO'], errors='ignore')

  data_all = pd.read_csv(cleaned_data_path)


In [55]:

# Separate the target (PROM_GRAL) from the predictors, then hold out 20% of
# the rows for evaluation with a fixed seed.
y = data['PROM_GRAL']
X = data.drop('PROM_GRAL', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

In [26]:
# Standardize the features: fit the scaler on the training split only, then
# apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# Preview the first five rows of the modelling frame.
data.head(n=5)

Unnamed: 0,PROM_GRAL,ASISTENCIA,FEC_NAC_ALU,GEN_ALU,COD_DEPE,COD_GRADO,COD_JOR,COD_ENSE2,EDAD_ALU,COD_COM_RBD
7,4.5,85,200804,1,6,1,1,7,14.0,15101
8,5.1,89,200704,1,6,1,1,7,15.0,15101
9,5.3,89,200510,1,6,1,1,7,16.0,15101
10,5.9,90,200604,2,6,1,1,7,16.0,15101
11,5.7,90,200801,1,6,1,1,7,14.0,15101


In [33]:
# Try out the previously saved model.
# Keras helper used to load a saved model.
from keras.models import load_model
# Load the saved model from 'resultados/modelo.h5'.
# NOTE(review): the original comment described a folder containing assets,
# variables and the model, but the path points to a single .h5 file; confirm
# which format is actually on disk.
model = load_model('resultados/modelo.h5')


In [54]:
# Predictions: run the loaded model on the test features.
y_pred = model.predict(X_test)



In [56]:
# Round the predictions to one decimal place.
y_pred = np.round(y_pred, 1)

In [39]:
# Display the dimensions of the prediction array.
np.shape(y_pred)

(585808, 1)

In [52]:
# Show the first ten predictions next to the first ten true values.
print("Predicciones:", y_pred[:10], sep="\n")
print("Valores reales:", y_test[:10], sep="\n")

Predicciones:
[[6.]
 [6.]
 [7.]
 [6.]
 [6.]
 [7.]
 [6.]
 [7.]
 [6.]
 [6.]]
Valores reales:
2937053    7.0
510731     7.0
2554909    6.0
1974179    6.0
268631     6.0
1821374    7.0
2834077    6.0
3038612    7.0
2192887    5.0
2207270    5.0
Name: PROM_GRAL, dtype: float64


In [57]:
# Regression metrics on the test split: MSE, MAE, RMSE and R2.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(y_test, y_pred)

# Print each metric on its own line, in the order MSE, MAE, RMSE, R2.
for etiqueta, valor in (("MSE: ", mse), ("MAE: ", mae),
                        ("RMSE: ", rmse), ("R2: ", r2)):
    print(etiqueta, valor)

MSE:  0.3134896749494354
MAE:  0.4156413110730229
RMSE:  0.5599014868255267
R2:  0.3363914933873664


In [60]:
from sklearn.metrics import r2_score

# Standalone sanity check of r2_score on a small hand-written example.
# Demo-specific names are used so that the real evaluation variables
# (y_test, y_pred, r2) are not overwritten, which would break re-running the
# earlier evaluation cells.

# True target values for the toy example.
y_true_demo = [3, -0.5, 2, 7]

# Values a hypothetical model predicted for the toy example.
y_pred_demo = [2.5, 3.0, 2, 8]

# Compute R2 for the toy example.
r2_demo = r2_score(y_true_demo, y_pred_demo)

print("R2: ", r2_demo)

R2:  0.537473233404711
