In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [12]:
# Cargar los datos
cleaned_data_path = 'datasets/cleaned_data.csv'
data_all = pd.read_csv(cleaned_data_path)

# Convertir la columna a números, los valores inválidos se convierten en NaN
data_all['EDAD_ALU'] = pd.to_numeric(data_all['EDAD_ALU'], errors='coerce')

# Eliminar las filas con NaN en la columna 'EDAD_ALU'
data_all = data_all.dropna(subset=['EDAD_ALU'])

# Borrar las filas donde la columna 'COD_SEC' sea distinto de 0
data_all = data_all[data_all['COD_SEC'] == 0]  # Solo alumnos de basica y media

# Eliminar la columna 'COD_ENSE'
data_all = data_all.drop(columns=['COD_ENSE'])

# Observar correlacion para hacer mas limpieza
numeric_columns = data_all.select_dtypes(include=np.number).columns
correlacion_all = data_all[numeric_columns].corr()

# Ver correlacion en base a valor a evaluar
correlacion_prom_gral = correlacion_all["PROM_GRAL"].sort_values(
    ascending=False)

# Obtén los nombres de las columnas con correlación mayor a 0.1
columnas_mayor_0_1 = correlacion_prom_gral[correlacion_prom_gral > 0.057].index

# Obtén los nombres de las columnas con correlación menor a -0.1
columnas_menor_neg_0_1 = correlacion_prom_gral[correlacion_prom_gral < -0.05].index

columnas_seleccionadas = columnas_mayor_0_1.tolist() + \
    columnas_menor_neg_0_1.tolist()

# Agregar comuna del colegio
columnas_seleccionadas.append('COD_COM_RBD')
# columnas_seleccionadas.append('DGV_RBD')

# Hacer data = data_all con las columnas a usar EDAD_ALU, GEN_ALU, PROM_GRAL
data_clear = data_all[columnas_seleccionadas]
data = data_clear.drop('FEC_NAC_ALU', axis=1)
data = data.drop('COD_GRADO', axis=1)

  data_all = pd.read_csv(cleaned_data_path)


In [21]:
data.head()

Unnamed: 0,PROM_GRAL,ASISTENCIA,GEN_ALU,COD_DEPE,COD_JOR,COD_ENSE2,EDAD_ALU,COD_COM_RBD
7,4.5,85,1,6,1,7,14.0,15101
8,5.1,89,1,6,1,7,15.0,15101
9,5.3,89,1,6,1,7,16.0,15101
10,5.9,90,2,6,1,7,16.0,15101
11,5.7,90,1,6,1,7,14.0,15101


In [22]:
# one hot encoding cod_ense2
data = pd.get_dummies(data, columns=['COD_ENSE2'])
data = data.drop('COD_ENSE2', axis=1)

# one hot encoding cod_depe
data = pd.get_dummies(data, columns=['COD_DEPE'])
data = data.drop('COD_DEPE', axis=1)

# one hot encoding cod_jor
data = pd.get_dummies(data, columns=['COD_JOR'])
data = data.drop('COD_JOR', axis=1)

data

Unnamed: 0,PROM_GRAL,ASISTENCIA,GEN_ALU,COD_DEPE,COD_JOR,EDAD_ALU,COD_COM_RBD,COD_ENSE2_2,COD_ENSE2_3,COD_ENSE2_5,COD_ENSE2_6,COD_ENSE2_7
7,4.5,85,1,6,1,14.0,15101,False,False,False,False,True
8,5.1,89,1,6,1,15.0,15101,False,False,False,False,True
9,5.3,89,1,6,1,16.0,15101,False,False,False,False,True
10,5.9,90,2,6,1,16.0,15101,False,False,False,False,True
11,5.7,90,1,6,1,14.0,15101,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
3092858,6.0,85,1,3,1,16.0,6310,False,False,True,False,False
3092859,6.6,90,1,3,1,21.0,6310,False,False,True,False,False
3092860,6.9,90,2,3,1,17.0,6310,False,False,True,False,False
3092861,6.4,90,2,3,1,18.0,6310,False,False,True,False,False


In [23]:

# División de datos
X = data.drop(columns=['PROM_GRAL'])
y = data['PROM_GRAL']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [24]:
# Escalar los datos
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
data.head()

Unnamed: 0,PROM_GRAL,ASISTENCIA,GEN_ALU,COD_DEPE,COD_JOR,EDAD_ALU,COD_COM_RBD,COD_ENSE2_2,COD_ENSE2_3,COD_ENSE2_5,COD_ENSE2_6,COD_ENSE2_7
7,4.5,85,1,6,1,14.0,15101,False,False,False,False,True
8,5.1,89,1,6,1,15.0,15101,False,False,False,False,True
9,5.3,89,1,6,1,16.0,15101,False,False,False,False,True
10,5.9,90,2,6,1,16.0,15101,False,False,False,False,True
11,5.7,90,1,6,1,14.0,15101,False,False,False,False,True


In [26]:
# probar modelo guardado
# libreria para cargar modelo
from keras.models import load_model
# cargar modelo que esta en una carpeta con assets, variables y modelo
model = load_model('resultados/modelo.h5')


In [27]:
#importar prediccion

# predicciones
y_pred = model.predict(X_test)

ValueError: in user code:

    File "c:\Python311\Lib\site-packages\keras\engine\training.py", line 2169, in predict_function  *
        return step_function(self, iterator)
    File "c:\Python311\Lib\site-packages\keras\engine\training.py", line 2155, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Python311\Lib\site-packages\keras\engine\training.py", line 2143, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Python311\Lib\site-packages\keras\engine\training.py", line 2111, in predict_step
        return self(x, training=False)
    File "c:\Python311\Lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Python311\Lib\site-packages\keras\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_7" is incompatible with the layer: expected shape=(None, 9), found shape=(None, 11)


In [None]:
# aproximar valores a 1 decimal
y_pred = np.around(y_pred, decimals=1)

In [None]:
y_pred.shape

(585808, 1)

In [None]:
# mostrar resultados
print("Predicciones:")
print(y_pred[:10])
print("Valores reales:")
print(y_test[:10])

Predicciones:
[[6.]
 [6.]
 [7.]
 [6.]
 [6.]
 [7.]
 [6.]
 [7.]
 [6.]
 [6.]]
Valores reales:
2937053    7.0
510731     7.0
2554909    6.0
1974179    6.0
268631     6.0
1821374    7.0
2834077    6.0
3038612    7.0
2192887    5.0
2207270    5.0
Name: PROM_GRAL, dtype: float64


In [None]:
# mse, mae, rmse, r2
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE: ", mse)
print("MAE: ", mae)
print("RMSE: ", rmse)
print("R2: ", r2)

MSE:  0.3134896749494354
MAE:  0.4156413110730229
RMSE:  0.5599014868255267
R2:  0.3363914933873664


In [None]:
from sklearn.metrics import r2_score

# Valores reales de y
y_test = [3, -0.5, 2, 7]

# Valores de y predichos por el modelo
y_pred = [2.5, 3.0, 2, 8]

# Calcular el valor de R2
r2 = r2_score(y_test, y_pred)

print("R2: ", r2)

R2:  0.537473233404711
