<a href="https://colab.research.google.com/github/JhoanVillaNO/proyectoKaggle/blob/main/04%20-%20modelo_con_RFC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instalación de dependencias

In [2]:
%pip install pandas numpy scikit-learn matplotlib



## Carga de librerías y archivos

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

In [4]:
import os
# Point the Kaggle CLI at the current directory for its configuration;
# the chmod below shows the credentials file is expected at ./kaggle.json.
os.environ['KAGGLE_CONFIG_DIR'] = '.'
# Restrict the credentials file to owner read/write
# (presumably to avoid the CLI's permissions warning — TODO confirm).
!chmod 600 ./kaggle.json
# Download the competition archive (a .zip file) into the working directory.
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.40GB/s]


In [5]:
!unzip udea*.zip > /dev/null

In [6]:
import pandas as pd
# Load the training set (presumably extracted from the downloaded archive).
train = pd.read_csv("train.csv")
# Print per-column dtypes and non-null counts, then show (rows, columns) as the cell result.
train.info()
train.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692500 entries, 0 to 692499
Data columns (total 21 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   ID                           692500 non-null  int64  
 1   PERIODO_ACADEMICO            692500 non-null  int64  
 2   E_PRGM_ACADEMICO             692500 non-null  object 
 3   E_PRGM_DEPARTAMENTO          692500 non-null  object 
 4   E_VALORMATRICULAUNIVERSIDAD  686213 non-null  object 
 5   E_HORASSEMANATRABAJA         661643 non-null  object 
 6   F_ESTRATOVIVIENDA            660363 non-null  object 
 7   F_TIENEINTERNET              665871 non-null  object 
 8   F_EDUCACIONPADRE             669322 non-null  object 
 9   F_TIENELAVADORA              652727 non-null  object 
 10  F_TIENEAUTOMOVIL             648877 non-null  object 
 11  E_PRIVADO_LIBERTAD           692500 non-null  object 
 12  E_PAGOMATRICULAPROPIO        686002 non-null  object 
 13 

(692500, 21)

## Selección de variables y división de datos

In [7]:
# Columns treated as categorical; the preprocessing step one-hot encodes them.
categorical_features = [
    "E_VALORMATRICULAUNIVERSIDAD",
    "E_HORASSEMANATRABAJA",
    "F_ESTRATOVIVIENDA",
    "F_EDUCACIONPADRE",
    "F_EDUCACIONMADRE",
    "E_PRGM_DEPARTAMENTO",
    "F_TIENEINTERNET.1",
    "E_PAGOMATRICULAPROPIO",
    "E_PRGM_ACADEMICO"
]

# Work on a copy so the raw `train` frame is left untouched.
temp = train.copy()

# Cast every listed categorical column that actually exists to str, then fill
# whatever is still missing with the 'missing' label.
# NOTE(review): on pandas versions where astype(str) renders NaN as the literal
# string 'nan', the fillna that follows has nothing left to fill, so missing
# values end up in a 'nan' category instead of 'missing'. Swapping the order
# (fillna first) would fix that, but the submission cell prepares test.csv with
# the same expression and both must change together, or the encoder will see
# different labels at fit and predict time.
present_categoricals = [name for name in categorical_features if name in temp.columns]
for name in present_categoricals:
    as_text = temp[name].astype(str)
    temp[name] = as_text.fillna('missing')

# Encode the target labels as integers; `le` is kept to map predictions back to labels.
le = LabelEncoder()
y = le.fit_transform(temp["RENDIMIENTO_GLOBAL"])

# Features: everything except the target and the two identifier/period columns.
X = temp.drop(columns=["RENDIMIENTO_GLOBAL", "ID", "PERIODO_ACADEMICO"])

# Stratified two-stage split. First hold out 20% of the rows; then split that
# holdout so 12.5% of it (2.5% of all rows) becomes the test set and the
# remaining 87.5% (17.5% of all rows) becomes the validation set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.125,
    random_state=42,
    stratify=y_temp,
)

## Preprocesamiento para RandomForestClassifier

In [8]:
# Numeric inputs: every numeric-dtype column of X_train that is not listed as categorical.
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
numeric_features = [name for name in numeric_columns if name not in categorical_features]

# Numeric branch: a one-step pipeline that standardises the numeric columns.
numeric_branch = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: sparse one-hot encoding; categories not seen during fit
# are ignored at transform time instead of raising.
categorical_branch = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

# Columns in neither list are not routed to any transformer, so they are dropped
# under ColumnTransformer's default `remainder` setting.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_branch, numeric_features),
    ("cat", categorical_branch, categorical_features),
])

## Pipeline y entrenamiento con RandomForestClassifier

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings; n_jobs=-1 uses every available core on the machine
# and verbose=1 prints joblib progress while fitting and predicting.
forest = RandomForestClassifier(
    n_estimators=2000,
    max_depth=8,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Full model: preprocessing followed by the classifier.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", forest),
])

pipeline.fit(X_train, y_train)


def report_split(split_name, features, targets):
    """Predict on one split, print its accuracy and classification report.

    Args:
        split_name: Label inserted after "Accuracy " in the printed line.
        features: Feature frame passed to ``pipeline.predict``.
        targets: Integer-encoded true labels for ``features``.

    Returns:
        Tuple ``(predictions, accuracy)`` for the split.
    """
    predicted = pipeline.predict(features)
    accuracy = accuracy_score(targets, predicted)
    print(f"Accuracy {split_name}: {accuracy:.4f}")
    # target_names maps the integer classes back to the original label strings.
    print(classification_report(targets, predicted, target_names=le.classes_))
    return predicted, accuracy


# Same module-level names as before, so any later cell that reads them keeps working.
y_val_pred, val_accuracy = report_split("validación", X_val, y_val)
y_test_pred, test_accuracy = report_split("test", X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   53.1s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  8.4min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    2.7s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    5.2s
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:    9.2s
[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed:   12.5s
[Parallel(n_jobs=2)]: Do

Accuracy validación: 0.3864
              precision    recall  f1-score   support

        alto       0.45      0.64      0.53     30733
        bajo       0.38      0.60      0.47     30272
  medio-alto       0.30      0.14      0.19     30034
  medio-bajo       0.30      0.17      0.22     30148

    accuracy                           0.39    121187
   macro avg       0.36      0.38      0.35    121187
weighted avg       0.36      0.39      0.35    121187



[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed:    1.9s


Accuracy test: 0.3872
              precision    recall  f1-score   support

        alto       0.45      0.64      0.53      4391
        bajo       0.38      0.61      0.47      4325
  medio-alto       0.29      0.13      0.18      4290
  medio-bajo       0.31      0.17      0.22      4307

    accuracy                           0.39     17313
   macro avg       0.36      0.39      0.35     17313
weighted avg       0.36      0.39      0.35     17313



[Parallel(n_jobs=2)]: Done 2000 out of 2000 | elapsed:    2.2s finished


## Kaggle Submission

In [10]:
# Load the competition's unlabeled test set; predictions on it are written to the submission file.
test_data = pd.read_csv("test.csv")

In [11]:
# Give the categorical columns the same str-cast / fill treatment that was
# applied to the training frame (this overwrites the columns of test_data).
# NOTE(review): on pandas versions where astype(str) renders NaN as the literal
# string 'nan', the trailing fillna is a no-op. It is kept as-is because the
# training cell uses the identical expression and the encoder must see the same
# labels at fit and predict time; change both together or not at all.
kaggle_categoricals = [name for name in categorical_features if name in test_data.columns]
for name in kaggle_categoricals:
    as_text = test_data[name].astype(str)
    test_data[name] = as_text.fillna('missing')

# Drop the identifier/period columns if present; the pipeline selects the rest by name.
X_test_kaggle = test_data.drop(columns=["ID", "PERIODO_ACADEMICO"], errors='ignore')

# Predict encoded classes, then map them back to the original label strings.
predictions = pipeline.predict(X_test_kaggle)
predictions_labels = le.inverse_transform(predictions)

# Two-column submission frame: the row ID and its predicted label.
submission_df = pd.DataFrame({
    "ID": test_data["ID"],
    "RENDIMIENTO_GLOBAL": predictions_labels,
})
submission_df.to_csv("submission_rf.csv", index=False)

print("Archivo de submission generado: submission_rf.csv")
print(submission_df.head())

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    2.9s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    7.7s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:   13.2s
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:   20.5s
[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed:   29.3s
[Parallel(n_jobs=2)]: Done 2000 out of 2000 | elapsed:   32.3s finished


Archivo de submission generado: submission_rf.csv
       ID RENDIMIENTO_GLOBAL
0  550236         medio-alto
1   98545         medio-alto
2  499179               alto
3  782980               bajo
4  785185         medio-bajo
