# 01.01 - PROYECTO KAGGLE

In [None]:
# Fetch the course helper script into the working directory as init.py
# (-q: quiet, --no-cache: ask for a fresh copy, -O: output file name).
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/ai4eng.v1/main/content/init.py
# Import the downloaded helper, run its setup with force_download=False, then call get_weblink().
# NOTE(review): presumably force_download=False reuses resources that are already present — confirm in init.py.
import init; init.init(force_download=False); init.get_weblink()

replicating local resources


## download data directly from Kaggle

- create a file `kaggle.json` with your authentication token (in Kaggle: click the user icon at the top right $\to$ Settings $\to$ API $\to$ Create New Token)
- upload it to this notebook workspace
- run the following cell

In [None]:
import os
# Point KAGGLE_CONFIG_DIR at the current directory, i.e. where kaggle.json was uploaded.
os.environ['KAGGLE_CONFIG_DIR'] = '.'
# Restrict the token file to owner read/write only.
!chmod 600 ./kaggle.json
# Download the data archive of the udea-ai4eng-20242 competition.
!kaggle competitions download -c udea-ai4eng-20242

Downloading udea-ai4eng-20242.zip to /content
  0% 0.00/20.1M [00:00<?, ?B/s] 75% 15.0M/20.1M [00:00<00:00, 155MB/s]
100% 20.1M/20.1M [00:00<00:00, 175MB/s]


## unzip and exploration

In [None]:
!unzip udea*.zip > /dev/null

In [None]:
# Line, word and byte counts for every CSV file in the working directory.
!wc *.csv

   296787    296787   4716673 submission_example.csv
   296787   4565553  50135751 test.csv
   692501  10666231 118025055 train.csv
  1286075  15528571 172877479 total


## load `train.csv`


In [None]:
# Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder,StandardScaler,MinMaxScaler
from sklearn import svm
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import time
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Load the labelled training set and the unlabelled test set from the extracted CSVs.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Encode the ordinal / yes-no survey answers as numbers so the numeric branch
# of the preprocessing can impute and scale them.

# Tuition bracket -> midpoint of the range, in millions
# (the open-ended top bracket is pinned at 7.5).
enrollment_mapping = {
    "No pagó matrícula": 0.0,
    "Menos de 500 mil": 0.25,
    "Entre 500 mil y menos de 1 millón": 0.75,
    "Entre 1 millón y menos de 2.5 millones": 1.75,
    "Entre 2.5 millones y menos de 4 millones": 3.25,
    "Entre 4 millones y menos de 5.5 millones": 4.75,
    "Entre 5.5 millones y menos de 7 millones": 6.25,
    "Más de 7 millones": 7.5,
}

# Weekly working hours -> approximate midpoint of each bracket.
workHours_mapping = {
    '0': 0,
    'Menos de 10 horas': 5,
    'Entre 11 y 20 horas': 15,
    'Entre 21 y 30 horas': 25,
    'Más de 30 horas': 35,
}

# Housing stratum label -> its number; 'Sin Estrato' is encoded as 0.
social_status_mapping = {
    'Sin Estrato': 0,
    'Estrato 1': 1,
    'Estrato 2': 2,
    'Estrato 3': 3,
    'Estrato 4': 4,
    'Estrato 5': 5,
    'Estrato 6': 6,
}

binary_map = {"Si": 1, "No": 0}

# Target classes encoded in increasing order of performance.
target = {'bajo': 0, 'medio-bajo': 1, 'medio-alto': 2, 'alto': 3}


def _encode_column(frame, column, mapping):
    """Replace ``frame[column]`` in place with its values looked up in ``mapping``.

    A column that is already numeric is left untouched, so re-running this
    cell does not turn a previously encoded column into all-NaN. Missing
    values stay NaN. Non-null labels that are absent from ``mapping`` also
    become NaN, and a warning lists them so that a typo in a mapping key
    does not go unnoticed.
    """
    import warnings

    if pd.api.types.is_numeric_dtype(frame[column]):
        return  # already encoded by an earlier run of this cell

    encoded = frame[column].map(mapping)
    unmapped = frame.loc[encoded.isna() & frame[column].notna(), column].unique()
    if len(unmapped):
        warnings.warn(
            f"{column}: labels without a mapping were set to NaN: {list(unmapped)}"
        )
    frame[column] = encoded


# Feature columns present in both frames and the mapping each one uses.
feature_mappings = {
    'ESTU_VALORMATRICULAUNIVERSIDAD': enrollment_mapping,
    'ESTU_HORASSEMANATRABAJA': workHours_mapping,
    'FAMI_ESTRATOVIVIENDA': social_status_mapping,
    'FAMI_TIENEINTERNET': binary_map,
    'ESTU_PAGOMATRICULAPROPIO': binary_map,
}

for frame in (train_data, test_data):
    for column, mapping in feature_mappings.items():
        _encode_column(frame, column, mapping)

# The label column only exists in the training frame.
_encode_column(train_data, 'RENDIMIENTO_GLOBAL', target)

In [None]:
# Separate the label from the features; the ID column is dropped from both feature frames.
non_feature_cols = ['ID', 'RENDIMIENTO_GLOBAL']
y_train = train_data['RENDIMIENTO_GLOBAL']
X_train = train_data.drop(non_feature_cols, axis=1)
X_test = test_data.drop('ID', axis=1)

In [None]:
# Group the feature columns by dtype: object columns are treated as
# categorical, float64/int64 columns as numeric.
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories unseen during fit are ignored at transform time.
one_hot = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numerical_cols),
        ('cat', one_hot, categorical_cols),
    ]
)


In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Swap the model here to try a different classifier.
xgb_params = dict(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=24)
model = XGBClassifier(**xgb_params)

# Preprocessing and classifier are chained so each CV fold fits the
# preprocessing on its own training split only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

# 5-fold stratified, shuffled cross-validation scored on accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
cv_score = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=cv)

print(f"Cross-validation accuracy: {cv_score.mean():.4f}")

# Refit on the full training set, then predict the test set.
predictions = pipeline.fit(X_train, y_train).predict(X_test)


Cross-validation accuracy: 0.4209


In [None]:
# Integer class code -> label string expected in the submission file.
target_inverse_mapping = dict(enumerate(['bajo', 'medio-bajo', 'medio-alto', 'alto']))
# Decode every predicted code; an unexpected code raises KeyError.
predictions_labels = list(map(target_inverse_mapping.__getitem__, predictions))

In [None]:
# Two-column submission table: test IDs next to their predicted labels.
submission_columns = {
    "ID": test_data['ID'],
    "RENDIMIENTO_GLOBAL": predictions_labels,
}
submission_df = pd.DataFrame(submission_columns)
# Write it without the DataFrame index column.
submission_df.to_csv("my_submission.csv", index=False)

In [None]:
# Submit my_submission.csv to the udea-ai4eng-20242 competition with the message "third try".
!kaggle competitions submit -c udea-ai4eng-20242 -f my_submission.csv -m "third try"

  0% 0.00/4.05M [00:00<?, ?B/s]100% 4.05M/4.05M [00:00<00:00, 16.8MB/s]
Successfully submitted to UDEA/ai4eng 20242 - Pruebas Saber Pro Colombia