<a href="https://colab.research.google.com/github/LCaravaggio/AnalisisPredictivo/blob/master/Kaggle/2025Q1/TabNet_para_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np


from google.colab import userdata
import json

!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

api_token = {
    'username': userdata.get('KAGGLE_USER'),
    'key': userdata.get('KAGGLE_KEY')}
with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c analisis-predictivo-2025-q-1


import zipfile
import os

os.listdir()

for file in os.listdir():
    if file.endswith('.zip'):
      zip_ref = zipfile.ZipFile(file, 'r')
      zip_ref.extractall()
      zip_ref.close()

Downloading analisis-predictivo-2025-q-1.zip to /content
  0% 0.00/13.6M [00:00<?, ?B/s]
100% 13.6M/13.6M [00:00<00:00, 451MB/s]


In [2]:
# Load the training data and list the columns whose dtype is NOT numeric.
df = pd.read_csv('train.csv')
non_numeric_cols = df.select_dtypes(exclude=['number']).columns

# Header and column index are printed on separate lines.
print("Columnas no numéricas:", non_numeric_cols, sep="\n")

Columnas no numéricas:
Index(['well_name', 'location', 'technology_level'], dtype='object')


In [3]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# Load the training data afresh so this cell always starts from the raw CSV.
df = pd.read_csv('train.csv')

# Categorical columns that get an ordinal (integer-coded) representation.
cat_cols = ['location', 'technology_level']

# Fit the encoder on the training categories, then replace the columns with
# their codes. Categories unseen at fit time are mapped to -1 on transform.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(df[cat_cols])
df[cat_cols] = encoder.transform(df[cat_cols])

# Keep numeric columns only; any remaining non-numeric column is dropped here.
df = df.select_dtypes(include=['number'])

# Target (y) and features (X: every remaining column except the target).
y = df['production_rate']
X = df.drop(columns=['production_rate'])

# 80% train / 20% held-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 

In [8]:
from pytorch_tabnet.tab_model import TabNetRegressor
import torch

# TabNet hyperparameters, gathered in one place so they are easy to tune.
tabnet_params = dict(
    n_d=24,
    n_a=24,
    n_steps=5,
    gamma=1.5,
    lambda_sparse=1e-5,
    optimizer_params={"lr": 0.01},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 10, "gamma": 0.95},
    mask_type='sparsemax',
    seed=42,
)
reg = TabNetRegressor(**tabnet_params)

# fit() is given plain NumPy arrays; targets are reshaped to a single column.
train_inputs = X_train.values
train_targets = y_train.values.reshape(-1, 1)
holdout_inputs = X_test.values
holdout_targets = y_test.values.reshape(-1, 1)

# NOTE(review): the held-out split is also the eval_set monitored during
# training (with patience=20), so scores computed on X_test afterwards are
# likely optimistic -- consider a separate validation split.
reg.fit(
    train_inputs, train_targets,
    eval_set=[(holdout_inputs, holdout_targets)],
    max_epochs=200,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)



epoch 0  | loss: 546034852702.0505| val_0_mse: 547979680700.9423|  0:00:04s
epoch 1  | loss: 545855236230.8529| val_0_mse: 547749766467.3774|  0:00:08s
epoch 2  | loss: 545389651850.7914| val_0_mse: 547097213897.47217|  0:00:17s
epoch 3  | loss: 544611587023.47815| val_0_mse: 545985505758.61816|  0:00:24s
epoch 4  | loss: 543432911174.4197| val_0_mse: 544500031914.78125|  0:00:28s
epoch 5  | loss: 541946216770.6388| val_0_mse: 542903187628.226|  0:00:31s
epoch 6  | loss: 540177153896.76306| val_0_mse: 540928467736.7252|  0:00:35s
epoch 7  | loss: 538146548320.72864| val_0_mse: 538524954691.95776|  0:00:39s
epoch 8  | loss: 535973810788.5095| val_0_mse: 536698600035.6448|  0:00:42s
epoch 9  | loss: 533670438210.00867| val_0_mse: 534535577302.22943|  0:00:46s
epoch 10 | loss: 531138487779.328| val_0_mse: 532429261000.2684|  0:00:50s
epoch 11 | loss: 528543473548.0517| val_0_mse: 529961351073.91925|  0:00:53s
epoch 12 | loss: 525838632394.1218| val_0_mse: 526608622251.2372|  0:00:57s
epoc



In [13]:
from sklearn.metrics import r2_score
# Score the fitted model on the held-out split (coefficient of determination).
holdout_preds = reg.predict(X_test.values)
print("TabNet R²:", r2_score(y_test, holdout_preds))

TabNet R²: 0.7805793428622727


In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Load the validation set to predict on.
val = pd.read_csv('validation.csv')

# Categorical columns to encode (the same ones encoded for training).
cat_cols = ['location', 'technology_level']

# Reuse the encoder fitted on train.csv; it is configured to map categories
# it did not see during fitting to -1.
val[cat_cols] = encoder.transform(val[cat_cols])

# Keep numeric columns only, then align them with the training features:
# same columns, same order. A missing column raises a KeyError here instead of
# surfacing later as a shape mismatch (or silently mis-ordered inputs) in predict.
val_numeric = val.select_dtypes(include=['number'])[list(X_train.columns)]

# Predict; flatten so a 2-D (n, 1) prediction array becomes 1-D.
preds = reg.predict(val_numeric.values).flatten()

# Build the submission frame. The upper bound of the id range is derived from
# the number of predictions rather than hard-coded (65001..80000 for 15,000 rows).
# NOTE(review): assumes ids run consecutively from 65001 in file order --
# confirm against the competition's sample submission.
FIRST_VALIDATION_ID = 65001
submission = pd.DataFrame({
    'id': np.arange(FIRST_VALIDATION_ID, FIRST_VALIDATION_ID + len(preds)),
    'production_rate': preds
})

In [17]:
# index=False keeps the DataFrame's row index out of the file, so the CSV
# contains exactly the 'id' and 'production_rate' columns.
submission.to_csv('submission.csv', index=False)