# Age 


# Import Libraries, Dependencies and Dataset


In [7]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import kagglehub
import os

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error, mean_squared_log_error, explained_variance_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.stats import norm

from pytorch_tabnet.tab_model import TabNetRegressor
import torch

In [8]:
# Dataset download
path = kagglehub.dataset_download("imoore/age-dataset")

print("Path to dataset files:", path)
dataset_path = "/home/codespace/.cache/kagglehub/datasets/imoore/age-dataset/versions/1"
print("Files in dataset directory:", os.listdir(dataset_path))

file_path = os.path.join(dataset_path, "AgeDataset-V1.csv")  #
df = pd.read_csv(file_path)

print(df.head())
print(df.info())
print(df.describe())

Path to dataset files: /home/codespace/.cache/kagglehub/datasets/imoore/age-dataset/versions/1
Files in dataset directory: ['AgeDataset-V1.csv']
     Id                     Name  \
0   Q23        George Washington   
1   Q42            Douglas Adams   
2   Q91          Abraham Lincoln   
3  Q254  Wolfgang Amadeus Mozart   
4  Q255     Ludwig van Beethoven   

                                 Short description Gender  \
0   1st president of the United States (1732–1799)   Male   
1                      English writer and humorist   Male   
2  16th president of the United States (1809-1865)   Male   
3        Austrian composer of the Classical period   Male   
4           German classical and romantic composer   Male   

                                             Country  Occupation  Birth year  \
0  United States of America; Kingdom of Great Bri...  Politician        1732   
1                                     United Kingdom      Artist        1952   
2                           Uni

# Preprocessing

In [13]:
# Initial Data Info
print(f"Initial dataset shape: {df.shape}")

# Step 1: Handle missing data
df['Gender'] = df['Gender'].fillna('Unknown')
df['Country'] = df['Country'].fillna('Unknown')
df['Occupation'] = df['Occupation'].fillna('Unknown')
df = df.dropna(subset=['Death year', 'Age of death'])
print(f"After dropping rows with missing targets: {df.shape}")

# Step 2: Drop 'Manner of death'
df = df.drop(columns=['Manner of death'], errors='ignore')
print(f"Dataset shape after dropping columns: {df.shape}")

# Step 3: Encode categorical variables (preserve as categorical for TabNet)
print("Encoding categorical variables...")
label_encoder = LabelEncoder()
df['Country'] = label_encoder.fit_transform(df['Country'])  # Keep as categorical index
df['Occupation'] = label_encoder.fit_transform(df['Occupation'])  # Keep as categorical index

# Step 4: Define features and target
target = 'Age of death'
X = df.drop(columns=[target, 'Id', 'Name'])  # Drop ID/name early
y = df[target]

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Step 6: Process text features using TF-IDF (full dataset)
print("Processing text features...")
tfidf = TfidfVectorizer(max_features=50)  # Reduced features for efficiency
X_train_text = tfidf.fit_transform(X_train['Short description'].fillna(''))
X_test_text = tfidf.transform(X_test['Short description'].fillna(''))

# Convert to DataFrames and merge
text_features = [f"tfidf_{i}" for i in range(tfidf.max_features)]
X_train = pd.concat([
    X_train.reset_index(drop=True).drop(columns=['Short description']),
    pd.DataFrame(X_train_text.toarray(), columns=text_features)
], axis=1)

X_test = pd.concat([
    X_test.reset_index(drop=True).drop(columns=['Short description']),
    pd.DataFrame(X_test_text.toarray(), columns=text_features)
], axis=1)

# Step 7: One-hot encode gender
X_train = pd.get_dummies(X_train, columns=['Gender'], prefix='gender')
X_test = pd.get_dummies(X_test, columns=['Gender'], prefix='gender')

# Align columns between train/test
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Step 8: Scale numerical features
print("Scaling numerical features...")
numerical_cols = ['Birth year', 'Death year']
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Step 9: Prepare categorical indices for TabNet (BEFORE converting to numpy)
categorical_columns = ['Country', 'Occupation']
categorical_indices = [X_train.columns.get_loc(col) for col in categorical_columns]
categorical_dims = {
    X_train.columns.get_loc(col): len(df[col].unique())
    for col in categorical_columns
}

# Step 10: Convert to numpy arrays (AFTER getting categorical indices)
X_train = X_train.astype(np.float32).values
X_test = X_test.astype(np.float32).values
y_train = y_train.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

print("\nFinal Preprocessed Shapes:")
print(f"Train features: {X_train.shape}, Test features: {X_test.shape}")
print(f"Categorical indices: {categorical_indices}")
print(f"Categorical dimensions: {categorical_dims}")


# }

Initial dataset shape: (1223008, 9)
After dropping rows with missing targets: (1223008, 9)
Dataset shape after dropping columns: (1223008, 9)
Encoding categorical variables...
Train shape: (978406, 6), Test shape: (244602, 6)
Processing text features...
Scaling numerical features...

Final Preprocessed Shapes:
Train features: (978406, 74), Test features: (244602, 74)
Categorical indices: [0, 1]
Categorical dimensions: {0: 5962, 1: 9314}


# TabNet Modeling


In [14]:
from pytorch_tabnet.tab_model import TabNetRegressor
import numpy as np
import torch

# Convert data to numpy arrays
y_train = y_train.astype(np.float32).reshape(-1, 1)
y_test = y_test.astype(np.float32).reshape(-1, 1)

# Configure TabNet with categorical information
tabnet_params = {
    "cat_idxs": categorical_indices,
    "cat_dims": [categorical_dims[idx] for idx in categorical_indices],
    "cat_emb_dim": 2,  # Embedding dimension for categorical features
    "n_d": 64,  # Dimension of prediction layer
    "n_a": 64,  # Dimension of attention layer
    "n_steps": 5,  # Number of sequential attention steps
    "gamma": 1.3,  # Coefficient for feature reusage
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": dict(lr=1e-3, weight_decay=1e-5),
    "scheduler_params": {"step_size": 50, "gamma": 0.9},
    "scheduler_fn": torch.optim.lr_scheduler.StepLR,
    "mask_type": "entmax",  # Sparsemax or entmax for feature selection
    "verbose": 1,
}

# Initialize model
tabnet_model = TabNetRegressor(**tabnet_params)

# Train with early stopping
tabnet_model.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=["train", "valid"],
    eval_metric=["rmse"],
    max_epochs=200,
    patience=30,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=4,
    drop_last=False,
)

# Plot training metrics
plt.figure(figsize=(10, 6))
plt.plot(tabnet_model.history["loss"], label="Training Loss")
plt.plot(tabnet_model.history["valid_rmse"], label="Validation RMSE")
plt.title("Training Metrics")
plt.xlabel("Epochs")
plt.ylabel("RMSE/Loss")
plt.legend()
plt.grid()
plt.show()

# Evaluate
y_pred = tabnet_model.predict(X_test).flatten()
print(f"Final RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")



epoch 0  | loss: 823.358 | train_rmse: 5.47972 | valid_rmse: 7.5928  |  0:03:32s




epoch 1  | loss: 8.02734 | train_rmse: 6.48606 | valid_rmse: 7.48127 |  0:07:03s




epoch 2  | loss: 6.23748 | train_rmse: 2.29409 | valid_rmse: 2.46952 |  0:10:34s




epoch 3  | loss: 5.33666 | train_rmse: 1.3974  | valid_rmse: 1.30842 |  0:14:06s




epoch 4  | loss: 5.25304 | train_rmse: 1.84584 | valid_rmse: 2.89405 |  0:17:38s




epoch 5  | loss: 4.61019 | train_rmse: 1.59584 | valid_rmse: 1.60098 |  0:21:10s




epoch 6  | loss: 4.50908 | train_rmse: 1.49855 | valid_rmse: 1.30915 |  0:24:42s




epoch 7  | loss: 3.71876 | train_rmse: 2.30081 | valid_rmse: 2.20696 |  0:28:13s




epoch 8  | loss: 3.5821  | train_rmse: 4.97114 | valid_rmse: 1.78255 |  0:31:44s




epoch 9  | loss: 3.68711 | train_rmse: 2.48634 | valid_rmse: 2.48701 |  0:35:15s




epoch 10 | loss: 3.83364 | train_rmse: 1.22598 | valid_rmse: 1.24476 |  0:38:47s




epoch 11 | loss: 3.09975 | train_rmse: 1.19524 | valid_rmse: 1.203   |  0:42:19s




epoch 12 | loss: 3.46007 | train_rmse: 1.67109 | valid_rmse: 1.73886 |  0:45:50s




epoch 13 | loss: 3.38587 | train_rmse: 1.41498 | valid_rmse: 1.40668 |  0:49:20s




epoch 14 | loss: 3.25061 | train_rmse: 1.23776 | valid_rmse: 1.24307 |  0:52:49s




epoch 15 | loss: 2.95305 | train_rmse: 1.83977 | valid_rmse: 1.84581 |  0:56:18s




epoch 16 | loss: 3.06855 | train_rmse: 1.63279 | valid_rmse: 1.63574 |  0:59:49s




epoch 17 | loss: 3.29965 | train_rmse: 1.87758 | valid_rmse: 1.88109 |  1:03:20s




epoch 18 | loss: 2.69956 | train_rmse: 2.02468 | valid_rmse: 2.03353 |  1:06:47s




epoch 19 | loss: 2.99091 | train_rmse: 1.79969 | valid_rmse: 1.80557 |  1:10:14s




epoch 20 | loss: 3.02602 | train_rmse: 0.99624 | valid_rmse: 0.99847 |  1:13:42s




epoch 21 | loss: 3.09827 | train_rmse: 1.21558 | valid_rmse: 1.21628 |  1:17:08s




epoch 22 | loss: 2.81106 | train_rmse: 1.29716 | valid_rmse: 1.29984 |  1:20:36s




epoch 23 | loss: 2.76539 | train_rmse: 1.87378 | valid_rmse: 1.87847 |  1:24:05s




epoch 24 | loss: 2.63502 | train_rmse: 1.41589 | valid_rmse: 1.40815 |  1:27:33s




epoch 25 | loss: 2.74501 | train_rmse: 1.73883 | valid_rmse: 1.74407 |  1:31:02s




epoch 26 | loss: 2.72864 | train_rmse: 1.18718 | valid_rmse: 1.18847 |  1:34:30s




epoch 27 | loss: 2.63563 | train_rmse: 1.3942  | valid_rmse: 1.39788 |  1:37:59s




epoch 28 | loss: 2.63149 | train_rmse: 1.40567 | valid_rmse: 1.40152 |  1:41:26s




epoch 29 | loss: 2.63758 | train_rmse: 0.71731 | valid_rmse: 0.53291 |  1:44:53s




epoch 30 | loss: 2.586   | train_rmse: 1.29917 | valid_rmse: 1.28483 |  1:48:18s




epoch 31 | loss: 2.47828 | train_rmse: 1.83005 | valid_rmse: 1.83438 |  1:51:44s




epoch 32 | loss: 2.45024 | train_rmse: 1.05599 | valid_rmse: 1.05825 |  1:55:09s




epoch 33 | loss: 2.58466 | train_rmse: 1.61277 | valid_rmse: 1.61919 |  1:58:36s




epoch 34 | loss: 2.35191 | train_rmse: 1.59329 | valid_rmse: 1.59556 |  2:02:02s




epoch 35 | loss: 2.328   | train_rmse: 1.26145 | valid_rmse: 1.26705 |  2:05:29s




epoch 36 | loss: 2.40529 | train_rmse: 1.15634 | valid_rmse: 1.14169 |  2:08:57s




epoch 37 | loss: 2.4088  | train_rmse: 2.30682 | valid_rmse: 2.21325 |  2:12:25s




epoch 38 | loss: 2.31534 | train_rmse: 0.89964 | valid_rmse: 0.89301 |  2:15:53s




epoch 39 | loss: 2.35654 | train_rmse: 3.087   | valid_rmse: 3.08933 |  2:19:22s




epoch 40 | loss: 2.47357 | train_rmse: 1.09651 | valid_rmse: 1.08654 |  2:22:50s




epoch 41 | loss: 2.37866 | train_rmse: 2.33682 | valid_rmse: 2.33707 |  2:26:18s




epoch 42 | loss: 2.34451 | train_rmse: 1.39505 | valid_rmse: 1.39803 |  2:29:44s




epoch 43 | loss: 2.13925 | train_rmse: 0.65197 | valid_rmse: 0.62174 |  2:33:09s




epoch 44 | loss: 2.20365 | train_rmse: 1.222   | valid_rmse: 1.21902 |  2:36:36s




epoch 45 | loss: 2.13448 | train_rmse: 1.55065 | valid_rmse: 1.55191 |  2:40:03s




epoch 46 | loss: 2.12113 | train_rmse: 1.11716 | valid_rmse: 1.12107 |  2:43:31s




epoch 47 | loss: 2.15669 | train_rmse: 1.38936 | valid_rmse: 1.39228 |  2:46:58s




epoch 48 | loss: 2.09959 | train_rmse: 1.48104 | valid_rmse: 1.48366 |  2:50:27s




epoch 49 | loss: 2.11441 | train_rmse: 1.0421  | valid_rmse: 1.04085 |  2:53:55s




epoch 50 | loss: 1.98886 | train_rmse: 1.29563 | valid_rmse: 1.29853 |  2:57:22s




epoch 51 | loss: 2.03786 | train_rmse: 1.38182 | valid_rmse: 1.38093 |  3:00:49s




epoch 52 | loss: 2.10151 | train_rmse: 1.07441 | valid_rmse: 1.07416 |  3:04:17s




epoch 53 | loss: 2.05192 | train_rmse: 1.38725 | valid_rmse: 1.38722 |  3:07:44s




epoch 54 | loss: 2.08943 | train_rmse: 1.00761 | valid_rmse: 1.00768 |  3:11:10s




epoch 55 | loss: 2.04062 | train_rmse: 1.21882 | valid_rmse: 1.22344 |  3:14:37s




epoch 56 | loss: 2.02673 | train_rmse: 1.11314 | valid_rmse: 1.11671 |  3:18:04s




epoch 57 | loss: 2.12197 | train_rmse: 0.74601 | valid_rmse: 0.74409 |  3:21:31s




epoch 58 | loss: 1.88752 | train_rmse: 0.71059 | valid_rmse: 0.71072 |  3:24:56s




epoch 59 | loss: 2.05889 | train_rmse: 1.73666 | valid_rmse: 1.74057 |  3:28:23s

Early stopping occurred at epoch 59 with best_epoch = 29 and best_valid_rmse = 0.53291




: 

epoch 59 | loss: 2.05889 | train_rmse: 1.73666 | valid_rmse: 1.74057 |  3:28:23s

Early stopping occurred at epoch 59 with best_epoch = 29 and best_valid_rmse = 0.53291