In [None]:
import numpy as np #type: ignore
import pandas as pd #type: ignore
import joblib #type: ignore

from sklearn.preprocessing import StandardScaler #type: ignore
from sklearn.preprocessing import OneHotEncoder #type: ignore

import xgboost as xgb #type: ignore

from sklearn.metrics import mean_absolute_error, mean_squared_error #type: ignore
import sklearn.utils as skutils #type: ignore

from scripts.data_preprocessing import ( #type: ignore
    data_to_arrays, 
    calculate_statistics,
)
from scripts.utils import ( #type: ignore
    RandomState,
    MainArgs,
    set_seed,
)
from scripts.save_and_compute import ( #type: ignore
    save_and_return_df,
    interval_to_str,
)

In [None]:
# Use one seed value for both the project's RandomState helper and set_seed
seed = 42
random_state = RandomState(seed)
set_seed(seed)

In [None]:
# Run configuration: MainArgs is constructed without any overrides
args = MainArgs()

In [None]:
# Get data

# Placeholder location: replace it with the path to your pre-processed test data
test_data_path = 'path/to/your/test_data.csv'
test_data = pd.read_csv(test_data_path)

In [None]:
# Scale data

# A new scaler should be fit to each new dataset, so this one is fit on the
# test features themselves; only the columns listed in args.x_cols are rescaled.
scaler = StandardScaler()
feature_columns = test_data[args.x_cols]
test_data[args.x_cols] = scaler.fit_transform(feature_columns)

In [None]:
# Get data as 3D arrays

# data_to_arrays returns six values; test_data is rebound from the DataFrame
# to the first returned value.
(
    test_data,
    test_dates,
    test_ids,
    test_gender,
    test_age,
    test_raw_age,
) = data_to_arrays(args, test_data)

In [None]:
# Get statistics of each feature across n_timesteps as 2D arrays

feature_statistics = calculate_statistics(test_data)
test_data = feature_statistics  # downstream cells keep using the name test_data
print(feature_statistics.shape)

In [None]:
# Get gender as features

# Gender values must be in the form 'f', 'm', or 'F', 'M', or 'female', 'male', or 'Female', 'Male'
# NOTE(review): the encoder is fit on the test set itself, so the number and order of the
# appended columns depend on which values occur in test_gender -- confirm this matches the
# feature layout the model expects.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
gender_column = test_gender.reshape(-1, 1)
test_gender_oh = encoder.fit_transform(gender_column)
test_data = np.concatenate((test_data, test_gender_oh), axis=1)

In [None]:
# Scale output

# Load the persisted target scaler (presumably the one fitted on the training ages -- confirm).
# NOTE(security): joblib.load unpickles arbitrary objects; only load 'reg_scaler.pkl' from a trusted source.
scaler = joblib.load('reg_scaler.pkl')

# Apply the loaded parameters with transform(), never fit_transform(): re-fitting here would
# replace the loaded mean/scale with the test set's own statistics, and the later
# inverse_transform of the predictions would then use the wrong mapping.
# The scaled ages get their own name so that test_raw_age stays in its original units: the
# evaluation cell compares it with inverse-transformed predictions, and the results cell bins
# it with edges at 50/60/70/80.
test_scaled_age = scaler.transform(test_raw_age.reshape(-1, 1)).flatten()

In [None]:
# Load model

# Create an empty regressor, then restore its state from the saved JSON file
model_file = 'age_estimator.json'
best_model = xgb.XGBRegressor()
best_model.load_model(model_file)

In [None]:
# Predict and evaluate the model

# The raw model output is passed through the loaded scaler's inverse transform
model_output = best_model.predict(test_data)
predictions = scaler.inverse_transform(model_output.reshape(-1, 1)).flatten()

# 'balanced' sample weights are derived from the test_age labels of each sample
sample_weight = skutils.class_weight.compute_sample_weight('balanced', test_age)

# Mean Absolute Error (MAE), Mean Squared Error (MSE) and weighted (balanced) MAE
mae = mean_absolute_error(test_raw_age, predictions)
mse = mean_squared_error(test_raw_age, predictions)
bmae = mean_absolute_error(test_raw_age, predictions, sample_weight=sample_weight)

print(f'Results: MAE: {mae:.4f}, Balanced MAE: {bmae:.4f}, MSE: {mse:.4f}')


In [None]:
# Load weighted mean errors

weighted_means_table = pd.read_csv('weighted_means_age.csv')
# Drop the first column of the file (presumably a saved index -- confirm)
group_weighted_stats = weighted_means_table.iloc[:, 1:]
print(group_weighted_stats)

In [None]:
# Compute and save SAI

# One row per test sample; only the first column of test_dates is kept
results = pd.DataFrame({
    'true_age': test_raw_age,
    'pred_age': predictions,
    'age_bin': test_age,
    'gender': test_gender,
    'ids': test_ids,
    'dates': test_dates[:, 0],
})
results_df = save_and_return_df(data=results, filename='results.csv')

old_bins = [50, 60, 70, 80, np.inf]
bins = [-np.inf, 50, 55, 60, 65, 70, 75, 80, np.inf]  # not used in this cell

# Attach the per-age_bin statistics; the default inner merge keeps only rows whose
# age_bin also appears in group_weighted_stats
results_df = results_df.merge(group_weighted_stats, on='age_bin')

# Subtract the group's weighted mean from the prediction, then compare with the true age
adjusted_pred_age = results_df['pred_age'] - results_df['group_weighted_mean']
results_df['age_group_adjusted_pred_age'] = adjusted_pred_age
results_df['age_group_adjusted_difference'] = adjusted_pred_age - results_df['true_age']

# Left-closed intervals built from old_bins; true ages below the first edge fall outside every bin
old_age_intervals = pd.cut(
    results_df['true_age'],
    bins=np.sort(old_bins),
    right=False,
    include_lowest=False,
    ordered=True,
)
results_df['old_age_bin'] = old_age_intervals
results_df['old_age_bin'] = results_df['old_age_bin'].apply(interval_to_str)

# Written to the same 'results.csv' filename that was passed to save_and_return_df above
results_df.to_csv('results.csv')
print(results_df)