In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load dataset
# Read the AQ-Bench table from the Kaggle input directory and preview the first rows.
dataset_csv = '/kaggle/input/aq-bench/AQbench_dataset.csv'
df = pd.read_csv(dataset_csv)
df.head()


Unnamed: 0,id,country,htap_region,climatic_zone,lon,lat,alt,relative_alt,type,type_of_area,...,o3_perc90,o3_perc98,o3_dma8eu,o3_avgdma8epax,o3_drmdmax1h,o3_w90,o3_aot40,o3_nvgt070,o3_nvgt100,dataset
0,3336,Germany,EUR,cool_moist,8.30821,54.92497,12.0,3,background,rural,...,46.4399,54.8468,53.5738,38.8078,50.7704,86.1266,10197.4742,2.0,0.0,test
1,3338,Germany,EUR,cool_moist,12.72528,54.43667,1.0,1,background,rural,...,44.0575,53.7778,51.3996,35.8313,48.3935,69.0987,7573.2222,1.0,0.0,train
2,3339,Germany,EUR,cool_moist,6.093923,50.754704,205.0,66,background,urban,...,41.1803,58.4009,54.903,32.6169,49.8276,154.1263,8655.473,5.4,1.0,train
3,3340,Germany,EUR,cool_moist,8.548389,52.023169,102.0,29,background,urban,...,38.173,53.5109,50.1112,28.6179,46.2694,120.2575,6036.5851,2.6,0.0,test
4,3341,Germany,EUR,cool_moist,6.874554,51.862,45.0,8,background,rural,...,40.6623,58.4082,53.9564,31.0051,50.6809,203.4584,9045.4745,6.8,1.2,train


In [3]:
# Label Encoding
# Integer-encode every object-dtype column except 'dataset', which is still
# compared against the strings 'train' / 'test' / 'val' when the data is split.
str_columns = [col for col in df.select_dtypes(include=['object']).columns if col != 'dataset']
label_encoders = {}  # column name -> the LabelEncoder fitted on that column
for col in str_columns:
    le = LabelEncoder()
    # Fill missing values *before* casting to str: astype(str) turns NaN into the
    # literal string 'nan', after which fillna() has nothing left to replace.
    df[col] = le.fit_transform(df[col].fillna('NaN').astype(str))
    label_encoders[col] = le
df.head()


Unnamed: 0,id,country,htap_region,climatic_zone,lon,lat,alt,relative_alt,type,type_of_area,...,o3_perc90,o3_perc98,o3_dma8eu,o3_avgdma8epax,o3_drmdmax1h,o3_w90,o3_aot40,o3_nvgt070,o3_nvgt100,dataset
0,3336,26,1,3,8.30821,54.92497,12.0,3,0,1,...,46.4399,54.8468,53.5738,38.8078,50.7704,86.1266,10197.4742,2.0,0.0,test
1,3338,26,1,3,12.72528,54.43667,1.0,1,0,1,...,44.0575,53.7778,51.3996,35.8313,48.3935,69.0987,7573.2222,1.0,0.0,train
2,3339,26,1,3,6.093923,50.754704,205.0,66,0,4,...,41.1803,58.4009,54.903,32.6169,49.8276,154.1263,8655.473,5.4,1.0,train
3,3340,26,1,3,8.548389,52.023169,102.0,29,0,4,...,38.173,53.5109,50.1112,28.6179,46.2694,120.2575,6036.5851,2.6,0.0,test
4,3341,26,1,3,6.874554,51.862,45.0,8,0,1,...,40.6623,58.4082,53.9564,31.0051,50.6809,203.4584,9045.4745,6.8,1.2,train


In [4]:
def sine_cosine_encode(values, period=None):
    """Map cyclic values to their sine and cosine components.

    Args:
        values: Array-like of numbers; converted with ``np.array``.
        period: Length of one full cycle. When ``None``, the largest entry of
            ``values`` is used as the period.

    Returns:
        Tuple ``(sin_values, cos_values)`` holding the sine and cosine of
        ``2 * pi * values / period``.
    """
    data = np.array(values)
    cycle = data.max() if period is None else period
    phase = 2 * np.pi * data / cycle
    return np.sin(phase), np.cos(phase)
# Replace 'lon' with its sine ('lonx') and cosine ('lony') components, using a
# period of 360, then discard the raw column.
lon_sin, lon_cos = sine_cosine_encode(df['lon'], period=360)
df['lonx'] = lon_sin
df['lony'] = lon_cos
df = df.drop(columns=['lon'])
df.head()


Unnamed: 0,id,country,htap_region,climatic_zone,lat,alt,relative_alt,type,type_of_area,water_25km,...,o3_dma8eu,o3_avgdma8epax,o3_drmdmax1h,o3_w90,o3_aot40,o3_nvgt070,o3_nvgt100,dataset,lonx,lony
0,3336,26,1,3,54.92497,12.0,3,0,1,86.1,...,53.5738,38.8078,50.7704,86.1266,10197.4742,2.0,0.0,test,0.144498,0.989505
1,3338,26,1,3,54.43667,1.0,1,0,1,55.7,...,51.3996,35.8313,48.3935,69.0987,7573.2222,1.0,0.0,train,0.220277,0.975437
2,3339,26,1,3,50.754704,205.0,66,0,4,0.0,...,54.903,32.6169,49.8276,154.1263,8655.473,5.4,1.0,train,0.106159,0.994349
3,3340,26,1,3,52.023169,102.0,29,0,4,0.0,...,50.1112,28.6179,46.2694,120.2575,6036.5851,2.6,0.0,test,0.148645,0.988891
4,3341,26,1,3,51.862,45.0,8,0,1,0.0,...,53.9564,31.0051,50.6809,203.4584,9045.4745,6.8,1.2,train,0.119696,0.992811


In [5]:
# Column roles (input vs. target) are read from the AQ-Bench variables file.
var_df = pd.read_csv('/kaggle/input/aq-bench/AQbench_variables.csv')
role = var_df['input_target']
# 'lon' is excluded from the inputs and the 'lonx' / 'lony' columns are used in
# its place. (The former follow-up "remove 'lon' if present" check could never
# fire after this filter, so it has been dropped.)
input_cols = var_df.loc[(role == 'input') & (var_df['column_name'] != 'lon'), 'column_name'].tolist()
input_cols += ['lonx', 'lony']
target_cols = var_df.loc[role == 'target', 'column_name'].tolist()

# Slice each predefined partition once, then project it onto features / targets.
partitions = {name: df[df['dataset'] == name] for name in ('train', 'test', 'val')}
x_train, y_train = partitions['train'][input_cols], partitions['train'][target_cols]
x_test, y_test = partitions['test'][input_cols], partitions['test'][target_cols]
x_val, y_val = partitions['val'][input_cols], partitions['val'][target_cols]

# NOTE: once 'dataset' is dropped, this cell cannot be re-run on the same df,
# because the partition lookup above needs that column.
df = df.drop('dataset', axis=1)
x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape


((3348, 37), (3348, 15), (1115, 37), (1115, 15), (1114, 37), (1114, 15))

In [6]:
# Kaggle TPU-compatible TensorFlow model generator
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Per-target hyperparameters (layer sizes, activation, loss, learning rate, ...).
hyper_df = pd.read_csv('/kaggle/input/aq-bench/hyperparameters.csv')
# Connect to TPU
# Prefer the stable tf.distribute.TPUStrategy; fall back to the deprecated
# experimental alias only on TensorFlow builds that do not expose the stable one.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy_cls = getattr(tf.distribute, 'TPUStrategy', None) or tf.distribute.experimental.TPUStrategy
    tpu_strategy = tpu_strategy_cls(tpu)
    print('Running on TPU')
except Exception as e:
    # Best-effort: any failure while resolving or initialising the TPU falls back
    # to the current default strategy, but the reason is reported, not hidden.
    tpu_strategy = tf.distribute.get_strategy()
    print('TPU not found, defaulting to', tpu_strategy)
    print('Reason:', repr(e))
def build_model(target, input_dim=None):
    """Build and compile a dense Keras model for one target column.

    Layer sizes, activation, loss, learning rate and L2 strength are read from
    the first row of the module-level ``hyper_df`` whose ``column_name`` equals
    ``target``. The model is created inside ``tpu_strategy.scope()``.

    Args:
        target: Value to look up in ``hyper_df['column_name']``.
        input_dim: Number of input features. Defaults to the number of columns
            of the module-level ``x_train``.

    Returns:
        A compiled ``keras.Sequential`` model: one L2-regularised ``Dense``
        layer per entry of the row's ``hidden layers`` value, followed by a
        ``Dense(1)`` output layer, compiled with Adam and the row's loss.

    Raises:
        IndexError: If ``hyper_df`` has no row for ``target``.
        ValueError, SyntaxError: If the ``hidden layers`` cell is not a plain
            Python literal.
    """
    import ast  # local import: only needed to parse the 'hidden layers' cell

    row = hyper_df[hyper_df['column_name'] == target].iloc[0]
    # SECURITY: this cell comes from an external CSV and used to be passed to
    # eval(), which would execute arbitrary code. literal_eval accepts only
    # plain literals such as "[64, 32]" or "(128,)".
    hidden_layers = ast.literal_eval(str(row['hidden layers']))
    if isinstance(hidden_layers, int):
        # A bare number means a single hidden layer of that width.
        hidden_layers = [hidden_layers]
    activation = row['activation']
    loss = row['loss']
    lr = float(row['learning rate'])
    l2 = float(row['L2 lambda'])
    if input_dim is None:
        input_dim = x_train.shape[1]
    with tpu_strategy.scope():
        model = keras.Sequential()
        for h in hidden_layers:
            model.add(layers.Dense(h, activation=activation, kernel_regularizer=keras.regularizers.l2(l2)))
        model.add(layers.Dense(1))
        model.build((None, input_dim))
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), loss=loss)
    return model
# Example: pick a target and build/fit using its hyperparameters
example_target = 'o3_average_values'
# Fetch the hyperparameter row once. .iloc[0] yields scalar cells, which avoids
# calling int() on a single-element Series (deprecated in pandas) and matches
# the first-row lookup done inside build_model.
example_row = hyper_df[hyper_df['column_name'] == example_target].iloc[0]
model = build_model(example_target)
history = model.fit(
    x_train, y_train[example_target],
    batch_size=int(example_row['batch size']),
    epochs=int(example_row['epochs']),
    validation_data=(x_val, y_val[example_target]),
)




NameError: name 'strategy' is not defined