# Partly Sunny with a Chance of Hashtags

## Metric changed from `log1p RMSE` to raw `RMSE` for both the Keras and Keras Tuner runs

## Keras - 1 Attempt

## Attempt 1

In [None]:

import random
import numpy as np
import tensorflow as tf
import pandas as pd
import json
import time
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Seed Python, NumPy and TensorFlow with the same value so runs are repeatable
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(42)

# Load data: only the header row of the sample submission is needed, since it
# names the id column (first) and the target columns (all the others)
sub_example = pd.read_csv('crowdflower-weather-twitter/sampleSubmission.csv.zip', nrows=1)
train_df = pd.read_csv('crowdflower-weather-twitter/train.csv.zip')
test_df = pd.read_csv('crowdflower-weather-twitter/test.csv.zip')
id_col, *target_columns = sub_example.columns

# Prepare X and y
# Remove training columns that contain no values at all (rows are kept)
train_df = train_df.dropna(axis=1, how='all')
_non_feature_cols = target_columns + [id_col]

y = train_df[target_columns].values
X = (
    train_df
    .drop(columns=_non_feature_cols, errors='ignore')
    .dropna(axis=1, how='all')
)

# Prepare test features and ids (targets are dropped only if present)
test_ids = test_df[id_col].reset_index(drop=True)
X_test = test_df.drop(columns=_non_feature_cols, errors='ignore')

# Feature engineering: split the feature columns by dtype.
# Object columns with more than 50 distinct values are left out of both lists
# and are therefore dropped by the ColumnTransformer below.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = [
    name
    for name, series in X.items()
    if series.dtype == 'object' and series.nunique() <= 50
]

# Preprocessing pipelines
# Numeric: median imputation followed by standardisation
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical: most-frequent imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_cols),
    ('cat', cat_pipeline, categorical_cols),
])

# Fit on the training features only, then apply the fitted transform to test
X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# Model architecture decision
# d = number of processed input features, m = number of target columns
d, m = X_proc.shape[1], len(target_columns)
n_samples = X_proc.shape[0]

model = Sequential()
# Declare the input shape with an explicit Input object. Passing `input_shape`
# to the first Dense layer makes recent Keras versions emit a deprecation
# warning when the layer is constructed.
model.add(tf.keras.Input(shape=(d,)))
if n_samples < 10000 or d < 100:
    # small dataset: two hidden layers whose widths scale with the feature
    # count, capped at 128 and 64 units
    units1 = min(d * 2, 128)
    units2 = min(d, 64)
    model.add(Dense(units1, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(units2, activation='relu'))
    model.add(Dropout(0.3))
else:
    # larger dataset: up to four tapering hidden layers, each capped at 1024.
    # int() is needed because the 0.5 and 0.25 multipliers yield floats and
    # Dense only accepts an integer unit count.
    layers = [int(min(d * i, 1024)) for i in (2, 1, 0.5, 0.25)]
    # Drop layers that would be narrower than 16 units
    layers = [u for u in layers if u >= 16]
    model.add(Dense(layers[0], activation='relu'))
    for u in layers[1:]:
        model.add(Dense(u, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization())
        model.add(Dropout(0.4))

# Output layer: one independent sigmoid unit (value in [0, 1]) per target column
model.add(Dense(m, activation='sigmoid'))

def mse_real(y_true_log, y_pred_log):
    """Return the mean squared error on the raw target scale.

    The targets passed to ``fit`` in this notebook are the untransformed label
    columns and the network ends in a sigmoid layer, so neither argument is in
    log1p space. Applying ``expm1`` to them (as was done previously) distorts
    the reported error, so the values are compared directly. The ``*_log``
    parameter names are kept only for backward compatibility.

    Args:
        y_true_log: Ground-truth target values (raw scale).
        y_pred_log: Model predictions (raw scale).

    Returns:
        Scalar tensor holding the mean of the squared differences.
    """
    y_pred = tf.convert_to_tensor(y_pred_log)
    # Align dtypes so the subtraction cannot fail on integer/float64 labels
    y_true = tf.cast(y_true_log, y_pred.dtype)
    return tf.reduce_mean(tf.square(y_true - y_pred))

def rmse_real(y_true_log, y_pred_log):
    """Return the root mean squared error on the raw target scale.

    The targets passed to ``fit`` in this notebook are the untransformed label
    columns and the network ends in a sigmoid layer, so neither argument is in
    log1p space. Applying ``expm1`` to them (as was done previously) distorts
    the reported error, so the values are compared directly. The ``*_log``
    parameter names are kept only for backward compatibility.

    Args:
        y_true_log: Ground-truth target values (raw scale).
        y_pred_log: Model predictions (raw scale).

    Returns:
        Scalar tensor holding the square root of the mean squared difference.
    """
    y_pred = tf.convert_to_tensor(y_pred_log)
    # Align dtypes so the subtraction cannot fail on integer/float64 labels
    y_true = tf.cast(y_true_log, y_pred.dtype)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

import os

# Compile model
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=[mse_real, rmse_real]
)

# Callbacks: stop after 10 epochs without val_loss improvement (rolling back
# to the best weights) and keep the best model on disk
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1),
    ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss', verbose=1)
]

# Training (the last 20% of the rows is held out for validation)
start_time = time.time()
history = model.fit(
    X_proc, y,
    validation_split=0.2,
    epochs=100,
    batch_size=128,
    callbacks=callbacks,
    verbose=2
)
duration = time.time() - start_time

# Save results
# EarlyStopping restores the weights of the epoch with the lowest val_loss, so
# report the metrics of that epoch rather than of the last epoch that ran.
best_epoch = int(np.argmin(history.history['val_loss']))
results = {
    'training_loss': history.history['mse_real'][best_epoch],
    'validation_loss': history.history['val_mse_real'][best_epoch],
    'training_RMSE': history.history['rmse_real'][best_epoch],
    'validation_RMSE': history.history['val_rmse_real'][best_epoch]
}
# Make sure the output directory exists before writing into it
os.makedirs('Keras', exist_ok=True)
with open('Keras/results.json', 'w') as f:
    json.dump(results, f)

# Predictions & Submission
raw_preds = model.predict(X_test_proc)
# The score tracked in this notebook is RMSE, which is minimised by the
# continuous sigmoid outputs; rounding them to 0/1 can only increase it.
final = raw_preds

submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids)
submission.to_csv('Keras/submission_result.csv', index=False)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100

Epoch 1: val_loss improved from inf to 0.69315, saving model to best_model.h5




488/488 - 2s - 4ms/step - loss: 0.6931 - mse_real: 0.4269 - rmse_real: 0.6533 - val_loss: 0.6931 - val_mse_real: 0.4261 - val_rmse_real: 0.6528
Epoch 2/100

Epoch 2: val_loss did not improve from 0.69315
488/488 - 1s - 2ms/step - loss: 0.6931 - mse_real: 0.4269 - rmse_real: 0.6533 - val_loss: 0.6931 - val_mse_real: 0.4261 - val_rmse_real: 0.6528
Epoch 3/100

Epoch 3: val_loss did not improve from 0.69315
488/488 - 1s - 2ms/step - loss: 0.6931 - mse_real: 0.4269 - rmse_real: 0.6533 - val_loss: 0.6931 - val_mse_real: 0.4261 - val_rmse_real: 0.6528
Epoch 4/100

Epoch 4: val_loss did not improve from 0.69315
488/488 - 1s - 2ms/step - loss: 0.6931 - mse_real: 0.4269 - rmse_real: 0.6533 - val_loss: 0.6931 - val_mse_real: 0.4261 - val_rmse_real: 0.6528
Epoch 5/100

Epoch 5: val_loss did not improve from 0.69315
488/488 - 1s - 2ms/step - loss: 0.6931 - mse_real: 0.4269 - rmse_real: 0.6533 - val_loss: 0.6931 - val_mse_real: 0.4261 - val_rmse_real: 0.6528
Epoch 6/100

Epoch 6: val_loss did not i

## Keras Tuner - 1 Attempt

## Attempt 1

In [None]:
import random
import numpy as np
import tensorflow as tf
import pandas as pd
import json
import time
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Seed Python, NumPy and TensorFlow with the same value so runs are repeatable
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(42)

# Load data: only the header row of the sample submission is needed, since it
# names the id column (first) and the target columns (all the others)
sub_example = pd.read_csv('crowdflower-weather-twitter/sampleSubmission.csv.zip', nrows=1)
train_df = pd.read_csv('crowdflower-weather-twitter/train.csv.zip')
test_df = pd.read_csv('crowdflower-weather-twitter/test.csv.zip')
id_col, *target_columns = sub_example.columns

# Prepare X and y
# Remove training columns that contain no values at all (rows are kept)
train_df = train_df.dropna(axis=1, how='all')
_non_feature_cols = target_columns + [id_col]

y = train_df[target_columns].values
X = (
    train_df
    .drop(columns=_non_feature_cols, errors='ignore')
    .dropna(axis=1, how='all')
)

# Prepare test features and ids (targets are dropped only if present)
test_ids = test_df[id_col].reset_index(drop=True)
X_test = test_df.drop(columns=_non_feature_cols, errors='ignore')

# Feature engineering: split the feature columns by dtype.
# Object columns with more than 50 distinct values are left out of both lists
# and are therefore dropped by the ColumnTransformer below.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = [
    name
    for name, series in X.items()
    if series.dtype == 'object' and series.nunique() <= 50
]

# Preprocessing pipelines
# Numeric: median imputation followed by standardisation
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical: most-frequent imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_cols),
    ('cat', cat_pipeline, categorical_cols),
])

# Fit on the training features only, then apply the fitted transform to test
X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# Model architecture decision
# n_samples = training rows, d = processed features, m = target columns
n_samples, d = X_proc.shape[0], X_proc.shape[1]
m = len(target_columns)

import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall

# Input dimension: number of columns produced by the preprocessor
n_features = X_proc.shape[1]

# Checkpoint callback: keep only the model with the lowest val_loss on disk
checkpoint = ModelCheckpoint(
    'best_model.h5',
    save_best_only=True,
    monitor='val_loss',
)
# Early stopping: give up after 10 epochs without val_loss improvement and
# roll the model back to its best weights
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='val_loss',
)

def mse_real(y_true_log, y_pred_log):
    """Return the mean squared error on the raw target scale.

    The targets passed to ``fit`` in this notebook are the untransformed label
    columns and the network ends in a sigmoid layer, so neither argument is in
    log1p space. Applying ``expm1`` to them (as was done previously) distorts
    the reported error, so the values are compared directly. The ``*_log``
    parameter names are kept only for backward compatibility.

    Args:
        y_true_log: Ground-truth target values (raw scale).
        y_pred_log: Model predictions (raw scale).

    Returns:
        Scalar tensor holding the mean of the squared differences.
    """
    y_pred = tf.convert_to_tensor(y_pred_log)
    # Align dtypes so the subtraction cannot fail on integer/float64 labels
    y_true = tf.cast(y_true_log, y_pred.dtype)
    return tf.reduce_mean(tf.square(y_true - y_pred))

def rmse_real(y_true_log, y_pred_log):
    """Return the root mean squared error on the raw target scale.

    The targets passed to ``fit`` in this notebook are the untransformed label
    columns and the network ends in a sigmoid layer, so neither argument is in
    log1p space. Applying ``expm1`` to them (as was done previously) distorts
    the reported error, so the values are compared directly. The ``*_log``
    parameter names are kept only for backward compatibility.

    Args:
        y_true_log: Ground-truth target values (raw scale).
        y_pred_log: Model predictions (raw scale).

    Returns:
        Scalar tensor holding the square root of the mean squared difference.
    """
    y_pred = tf.convert_to_tensor(y_pred_log)
    # Align dtypes so the subtraction cannot fail on integer/float64 labels
    y_true = tf.cast(y_true_log, y_pred.dtype)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# HyperModel
class MyHyperModel(kt.HyperModel):
    """Search space for a fully connected network with a sigmoid output.

    Tuned hyperparameters: number of hidden layers, units per layer,
    activation, dropout rate, optimizer and learning rate.
    """

    def build(self, hp):
        """Build and compile one candidate model.

        Args:
            hp: KerasTuner ``HyperParameters`` object to sample values from.

        Returns:
            A compiled Keras ``Model`` mapping ``n_features`` inputs to ``m``
            sigmoid outputs.
        """
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024, step=64)
        act = hp.Choice('activation', ['relu'])
        drop = hp.Float('dropout', 0.0, 0.5)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        inputs = Input(shape=(n_features,))
        x = inputs
        # Every hidden layer shares the same width, activation and dropout rate
        for _ in range(layers):
            x = Dense(units, activation=act)(x)
            x = Dropout(drop)(x)
        x = Dense(m, activation='sigmoid')(x)  # One sigmoid output per target column
        model = Model(inputs, x)

        # Instantiate the optimizer so the sampled learning rate is actually
        # applied; passing the bare name to compile() would always train with
        # the optimizer's default rate and make `learning_rate` a dead knob.
        optimizer_classes = {'adam': Adam}
        optimizer = optimizer_classes[opt](learning_rate=lr)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[mse_real, rmse_real])
        return model

# Tuner
bs = 64  # batch size
ep = 50  # epochs per trial

# overwrite=False means results already stored under 'bayesian_tuner' are
# reused instead of searching from scratch
tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    overwrite=False,
    project_name='bayesian_tuner'
)

# The former `if y is not None / else` wrapper had two identical branches, so
# a single call is equivalent.
tuner.search(
    X_proc, y,
    validation_split=0.2,
    batch_size=bs, epochs=ep,
    callbacks=[early_stopping, checkpoint]
)

# Rebuild a fresh (untrained) model from the best hyperparameters found
model = tuner.hypermodel.build(
    tuner.get_best_hyperparameters(1)[0]
)
start_time = time.time()
# Retrain model with original callbacks and data
history = model.fit(
    X_proc, y,
    validation_split=0.2,
    epochs=100, batch_size=bs,
    callbacks=[early_stopping, checkpoint],
    verbose=2
)
duration = time.time() - start_time

# Save results
# EarlyStopping restores the weights of the epoch with the lowest val_loss, so
# report the metrics of that epoch rather than of the last epoch that ran.
best_epoch = int(np.argmin(history.history['val_loss']))
results = {
    'training_loss': history.history['mse_real'][best_epoch],
    'validation_loss': history.history['val_mse_real'][best_epoch],
    'training_RMSE': history.history['rmse_real'][best_epoch],
    'validation_RMSE': history.history['val_rmse_real'][best_epoch]
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# Predictions & Submission
raw_preds = model.predict(X_test_proc)
# The score tracked in this notebook is RMSE, which is minimised by the
# continuous sigmoid outputs; rounding them to 0/1 can only increase it.
final = raw_preds

submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids)
submission.to_csv('submission_result.csv', index=False)

Trial 10 Complete [00h 03m 06s]
val_loss: 0.3097052574157715

Best val_loss So Far: 0.3097052574157715
Total elapsed time: 00h 32m 10s
Epoch 1/100




975/975 - 12s - 13ms/step - loss: 0.5541 - mse_real: 0.2993 - rmse_real: 0.5446 - val_loss: 0.4544 - val_mse_real: 0.2218 - val_rmse_real: 0.4709
Epoch 2/100




975/975 - 10s - 10ms/step - loss: 0.4044 - mse_real: 0.1974 - rmse_real: 0.4440 - val_loss: 0.3681 - val_mse_real: 0.1817 - val_rmse_real: 0.4261
Epoch 3/100




975/975 - 11s - 11ms/step - loss: 0.3492 - mse_real: 0.1778 - rmse_real: 0.4215 - val_loss: 0.3349 - val_mse_real: 0.1739 - val_rmse_real: 0.4169
Epoch 4/100




975/975 - 10s - 10ms/step - loss: 0.3273 - mse_real: 0.1739 - rmse_real: 0.4169 - val_loss: 0.3212 - val_mse_real: 0.1724 - val_rmse_real: 0.4151
Epoch 5/100




975/975 - 10s - 10ms/step - loss: 0.3179 - mse_real: 0.1732 - rmse_real: 0.4160 - val_loss: 0.3152 - val_mse_real: 0.1721 - val_rmse_real: 0.4147
Epoch 6/100




975/975 - 11s - 11ms/step - loss: 0.3137 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3124 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 7/100




975/975 - 10s - 11ms/step - loss: 0.3117 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3110 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 8/100




975/975 - 10s - 10ms/step - loss: 0.3108 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3103 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 9/100




975/975 - 11s - 11ms/step - loss: 0.3103 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3100 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 10/100




975/975 - 10s - 11ms/step - loss: 0.3100 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3098 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 11/100




975/975 - 11s - 11ms/step - loss: 0.3099 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3098 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 12/100




975/975 - 10s - 10ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 13/100




975/975 - 10s - 10ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 14/100




975/975 - 8s - 9ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 15/100




975/975 - 9s - 9ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 16/100




975/975 - 11s - 11ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 17/100




975/975 - 10s - 11ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 18/100




975/975 - 12s - 12ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 19/100




975/975 - 9s - 9ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 20/100
975/975 - 9s - 10ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 21/100




975/975 - 9s - 10ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 22/100




975/975 - 12s - 12ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 23/100
975/975 - 9s - 9ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 24/100
975/975 - 8s - 9ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 25/100
975/975 - 8s - 9ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 26/100
975/975 - 9s - 9ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 27/100
975/975 - 10s - 11ms/step - loss: 0.3098 - mse_real: 0.1730 - rmse_real: 0.4158 - val_loss: 0.3097 - val_mse_real: 0.1720 - val_rmse_real: 0.4146
Epoch 28/100
975/975 - 10s - 11ms/step - loss: 0.3098 - mse_real: 0

In [None]:
# Wall-clock seconds spent in the final `model.fit` call of the preceding cell
print(duration)


323.957296
