In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front. The option is applied to every visible GPU, because TensorFlow
# requires memory growth to be configured identically on all of them.
physical_devices = tf.config.list_physical_devices('GPU')
try:
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth can only be changed before the GPUs are initialised
    # (e.g. this cell was re-run in a live kernel); keep the current setting.
    print(f"Could not enable GPU memory growth: {err}")

In [None]:
# Locations of the competition inputs; all of them live under one directory.
_DATA_ROOT = '/kaggle/input/child-mind-institute-problematic-internet-use'

series_folder = f'{_DATA_ROOT}/series_train.parquet'  # folder of training time series
train_csv_path = f'{_DATA_ROOT}/train.csv'
test_csv_path = f'{_DATA_ROOT}/test.csv'

In [None]:
# Read the test table. Its column index is kept in `columns`, which serves as
# the default feature filter of load_and_preprocess_tabular_data.
test_df = pd.read_csv(test_csv_path)
columns = test_df.columns

In [None]:
# Read the training table and keep its column index for comparison with the
# test columns.
train_df = pd.read_csv(train_csv_path)
train_columns = train_df.columns

In [None]:
# Print every column that exists in the training table but not in the test table.
train_only_columns = [name for name in train_columns if name not in columns]
for name in train_only_columns:
    print(name)

In [None]:
def load_and_preprocess_tabular_data(csv_path, columns=columns):
    """Load a tabular CSV and turn it into a fully numeric feature frame.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read. The file must contain an ``id`` column.
    columns : collection of str
        Names of the columns that may be used as features. Defaults to the
        module-level ``columns`` as it was when this function was defined.

    Returns
    -------
    X : pandas.DataFrame
        Features with missing values imputed and non-numeric columns
        label-encoded.
    y : pandas.Series or None
        The ``sii`` column (rows without a label are still NaN), or ``None``
        when the file has no ``sii`` column.
    ids : pandas.Series
        The ``id`` column of the file.

    Notes
    -----
    Imputation values and label encoders are computed from the file being
    loaded, independently on every call.
    NOTE(review): encoded categories are therefore only comparable between
    two files if both contain the same set of category values - confirm for
    the train/test pair.
    """
    df = pd.read_csv(csv_path)

    if 'sii' in df.columns:
        y = df['sii']
        # Labeled file: drop the identifier and every column not listed in
        # `columns`, so only the allowed features remain.
        extras = [name for name in df.columns if name not in columns]
        extras.append('id')
        X = df.drop(columns=extras)
    else:
        y = None
        X = df.drop(columns=['id'])

    # Every column is either numeric or treated as categorical, so that no
    # dtype (float32, bool, string, ...) is left unprocessed.
    numeric_cols = X.select_dtypes(include='number').columns
    categorical_cols = X.select_dtypes(exclude='number').columns

    # Numeric columns: fill gaps with the column mean. A column without any
    # observed value has a NaN mean; fall back to 0 so no NaN survives.
    if len(numeric_cols) > 0:
        fill_values = X[numeric_cols].mean().fillna(0.0)
        X[numeric_cols] = X[numeric_cols].fillna(fill_values)

    # Categorical columns: fill gaps with the most frequent value (when there
    # is one), then label-encode. Handling columns one by one also keeps this
    # step safe when there are no categorical columns at all.
    for col in categorical_cols:
        modes = X[col].mode()
        if not modes.empty:
            X[col] = X[col].fillna(modes.iloc[0])
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))  # str cast handles mixed types

    return X, y, df['id']

In [None]:
def scale_features(X_train, X_test=None, scaler=None):
    """Standardise features, fitting a scaler only when that is needed.

    Parameters
    ----------
    X_train : array-like
        Data to scale. It is used to fit the scaler only when no fitted
        scaler is supplied.
    X_test : array-like, optional
        Additional data scaled with the same scaler (never used for fitting).
    scaler : object, optional
        Scaler exposing ``fit_transform`` and ``transform``. When it is already
        fitted it is reused as is, so data scaled later (e.g. the test set)
        gets the statistics learned earlier. When omitted, a new
        ``StandardScaler`` is created and fitted on ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, scaler)``; ``X_test_scaled`` is
        ``None`` when ``X_test`` is not given.
    """
    if scaler is None:
        scaler = StandardScaler()
        needs_fit = True
    else:
        # scikit-learn estimators keep learned state in attributes whose names
        # end with an underscore; their presence marks a fitted scaler.
        learned = [
            attr for attr in getattr(scaler, '__dict__', {})
            if attr.endswith('_') and not attr.startswith('__')
        ]
        needs_fit = not learned

    if needs_fit:
        X_train_scaled = scaler.fit_transform(X_train)
    else:
        # Refitting here would silently replace the stored statistics.
        X_train_scaled = scaler.transform(X_train)

    X_test_scaled = scaler.transform(X_test) if X_test is not None else None
    return X_train_scaled, X_test_scaled, scaler


In [None]:
def load_time_series_data(series_folder, ids, max_timesteps=500,
                          feature_cols=('X', 'Y', 'Z', 'enmo')):
    """Load fixed-length time series for a list of ids.

    Each id is looked up at ``<series_folder>/id=<id>/part-0.parquet``. Only
    the first ``max_timesteps`` rows are used; shorter series are padded with
    zeros at the end, and ids without a file yield an all-zero series.

    Parameters
    ----------
    series_folder : str
        Folder containing one ``id=<id>`` sub-folder per recording.
    ids : iterable
        Ids to load; the output rows follow this order.
    max_timesteps : int, optional
        Number of timesteps kept per series.
    feature_cols : sequence of str, optional
        Columns read from each parquet file.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(ids), max_timesteps, len(feature_cols))``.
        The shape is three-dimensional even when ``ids`` is empty.
    """
    feature_cols = list(feature_cols)
    ids = list(ids)

    # Pre-filled with zeros: missing ids and the padded tail need no extra work.
    batch = np.zeros((len(ids), max_timesteps, len(feature_cols)), dtype=np.float64)

    for row, _id in enumerate(ids):
        series_path = os.path.join(series_folder, f"id={_id}/part-0.parquet")
        if not os.path.exists(series_path):
            continue  # no recording for this id: the row stays all zeros

        # Read only the columns that are used instead of the whole file.
        series_df = pd.read_parquet(series_path, columns=feature_cols)
        values = series_df.iloc[:max_timesteps].fillna(0).to_numpy()
        batch[row, :values.shape[0]] = values

    return batch

In [None]:
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import cohen_kappa_score
import numpy as np

class QWKCallback(Callback):
    """Keras callback that reports quadratic weighted kappa (QWK) each epoch.

    Parameters
    ----------
    validation_data : tuple, optional
        ``(X_tabular_val, X_series_val, y_val)``: the two model inputs and the
        true labels of the hold-out set. It can also be assigned later through
        the ``validation_data`` attribute, but it must be set before training
        starts.
    """

    def __init__(self, validation_data=None):
        super().__init__()
        # TF2 Keras does not populate `validation_data` on callbacks, so the
        # hold-out set has to be handed over explicitly.
        if validation_data is not None:
            self.validation_data = validation_data

    def _require_validation_data(self):
        """Return the validation tuple or raise a clear error when it is missing."""
        val_data = getattr(self, 'validation_data', None)
        if val_data is None:
            raise ValueError(
                "QWKCallback needs validation_data=(X_tabular_val, X_series_val, y_val)"
            )
        return val_data

    def on_train_begin(self, logs=None):
        # Fail before the first epoch rather than after it.
        self._require_validation_data()

    def on_epoch_end(self, epoch, logs=None):
        val_data = self._require_validation_data()

        # Tabular and time-series inputs; verbose=0 avoids a progress bar per epoch.
        val_pred = self.model.predict([val_data[0], val_data[1]], verbose=0)
        val_true = np.asarray(val_data[2])

        # Class with the highest predicted probability
        val_pred_classes = np.argmax(val_pred, axis=-1)

        qwk = cohen_kappa_score(val_true, val_pred_classes, weights="quadratic")

        print(f"\nEpoch {epoch + 1}: QWK = {qwk:.4f}")
        # `logs` may be None; only record the metric when a dict is provided.
        if logs is not None:
            logs['qwk'] = qwk


In [None]:
from tensorflow.keras import layers, models, regularizers

def build_dual_head_model(input_shape_tabular, input_shape_series,
                          num_classes=4, dropout_rate=0.3, l2_strength=0.001):
    """Build the two-input classifier (tabular head + time-series head).

    Parameters
    ----------
    input_shape_tabular : int or tuple
        Number of tabular features (e.g. ``80``) or an input shape such as
        ``(80,)``.
    input_shape_series : tuple
        Shape of one time series, ``(timesteps, features)``.
    num_classes : int, optional
        Number of output classes of the softmax layer.
    dropout_rate : float, optional
        Rate used by every Dropout layer.
    l2_strength : float, optional
        L2 factor applied to the kernels of the hidden Dense layers.

    Returns
    -------
    keras Model
        Uncompiled model taking ``[tabular_input, time_series_input]``.
    """
    # Accept a plain feature count as well as an already-formed shape.
    if np.ndim(input_shape_tabular) == 0:
        input_shape_tabular = (int(input_shape_tabular),)
    else:
        input_shape_tabular = tuple(input_shape_tabular)

    def dense_block(tensor, units, dropout=True):
        # Dense -> BatchNorm -> (optional) Dropout, the unit repeated below.
        tensor = layers.Dense(units, activation='relu',
                              kernel_regularizer=regularizers.l2(l2_strength))(tensor)
        tensor = layers.BatchNormalization()(tensor)
        if dropout:
            tensor = layers.Dropout(dropout_rate)(tensor)
        return tensor

    # Tabular input head
    input_tabular = layers.Input(shape=input_shape_tabular, name="tabular_input")
    x1 = dense_block(input_tabular, 128)
    x1 = dense_block(x1, 64)
    x1 = dense_block(x1, 32, dropout=False)

    # Time-series input head: two stacked LSTMs, the second one returning
    # only its final state.
    input_series = layers.Input(shape=input_shape_series, name="time_series_input")
    x2 = layers.LSTM(64, return_sequences=True)(input_series)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Dropout(dropout_rate)(x2)
    x2 = layers.LSTM(32)(x2)
    x2 = layers.BatchNormalization()(x2)

    # Merge both heads and classify
    merged = layers.concatenate([x1, x2])
    x = dense_block(merged, 64)
    output = layers.Dense(num_classes, activation='softmax')(x)

    return models.Model(inputs=[input_tabular, input_series], outputs=output)



In [None]:
def custom_loss(y_true, y_pred):
    """Sparse categorical cross-entropy that ignores samples labeled ``-1``.

    Parameters
    ----------
    y_true : tensor
        Integer class labels; ``-1`` marks a sample without a label.
    y_pred : tensor
        Predicted class probabilities, one row per sample.

    Returns
    -------
    tensor
        Scalar: mean cross-entropy over the labeled samples, or 0 when the
        batch contains no labeled sample.
    """
    y_true = tf.reshape(y_true, [-1])
    is_labeled = tf.not_equal(y_true, tf.cast(-1, y_true.dtype))
    mask = tf.cast(is_labeled, y_pred.dtype)

    # -1 is not a valid class index; substitute class 0 for the masked samples
    # so the loss is computable. Their contribution is zeroed out below.
    safe_labels = tf.where(is_labeled, y_true, tf.zeros_like(y_true))

    # Per-sample losses: the mask has to be applied before any averaging.
    per_sample = tf.keras.losses.sparse_categorical_crossentropy(safe_labels, y_pred)
    per_sample = tf.cast(per_sample, y_pred.dtype)

    labeled_count = tf.maximum(tf.reduce_sum(mask), tf.cast(1.0, mask.dtype))
    return tf.reduce_sum(per_sample * mask) / labeled_count

# 6. Compile the model
def compile_model(model):
    """Compile ``model`` for sparse multi-class classification and return it.

    Uses the Adam optimizer with a learning rate of 1e-3, the
    ``sparse_categorical_crossentropy`` loss and the accuracy metric. The
    model is compiled in place; the same object is returned for convenience.
    """
    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-3),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
def train_model(model, X_tabular_train, X_series_train, y_train, epochs=20, batch_size=32):
    """Fit ``model`` on the tabular and time-series inputs.

    20% of the data is passed to ``fit`` as ``validation_split``. Returns
    whatever ``model.fit`` returns.
    """
    model_inputs = [X_tabular_train, X_series_train]
    fit_options = dict(epochs=epochs, batch_size=batch_size, validation_split=0.2)
    return model.fit(model_inputs, y_train, **fit_options)

# 8. Inference and saving predictions
def predict_and_save(model, X_tabular_test, X_series_test, ids_test, output_csv):
    """Predict a class for every test row and write ``id``/``sii`` to a CSV.

    The predicted class is the index of the highest score in each row of the
    model output. The file is written to ``output_csv`` without the index
    column.
    """
    class_scores = model.predict([X_tabular_test, X_series_test])

    submission = pd.DataFrame({
        'id': ids_test,
        'sii': np.argmax(class_scores, axis=1),
    })
    submission.to_csv(output_csv, index=False)

In [None]:
# 1. Load the training table: features, labels and ids
X_tabular, y_train, ids_train = load_and_preprocess_tabular_data(train_csv_path)

# Keep only the rows that carry a label
labeled_mask = y_train.notna()
X_tabular_train = X_tabular.loc[labeled_mask]
y_train = y_train.loc[labeled_mask].astype(int)

# 2. Scale the tabular features; the scaler is kept for later use
X_tabular_train_scaled, _, scaler = scale_features(X_tabular_train)

# 3. Load the time series of the labeled ids, in the same row order
X_series_train = load_time_series_data(series_folder, ids_train.loc[labeled_mask])

# 4. Input shapes for the model
input_shape_tabular = X_tabular_train_scaled.shape[1]  # number of tabular features
input_shape_series = X_series_train.shape[1:]  # (timesteps, features)


In [None]:
# Build the two-input network, then attach optimizer, loss and metrics.
model = compile_model(
    build_dual_head_model(input_shape_tabular, input_shape_series)
)

In [None]:
# Look for NaN/Inf in each array that is passed to training.
tabular_has_nan = np.any(np.isnan(X_tabular_train_scaled))
tabular_has_inf = np.any(np.isinf(X_tabular_train_scaled))
series_has_nan = np.any(np.isnan(X_series_train))
series_has_inf = np.any(np.isinf(X_series_train))
labels_have_nan = np.any(np.isnan(y_train))
labels_have_inf = np.any(np.isinf(y_train))

# Tabular features
print(f"Tabular data has NaN: {tabular_has_nan}")
print(f"Tabular data has Inf: {tabular_has_inf}")

# Time-series features
print(f"Time-series data has NaN: {series_has_nan}")
print(f"Time-series data has Inf: {series_has_inf}")

# Labels
print(f"Labels have NaN: {labels_have_nan}")
print(f"Labels have Inf: {labels_have_inf}")


In [None]:
# Display the largest value of the scaled tabular features (quick range check).
X_tabular_train_scaled.max()

In [None]:
# Show the distinct label values present in the training labels.
print(np.unique(y_train)) 

In [None]:
# Train for 20 epochs with batches of 32; the value returned by train_model is kept in `history`.
history = train_model(model, X_tabular_train_scaled, X_series_train, y_train, epochs=20, batch_size=32)

In [None]:
# Preprocess the test table (no sii column here, so no labels are returned)
X_tabular_test, _, ids_test = load_and_preprocess_tabular_data(test_csv_path)

# Present the features in exactly the order used for the training features.
X_tabular_test = X_tabular_test[X_tabular_train.columns]

# Standardise with the statistics learned on the training set; the scaler
# must not be refitted on test data.
X_tabular_test_scaled = scaler.transform(X_tabular_test)

# A feature with no observed value in the test file cannot be mean-imputed and
# may still be NaN; 0 corresponds to the training mean after standardisation.
X_tabular_test_scaled = np.nan_to_num(X_tabular_test_scaled, nan=0.0)

In [None]:
# Time series of the test ids, read from the test series folder
series_test_folder = '/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet'
X_series_test = load_time_series_data(series_test_folder, ids_test)

In [None]:
# Predict the test labels and write them to submission.csv.
predict_and_save(model, X_tabular_test_scaled, X_series_test, ids_test, 'submission.csv')