In [1]:
import config as cfg
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from pathlib import Path
from sklearn.model_selection import train_test_split
import pickle

In [2]:
from contextlib import suppress


clean_data = pd.read_pickle(cfg.PATHS.TRANSFORMED_DATA_PICKLE)
with suppress(Exception):
    clean_data.drop("text", axis=1, inplace=True)
with open(cfg.PATHS.EMBEDDINGS, "rb") as f:
    embeddings = pickle.load(f)

In [3]:
len(embeddings), len(clean_data)


(223152, 223152)

In [4]:
# Split the data into training and testing sets
X = clean_data.drop(["useful", "funny", "cool"], axis=1)
y = clean_data[["useful", "funny", "cool"]]

# log transform the target variables
y = np.log1p(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
embeddings_train = embeddings[: X_train.shape[0]]
embeddings_test = embeddings[X_train.shape[0] :]

# Keras Model

In [5]:
import keras.backend as K
import tensorflow as tf


def custom_loss(y_true, y_pred):
    # Compute the weight for each output based on the corresponding non-zero value of the predictor
    predictor_weight = K.cast(K.not_equal(y_true, 0), dtype="float32")
    # Compute the weighted mean squared error loss
    mse = K.mean(K.square(y_true - y_pred) * predictor_weight, axis=-1)
    return mse

In [6]:
from keras.layers import Input, Dense, Concatenate, Flatten
from keras.models import Model
from keras.optimizers import Adam

# Define the input layers
embedding_dim = embeddings.shape[1]
num_predictors = X.shape[1]
review_input = Input(shape=(embedding_dim,))
predictor_input = Input(shape=(num_predictors,))

# Concatenate the review text embedding and predictor inputs
concatenated = Concatenate()([review_input, predictor_input])

# Define the fully connected layers
fc0 = Dense(256, activation="relu")(concatenated)
fc1 = Dense(128, activation="relu")(fc0)
fc2 = Dense(64, activation="relu")(fc1)
fc3 = Dense(32, activation="relu")(fc2)

# Define the output layers
helpful_output = Dense(1, name="useful")(fc3)
funny_output = Dense(1, name="funny")(fc3)
cool_output = Dense(1, name="cool")(fc3)

# Define the model with multiple outputs
model = Model(inputs=[review_input, predictor_input], outputs=[helpful_output, funny_output, cool_output])

# Compile the model
model.compile(optimizer=Adam(lr=0.001), loss=custom_loss, metrics=["mae"])


  super().__init__(name, **kwargs)


In [7]:
# Fit the model with numpy arrays where appropriate
import tensorflow as tf


model.fit(
    [embeddings_train, np.array(X_train, "float32")],
    [
        np.array(y_train["useful"], "float32"),
        np.array(y_train["funny"], "float32"),
        np.array(y_train["cool"], "float32"),
    ],
    epochs=10,
    batch_size=32,
)

Epoch 1/10


2023-04-14 10:51:12.739551: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x154892a30>

In [8]:
# test the model
model.evaluate(
    [embeddings_test, np.array(X_test, "float32")],
    [
        np.array(y_test["useful"], "float32"),
        np.array(y_test["funny"], "float32"),
        np.array(y_test["cool"], "float32"),
    ],
    batch_size=32,
)



[0.7229881286621094,
 0.21914631128311157,
 0.2856139838695526,
 0.21822772920131683,
 0.35215380787849426,
 0.5724189281463623,
 0.47042757272720337]

In [9]:
model.predict(
    [embeddings_test, np.array(X_test, "float32")],
    batch_size=32,
)



[array([[2.1335125],
        [2.4742978],
        [2.473797 ],
        ...,
        [2.1769643],
        [2.2657447],
        [2.2641191]], dtype=float32),
 array([[1.4107097],
        [1.585256 ],
        [1.3632984],
        ...,
        [1.3452603],
        [1.6139908],
        [1.7633919]], dtype=float32),
 array([[1.7412698],
        [2.3082914],
        [1.0105609],
        ...,
        [1.9044187],
        [1.889236 ],
        [2.0315876]], dtype=float32)]