# imports


In [1]:
import numpy as np
import pandas as pd
import os
import json
import tensorflow as tf


from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Flatten, Dropout, Activation
from tensorflow.keras import regularizers

 Load and preprocess search history data

In [None]:
# Read the raw search log, parse its timestamps, and derive three per-search
# features: hour of day, day of week, and the length of the query string.
search_data = (
    pd.read_json('search_history.json')
    .assign(searchTime=lambda frame: pd.to_datetime(frame['searchTime']))
    .assign(
        hour_of_day=lambda frame: frame['searchTime'].dt.hour,
        day_of_week=lambda frame: frame['searchTime'].dt.dayofweek,
        # len() raises on a missing query, so every row is assumed to carry one.
        query_length=lambda frame: frame['searchQuery'].apply(len),
    )
)

Load and preprocess streaming history data

In [None]:
# Read the streaming log and parse the `endTime` column into datetimes.
streaming_data = pd.read_json('streaming_history.json').assign(
    endTime=lambda frame: pd.to_datetime(frame['endTime'])
)

Load user demographics data

In [None]:
# Read the user demographic records from the JSON file in the working directory.
demographics_data = pd.read_json(path_or_buf='user_demographics.json')

Preprocess user demographics data

In [None]:
# Parse birthdates, derive each user's age in completed years, and
# integer-encode the gender column.
demographics_data['birthdate'] = pd.to_datetime(demographics_data['birthdate'])

# Age = calendar-year difference, minus one when this year's birthday has not
# happened yet. Casting the timedelta to '<m8[Y]' is rejected by pandas >= 2.0
# (only s/ms/us/ns timedelta resolutions are supported), so age is computed
# from the date components instead. Rows with a missing birthdate yield NaN.
today = pd.Timestamp.today()
birthdate = demographics_data['birthdate']
birthday_pending = (birthdate.dt.month > today.month) | (
    (birthdate.dt.month == today.month) & (birthdate.dt.day > today.day)
)
demographics_data['age'] = today.year - birthdate.dt.year - birthday_pending.astype(int)

# LabelEncoder maps each distinct gender value to an integer code.
demographics_data['gender'] = LabelEncoder().fit_transform(demographics_data['gender'])

Merge data

In [None]:
# Join the three sources on `username`: search and streaming rows are combined
# with an outer join (users present in either source are kept), then the result
# is restricted to users that also have a demographics record (inner join).
# NOTE(review): the outer join can leave NaN in search- or streaming-only
# columns for users missing from one source -- confirm downstream cells cope.
merged_data = (
    search_data
    .merge(streaming_data, how='outer', on='username')
    .merge(demographics_data, how='inner', on='username')
)

Encode categorical variables

In [None]:
# Replace each distinct `platform` value with an integer code.
platform_encoder = LabelEncoder()
merged_data['platform'] = platform_encoder.fit_transform(merged_data['platform'])

Normalize numerical features

In [None]:
# Rescale `msPlayed` to the [0, 1] range with a min-max scaler.
# NOTE(review): the scaler is fit on the full merged frame, before the
# train/test split in a later cell, so test-set statistics influence the
# scaling -- consider fitting on the training rows only.
scaler = MinMaxScaler()
ms_played_scaled = scaler.fit_transform(merged_data[['msPlayed']])
merged_data['msPlayed'] = ms_played_scaled[:, 0]

Split data into train and test sets

In [None]:
# Hold out 20% of the merged rows for testing; the fixed seed keeps the
# split reproducible across runs.
train_data, test_data = train_test_split(
    merged_data,
    test_size=0.2,
    random_state=42,
)

Prepare inputs for LSTM architecture

In [None]:
# Stack the three search features into an array of shape (rows, 3, 1): the
# trailing axis is added so each feature is presented as one length-1 step.
search_feature_columns = ['hour_of_day', 'day_of_week', 'query_length']
search_inputs = np.asarray(train_data[search_feature_columns])[:, :, np.newaxis]

Prepare inputs for collaborative filtering

In [None]:
# Build the one-hot user-feature matrix from age, gender, and country.
user_features = train_data[['age', 'gender', 'country']].copy()

# `country` is not integer-encoded in any earlier visible cell, and
# to_categorical only accepts integer class ids, so encode it here when it is
# not already numeric.
if not pd.api.types.is_numeric_dtype(user_features['country']):
    user_features['country'] = LabelEncoder().fit_transform(user_features['country'])

# to_categorical on a (rows, 3) array returns (rows, 3, num_classes). The model
# declares a flat Input sized from user_inputs.shape[1], so collapse the last
# two axes into a single feature axis of length 3 * num_classes.
user_inputs = to_categorical(np.asarray(user_features))
user_inputs = user_inputs.reshape(user_inputs.shape[0], -1)

Prepare targets for collaborative filtering

In [None]:
# Training target: the scaled play duration as a (rows, 1) array.
target_inputs = train_data[['msPlayed']].to_numpy()

LSTM architecture for search history

In [None]:
# Search-history branch: a 64-unit LSTM over the 3-step, 1-feature sequence,
# followed by a 32-unit ReLU projection.
# NOTE(review): this branch is not connected to the autoencoder assembled in
# the visible cells -- confirm whether it is meant to feed into it.
lstm_input = Input(shape=(3, 1))
lstm_features = LSTM(64)(lstm_input)
lstm_output = Dense(32, activation='relu')(lstm_features)

 Autoencoder architecture for collaborative filtering


In [None]:
# Two-input network: user features and the play-duration target are each
# projected to 64 units, concatenated, squeezed through a sparse 32-unit
# bottleneck, and decoded back to the target's width.
user_input = Input(shape=(user_inputs.shape[1],))
user_embedding = Dense(64, activation='relu')(user_input)

target_input = Input(shape=(target_inputs.shape[1],))
target_embedding = Dense(64, activation='relu')(target_input)

merged_layer = Concatenate()([user_embedding, target_embedding])
# L1 activity penalty (10e-5 == 1e-4) encourages a sparse bottleneck code.
encoded = Dense(32, activation='relu', activity_regularizer=regularizers.l1(10e-5))(merged_layer)
decoded = Dense(64, activation='relu')(encoded)
# The network is trained against `target_inputs`, so the final layer must have
# the same width as that target. Without this projection the 64-unit output is
# silently broadcast against the narrower target inside the MSE loss.
decoded = Dense(target_inputs.shape[1])(decoded)


Build and compile the model

In [None]:
# Assemble the two-input model and compile it with Adam and an MSE loss.
autoencoder = Model(inputs=[user_input, target_input], outputs=decoded)
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# in TF 2.x and rejected by current Keras releases.
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss=MeanSquaredError())

Train the model

In [None]:
# Train for up to 10 epochs, holding out the last 20% of the training rows for
# validation; stop early once validation loss has not improved for 3 epochs.
autoencoder.fit(
    [user_inputs, target_inputs],
    target_inputs,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[EarlyStopping(patience=3)],
)

Extract embeddings from the trained model

In [None]:
# Sub-model that stops at the 64-unit user projection; it reuses the layer (and
# therefore the weights) of the trained network to embed the training users.
embedding_model = Model(
    inputs=user_input,
    outputs=user_embedding,
)
user_embeddings = embedding_model.predict(user_inputs)