## reference
- https://deepctr-doc.readthedocs.io/en/latest/Examples.html#multi-value-input-movielens
- use deepfm

In [33]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.framework.ops import disable_eager_execution

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names

- read and prepare train and test data

In [34]:
movie_df = pd.read_csv("movies.csv")
user_df = pd.read_csv("users.csv")
rating_train = pd.read_csv("rating_train.csv")
rating_test = pd.read_csv("rating_test.csv")

# only keep the movie_id that appears in train set
movie_id_train = pd.DataFrame(rating_train["MovieID"]) \
                   .drop_duplicates(["MovieID"]) \
                   .set_index("MovieID")

movie_df = movie_df.set_index("MovieID").join(
        movie_id_train, 
        how="inner"
    ).reset_index(drop=False)

# extract the year of movie
movie_df["Year"] = movie_df["Title"].apply(lambda x: x[-5:-1])
movie_df["Title"] = movie_df["Title"].apply(lambda x: x[:-6].strip())

In [35]:
# for later embedding
user_id = sorted(user_df["UserID"].unique())

movie_id = sorted(movie_df["MovieID"].unique().tolist())
unknown_movie_id = 0
movie_id.append(unknown_movie_id)

age = sorted(user_df["Age"].unique())
occup_id = sorted(user_df["OccupationID"].unique())
zip_code = sorted(user_df["Zip-code"].unique())
year = ["unknown"] + sorted(movie_df["Year"].unique())
title = ["unknown"] + sorted(movie_df["Title"].unique())
gender = ["F", "M"]

embed_key_dict = {
    "UserID": user_id,
    "MovieID": movie_id,
    "OccupationID": occup_id,
    "Age": age,
    "Zip-code": zip_code,
    "Gender": gender,
    "Title": title,
    "Year": year
}

In [39]:
def prepare_data(raing, users, movies):
    data = raing.set_index("MovieID").join(
        movies.set_index("MovieID"), 
        how="left"
    )
    data = data.reset_index(drop=False).set_index("UserID").join(
        users.set_index("UserID"), 
        how="left"
    ).reset_index(drop=False)
    
    return data

In [40]:
train = prepare_data(rating_train, user_df, movie_df)
test = prepare_data(rating_test, user_df, movie_df)

# fill missing Title ad Genres in test as "unknown"
test["Year"] = test["Year"].fillna("unknown")
test["Title"] = test["Title"].fillna("unknown")
test["Genres"] = test["Genres"].fillna("unknown")

- label encoding for Gender, Title and Year

In [41]:
sparse_features = ["MovieID", "UserID", "OccupationID", "Gender", "Age", "Zip-code", "Title", "Year"]
target = ['Rating']

In [42]:
for feat in sparse_features:
    print("embed for %s" % feat)
    lbe = LabelEncoder()
    lbe.fit(embed_key_dict[feat])
    train[feat] = lbe.transform(train[feat])
    if feat == "MovieID":
        test[feat] = test[feat].map(lambda s: unknown_movie_id if s not in lbe.classes_ else s)
    test[feat] = lbe.transform(test[feat])

embed for MovieID
embed for UserID
embed for OccupationID
embed for Gender
embed for Age
embed for Zip-code
embed for Title
embed for Year


- embed Genres

In [43]:
def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in genre_key2index:
            genre_key2index[key] = len(genre_key2index) + 1
    return list(map(lambda x: genre_key2index[x], key_ans))

In [44]:
# preprocess the sequence feature, add NA type for missing/unknown Genres
genre_key2index = {"unknown": 1}
train_genres_list = list(map(split, train['Genres'].values.tolist()))
train_genres_length = np.array(list(map(len, train_genres_list)))
train_genre_max_len = max(train_genres_length)
train_genres_list = pad_sequences(train_genres_list, maxlen=train_genre_max_len, padding='post', )

In [45]:
test_genres_list = list(map(split, test['Genres'].values.tolist()))
test_genres_length = np.array(list(map(len, test_genres_list)))
test_genre_max_len = max(test_genres_length)
test_genres_list = pad_sequences(test_genres_list, maxlen=test_genre_max_len, padding='post', )

- prepare for feature columns

In [59]:
fixlen_feature_columns = [SparseFeat(
    feat, 
    train[feat].max() + 2, 
    embedding_dim=4) for feat in sparse_features
]

varlen_feature_columns = [VarLenSparseFeat(
    SparseFeat('Genres', 
               vocabulary_size=len(genre_key2index) + 1, 
               embedding_dim=4), 
    maxlen=train_genre_max_len, combiner='mean', weight_name=None)
]

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [60]:
# 3.generate input data for model
train_model_input = {name: train[name] for name in feature_names}  #
train_model_input["Genres"] = train_genres_list

test_model_input = {name: test[name] for name in feature_names}  
test_model_input["Genres"] = test_genres_list

In [61]:
mode = "train"

In [64]:
n_epochs = 20

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression',
              dnn_hidden_units=(256, 256, 256), l2_reg_linear=1e-4, l2_reg_embedding=1e-5, l2_reg_dnn=1e-4)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="model/deep_fm_epoch_%d" % n_epochs,
    save_weights_only=True, verbose=1
    )
opt = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.8)
model.compile(loss='mse', optimizer=opt, metrics=['mse'])

In [65]:
if mode == "train":
    history = model.fit(train_model_input, train[target].values,
                        batch_size=128, epochs=n_epochs, verbose=2, validation_split=0.3,
                        callbacks=[cp_callback])

Train on 560135 samples, validate on 240058 samples
Epoch 1/20

Epoch 00001: saving model to model/deep_fm_epoch_20
560135/560135 - 20s - loss: 1.2707 - mse: 1.2132 - val_loss: 1.1760 - val_mse: 1.1186
Epoch 2/20

Epoch 00002: saving model to model/deep_fm_epoch_20
560135/560135 - 18s - loss: 1.0391 - mse: 0.9816 - val_loss: 1.0485 - val_mse: 0.9909
Epoch 3/20

Epoch 00003: saving model to model/deep_fm_epoch_20
560135/560135 - 18s - loss: 0.9021 - mse: 0.8446 - val_loss: 1.0369 - val_mse: 0.9796
Epoch 4/20

Epoch 00004: saving model to model/deep_fm_epoch_20
560135/560135 - 18s - loss: 0.8828 - mse: 0.8257 - val_loss: 1.0508 - val_mse: 0.9940
Epoch 5/20

Epoch 00005: saving model to model/deep_fm_epoch_20
560135/560135 - 17s - loss: 0.8756 - mse: 0.8190 - val_loss: 1.0322 - val_mse: 0.9758
Epoch 6/20

Epoch 00006: saving model to model/deep_fm_epoch_20
560135/560135 - 17s - loss: 0.8707 - mse: 0.8145 - val_loss: 1.0315 - val_mse: 0.9756
Epoch 7/20

Epoch 00007: saving model to model/d

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [66]:
model.load_weights("model/deep_fm_epoch_%d" % n_epochs)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbeaeeff9b0>

In [67]:
y_pred = model.predict(test_model_input)

In [79]:
y_pred = y_pred.reshape(-1)

In [80]:
y_pred.shape

(200016,)

In [86]:
rating_test["Rating"] = y_pred
rating_test["Rating"] = rating_test["Rating"].round()
output_df = rating_test[["UserID", "MovieID", "Rating"]]
output_df.to_csv("Q5_output.csv", index=False, header=True)

In [87]:
rating_test["Rating"].value_counts()

4.0    119521
3.0     55829
5.0     12465
2.0     11275
1.0       925
6.0         1
Name: Rating, dtype: int64