In [None]:
import pandas as pd
import os

from sklearn.preprocessing import MultiLabelBinarizer

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from itertools import chain
import torch

from deepctr_torch.models import DCN
from deepctr_torch.inputs import (
    SparseFeat, DenseFeat,
    VarLenSparseFeat, get_feature_names
)


def pad_genres(seqs, maxlen):
    padded = np.zeros((len(seqs), maxlen), dtype='int32')
    for i, seq in enumerate(seqs):
        length = min(len(seq), maxlen)
        padded[i, :length] = seq[:length]
    return padded

# read movies.dat from movielens 1m dataset
def read_movies_1m():
    file_path = os.path.join('..', 'data', 'ml-1m', 'movies.dat')
    movies = pd.read_csv(file_path, sep='::', header=None, names=['movieId', 'title', 'genres'], engine='python', encoding='latin-1')

    # transforma the genres column into a list of genres
    movies['genres_list'] = movies['genres'].str.split('|')

    mlb = MultiLabelBinarizer()
    genres_encoded = mlb.fit_transform(movies['genres_list'])
    ohe_genres = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=movies.index)
    movies = pd.concat([movies, ohe_genres], axis=1)
    return movies

# read ratings.dat from movielens 1m dataset
def read_ratings_1m():
    file_path = os.path.join('..', 'data', 'ml-1m', 'ratings.dat')
    ratings = pd.read_csv(file_path, sep='::', header=None, names=['userId', 'movieId', 'rating', 'timestamp'], engine='python', encoding='latin-1')

    ratings['label'] = ratings['rating'] >= 4
    ratings.drop(columns='rating', inplace=True)
    # Convert timestamp to datetime and extract information about the day of the week/hour and so on
    ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
    ratings['day_of_week'] = ratings['timestamp'].dt.dayofweek
    ratings['hour'] = ratings['timestamp'].dt.hour
    ratings.drop(columns='timestamp', inplace=True)


    return ratings

# read users.dat from movielens 1m dataset
def read_users_1m():
    file_path = os.path.join('..', 'data', 'ml-1m', 'users.dat')
    users = pd.read_csv(file_path, sep='::', header=None, names=['userId', 'gender', 'age', 'occupation', 'zip'], engine='python', encoding='latin-1')
    return users

# Read the datasets
movies = read_movies_1m()
ratings = read_ratings_1m()
users = read_users_1m()

In [29]:
# Merge the datasets into a single DataFrame
df = (ratings
        .merge(users, on="userId")
        .merge(movies, on="movieId"))

sparse_features = ["userId", "movieId", "occupation", "gender", "zip"]

for f in sparse_features:
    df[f] = LabelEncoder().fit_transform(df[f])

# rescale to [0,1] so it isn’t dwarfed by embeddings
df["age"] = MinMaxScaler().fit_transform(df[["age"]])

# ▶ variable-length multi-hot feature – movie genres
df["genres_list"] = df["genres"].str.split("|")
all_genres = sorted(set(chain.from_iterable(df["genres_list"])))
genre2idx = {g: i + 1 for i, g in enumerate(all_genres)}   # 0 reserved for padding
df["genres_idx"] = df["genres_list"].apply(lambda lst: [genre2idx[g] for g in lst])
max_genre_len = df["genres_idx"].str.len().max()
df["genres"] = pad_genres(df["genres_idx"].tolist(), max_genre_len)

In [40]:
df["genres_idx"]

0                    [8]
1             [3, 4, 12]
2               [12, 14]
3                    [8]
4              [3, 4, 5]
               ...      
1000204              [5]
1000205      [8, 14, 17]
1000206           [5, 8]
1000207              [8]
1000208    [4, 8, 9, 15]
Name: genres_idx, Length: 1000209, dtype: object

In [None]:
mlb = MultiLabelBinarizer()
mlb.fit(df)


0           8
1           3
2          12
3           8
4           3
           ..
1000204     5
1000205     8
1000206     5
1000207     8
1000208     4
Name: genres, Length: 1000209, dtype: int32

In [37]:
df['genres'].sample(30)

956410     8
621889     5
796644     1
294334     8
893817     8
588955    15
271287     1
4289       7
995541     3
634063     1
356064     5
925714     8
753244     5
718852     1
889769     2
910484     1
576132     1
299702     7
497767     1
881066     5
875267     3
28365      8
911651     5
719996     5
873580     6
828903     1
428235    11
873749     8
648504     1
546856     5
Name: genres, dtype: int32

In [16]:
[SparseFeat(f, df[f].nunique(), embedding_dim=8) for f in sparse_features] + [DenseFeat("age", 1)]

[SparseFeat(name='userId', vocabulary_size=6040, embedding_dim=8, use_hash=False, dtype='int32', embedding_name='userId', group_name='default_group'),
 SparseFeat(name='movieId', vocabulary_size=3706, embedding_dim=8, use_hash=False, dtype='int32', embedding_name='movieId', group_name='default_group'),
 SparseFeat(name='occupation', vocabulary_size=21, embedding_dim=8, use_hash=False, dtype='int32', embedding_name='occupation', group_name='default_group'),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=8, use_hash=False, dtype='int32', embedding_name='gender', group_name='default_group'),
 SparseFeat(name='zip', vocabulary_size=3439, embedding_dim=8, use_hash=False, dtype='int32', embedding_name='zip', group_name='default_group'),
 DenseFeat(name='age', dimension=1, dtype='float32')]

In [33]:
# 4. Build DeepCTR feature columns
# -------------------------------------------------
fixlen_feats = [SparseFeat(f, df[f].nunique(), embedding_dim=8) for f in sparse_features] + [DenseFeat("age", 1)]

varlen_feats = [
    VarLenSparseFeat(
        SparseFeat("genres", vocabulary_size=len(genre2idx) + 1, embedding_dim=8),
        maxlen=max_genre_len,
        combiner="mean"
    )
]

linear_feature_columns = fixlen_feats + varlen_feats
dnn_feature_columns    = fixlen_feats + varlen_feats
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 5. Train / test split and input dicts
# -------------------------------------------------
train, test = train_test_split(df, test_size=0.2, random_state=42)

def build_inputs(df_part):
    X = {name: df_part[name].values                 # sparse & dense
         for name in sparse_features + ["age"]}
    X["genres"] = df_part["genres_idx"].tolist()    # var-len needs list-of-lists
    return X

train_input = build_inputs(train)
test_input  = build_inputs(test)

train_model_input = {name: train[name].to_numpy(dtype="int32" if name in sparse_feats else "float32")
                     for name in feature_names}

# 6. Compile & train DCN
# -------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

model = DCN(
    linear_feature_columns,
    dnn_feature_columns,
    task="binary",
    device=device,
    # optional hyper-params
    dnn_hidden_units=(128, 128),
    cross_num=3
)

model.compile("adam",
              "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"])


model.fit(
    #train_input, train["label"].values,
    train_model_input, train["label"].to_numpy(),
    batch_size=1024,
    epochs=5,
    verbose=2,
    validation_split=0.1
)

# 7. Evaluate
# -------------------------------------------------
preds = model.predict(test_input, batch_size=1024)
print("Test AUC:", roc_auc_score(test["label"].values, preds))

NameError: name 'sparse_feats' is not defined

In [31]:
for k, v in enumerate(train_input):
    print(k, type(v), getattr(v, "shape", None))

0 <class 'str'> None
1 <class 'str'> None
2 <class 'str'> None
3 <class 'str'> None
4 <class 'str'> None
5 <class 'str'> None
6 <class 'str'> None


In [28]:
train_input['genres'].

SyntaxError: invalid syntax (1572661882.py, line 1)

In [33]:
train_input['genres']

[[5, 8, 17],
 [16],
 [1, 16],
 [5, 8],
 [8],
 [1, 6],
 [8],
 [5, 14],
 [15, 16],
 [8, 17],
 [1],
 [5, 8],
 [6, 8],
 [5],
 [8],
 [5, 6, 9],
 [1, 8],
 [5, 8, 14],
 [16],
 [1, 2, 15],
 [8],
 [5, 6],
 [1, 5, 18],
 [1, 6, 16],
 [5, 8],
 [8, 17],
 [5],
 [8, 17],
 [8, 17],
 [8, 17],
 [8, 13, 16],
 [5, 15],
 [3, 5],
 [8],
 [8, 12],
 [8],
 [5],
 [5, 8],
 [1, 16],
 [2, 5],
 [1, 8],
 [8, 17],
 [6, 8],
 [4, 5, 9],
 [1, 5, 18],
 [8],
 [1, 18],
 [1, 15],
 [1, 6, 8],
 [6, 16],
 [1, 2, 5, 14],
 [5],
 [8],
 [5, 8],
 [8, 14],
 [1, 16],
 [8, 16],
 [1],
 [5, 8],
 [5, 8],
 [1, 15],
 [8],
 [8, 16],
 [1, 8],
 [10, 15],
 [3, 4, 5],
 [6, 8],
 [5],
 [5],
 [1, 2, 15],
 [11],
 [5],
 [8],
 [5, 8],
 [1, 15, 16],
 [1, 15, 16],
 [2, 9, 14],
 [1, 2, 16],
 [8],
 [6, 8, 15],
 [5],
 [1, 11],
 [11, 16],
 [5, 14],
 [1, 2, 16],
 [16],
 [11, 15, 16],
 [1, 2, 14, 15, 17],
 [5, 15],
 [15],
 [5, 17],
 [5],
 [5, 8, 14],
 [5],
 [15, 16],
 [2, 5, 12],
 [8],
 [3, 4, 12],
 [1, 15, 16, 18],
 [5],
 [8, 16],
 [8],
 [8, 14, 17],
 [1],
 