In [1]:
import pandas as pd
import multiprocessing
import deepctr
from deepctr.models import WDL
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# data load
data = pd.read_csv('../ratings.csv')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [3]:
# data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     1048575 non-null  int64  
 1   movieId    1048575 non-null  int64  
 2   rating     1048575 non-null  float64
 3   timestamp  1048575 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 32.0 MB


In [4]:
# features
sparse_features = ['userId', 'movieId', 'timestamp']
dense_features = []
target = ['rating']

# generate feature columns
sparse_feature_columns = [SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4) for name in sparse_features]
dense_feature_columns = [DenseFeat(name, dimension=4) for name in dense_features]

linear_feature_columns = sparse_feature_columns + dense_feature_columns
deep_feature_columns = sparse_feature_columns + dense_feature_columns

fixlen_feature_columns = linear_feature_columns + deep_feature_columns

feature_names = get_feature_names(fixlen_feature_columns)

In [5]:
feature_names

['userId', 'movieId', 'timestamp']

In [6]:
# data normalize
# sparse data
for name in sparse_features:
    lbe = LabelEncoder()
    data[name] = lbe.fit_transform(data[name])

# dense data
# ss = StandardScaler()
# mms = MinMaxScaler()
# data[dense_features] = ss.fit_transform(data[dense_features])

In [7]:
# generate model input
x_train, x_test, y_train, y_test = train_test_split(data[feature_names], data[target], test_size=0.2)

train_model_input  = {name: x_train[name].values for name in feature_names}
test_model_input = {name: x_test[name].values for name in feature_names}

In [8]:
# model
WD = WDL(linear_feature_columns=linear_feature_columns, 
         dnn_feature_columns=deep_feature_columns,
         dnn_hidden_units=(128,128),
         seed=2020,
         dnn_dropout=0.2,
         dnn_activation='relu',
         task='regression')                               # l2正则项用默认值

WD.compile(optimizer='adam', loss='mse', metrics=['mse'])

In [15]:
history = WD.fit(train_model_input, y_train, batch_size=256, epochs=1, verbose=True, validation_split=0.2, workers=multiprocessing.cpu_count(), )



In [16]:
prediction = WD.predict(test_model_input)

In [17]:
mse = round((mean_squared_error(y_test, prediction)),4)
rmse = mse ** 0.5
print('RMSE:',rmse)

RMSE: 0.8879752248796134
