In [5]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat,get_feature_names

In [7]:
#数据加载
data = pd.read_csv("movielens_sample.txt")
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']
print(data)

     user_id  movie_id  rating  timestamp  \
0       3299       235       4  968035345   
1       3630      3256       3  966536874   
2        517       105       4  976203603   
3        785      2115       3  975430389   
4       5848       909       5  957782527   
..       ...       ...     ...        ...   
195     1427      3596       3  974840560   
196     3868      1626       3  965855033   
197      249      2369       3  976730191   
198     5720       349       4  958503395   
199      877      1485       3  975270899   

                                           title                     genres  \
0                                 Ed Wood (1994)               Comedy|Drama   
1                           Patriot Games (1992)            Action|Thriller   
2          Bridges of Madison County, The (1995)              Drama|Romance   
3    Indiana Jones and the Temple of Doom (1984)           Action|Adventure   
4                          Apartment, The (1960)               C

In [9]:
# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
print(data)

     user_id  movie_id  rating  timestamp  \
0        107        12       4  968035345   
1        123       169       3  966536874   
2         12         6       4  976203603   
3         21       112       3  975430389   
4        187        45       5  957782527   
..       ...       ...     ...        ...   
195       46       176       3  974840560   
196      131        89       3  965855033   
197        4       125       3  976730191   
198      181        15       4  958503395   
199       25        86       3  975270899   

                                           title                     genres  \
0                                 Ed Wood (1994)               Comedy|Drama   
1                           Patriot Games (1992)            Action|Thriller   
2          Bridges of Madison County, The (1995)              Drama|Romance   
3    Indiana Jones and the Temple of Doom (1984)           Action|Adventure   
4                          Apartment, The (1960)               C

In [32]:
# 计算每个特征中的 不同特征值的个数???????/不太懂
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]
#x=SparseFeat(sparse_features[0], data[sparse_features[0]].nunique())
#print(x)
#print(fixlen_feature_columns)
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
print(feature_names)

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']


In [33]:
# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}
print(train_model_input)

{'movie_id': array([143,  17,  39, 103,  49,  98, 161, 136,  90,  14, 168,  86,  89,
       113,  53,  32, 112,  79, 165, 184,  40,   9,  43, 145, 156,  72,
         6,  38,  19,  48,  75,  45,  35,   4,   7, 101, 108, 120, 125,
        28,  84, 110, 107,  92,  78, 173,  81,   1,  10, 154,  58, 131,
        82, 116,  62, 122, 147,  44, 118,  66, 159, 111, 109, 134,  11,
       105, 155, 106,  80,  25,  55, 166,  18, 167,  56,  30, 151,  65,
       177, 169,  12, 176,  33, 135,  77,  99,  69, 182, 141,  51, 170,
       115,  31,  88, 169, 164,  52, 104, 142, 152, 102, 173, 186, 117,
        46, 178,   9, 146,   5,  97,  60, 170, 137, 126, 144,  72, 179,
        34,   8, 183,  66,  41, 172, 130,  94, 149,  87,  91, 100,   2,
        29,  57,  85, 124, 129, 148,  27, 180,  15,  20, 171,  95, 150,
       181,  21,  96, 126,  23,  13,  27,  63, 112,   3,  76,  67, 114,
       128,  73,  61, 153], dtype=int64), 'user_id': array([114, 165,   3,  18,  76,  35,  89,  66,  60,  23, 192,  25, 131

In [28]:
# 使用DeepFM进行训练
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2 )





Train on 128 samples, validate on 32 samples


In [34]:
# 使用DeepFM进行预测
pred_ans = model.predict(test_model_input, batch_size=256)
# 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

test RMSE 3.6968229603268803
