# 1. 先来看看数据

训练数据基于 `MovieLens 1M`

## 1.1 电影数据

先来看下电影数据：

In [1]:
import pandas as pd

movie_header = ['电影id', '电影名', '电影类型']
movie_data = pd.read_csv('./ml-1m/movies.dat', delimiter='::', header=None, names=movie_header, engine='python')
movie_data.head()

Unnamed: 0,电影id,电影名,电影类型
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
print('=========================================')
print('电影详情')
print('=========================================')
print('共有{}部电影'.format(len(movie_data)))
print('- - - - - - - - - - - - - - - - - - - - -')
def getAllMovieTypes(data):
    types = set([])
    for tps in data.iloc[:,2]:
        types.update(tps.split('|'))
    return list(types)
movie_types = getAllMovieTypes(movie_data)
print('共有{}种电影类型:'.format(len(movie_types)))
print(movie_types)

电影详情
共有3883部电影
- - - - - - - - - - - - - - - - - - - - -
共有18种电影类型:
['Action', 'Comedy', 'Horror', 'Mystery', "Children's", 'Sci-Fi', 'Film-Noir', 'Crime', 'Animation', 'Adventure', 'Romance', 'Thriller', 'War', 'Musical', 'Fantasy', 'Drama', 'Documentary', 'Western']


## 1.2 用户数据

再来看下用户数据

In [3]:
user_header = ['用户id', '性别', '年龄', '职业', '邮编']
user_data = pd.read_csv('./ml-1m/users.dat', delimiter='::', header=None, names=user_header, engine='python')
print('共{}个用户'.format(len(user_data)))
user_data.head()

共6040个用户


Unnamed: 0,用户id,性别,年龄,职业,邮编
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


其中：
- 性别：F(Female)表示女性, M(Male)表示男性
- 年龄(6个分段)：
    - 1："18岁以下"
    - 18: "18-24岁"
    - 25: "25-34岁"
    - 35: "35-44岁"
    - 45: "45-49岁"
    - 50: "50-55岁"
    - 56: "56岁以上"
- 职业(21种职业):
    - 0: "other" or not specified
    - 1: "academic/educator"
    - 2: "artist"
    - 3: "clerical/admin"
    - 4: "college/grad student"
    - 5: "customer service"
    - 6: "doctor/health care"
    - 7: "executive/managerial"
    - 8: "farmer"
    - 9: "homemaker"
    - 10: "K-12 student"
    - 11: "lawyer"
    - 12: "programmer"
    - 13: "retired"
    - 14: "sales/marketing"
    - 15: "scientist"
    - 16: "self-employed"
    - 17: "technician/engineer"
    - 18: "tradesman/craftsman"
    - 19: "unemployed"
    - 20: "writer"
- 邮编：
    不关注

## 1.3 评分数据
最后看下评分数据

In [4]:
rating_header = ['用户id', '电影id', '评分', '时间戳']
rating_data = pd.read_csv('./ml-1m/ratings.dat', delimiter='::', header=None, names=rating_header, engine='python')
rating_data.head()

Unnamed: 0,用户id,电影id,评分,时间戳
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# 2. 数据清洗

## 2.1 一些变量

先定义一些变量，方便后续数据清洗使用:

In [5]:
def getMovieDetailDict(data):
    '''获取movie详情字典.
    
    格式：
    {
        movie_id: {      # 电影id, int
           movie_title,  # 电影名, string
           movie_types   # 电影类型,list
        }
    }
    
    '''
    ret = {}
    for index, row in data.iterrows():
        ret[row['电影id']] = {
            "movie_title": row['电影名'][:-7],  # 忽略年份
            "movie_types": row['电影类型'].split('|')
        }
    
    return ret


def getUserDetailDict(data):
    '''获取用户详情字典.
    
    格式:
    {
        user_id: {
            gender,   # 性别, string
            age,      # 年龄段, int
            job,      # 职业, int
        }
    }
    '''
    ret = {}
    for index, row in data.iterrows():
        ret[row['用户id']] = {
            "gender": row['性别'],
            "age": row['年龄'],
            "job": row['职业']
        }
    
    return ret

def getMovieTitleDict(data):
    '''获取电影名词典和最长的电影名长度.
    
    将所有的电影的电影名看作一个词袋（word bag），例如： "love story"长度为2
    
    格式:
    {
        term: index
    }
    '''
    index = 1
    max_len = 0
    ret = {}
    for _, row in data.iterrows():
        title = row['电影名']
        terms = title.split(' ')
        max_len = max(max_len, len(terms))
        for term in terms:
            if term not in ret:
                ret[term]=index
                index+=1
            
    return ret, max_len

n_user       = max(user_data['用户id'])       # 用户数目（最大用户id）
n_movie      = max(movie_data['电影id'])      # 电影数目（最大电影id）

movie_detail_dict = getMovieDetailDict(movie_data) # 电影详情
user_detail_dict  = getUserDetailDict(user_data)   # 用户详情

user_age_sep    = [1, 18, 25, 35, 45, 50, 56]        # 用户年龄段
user_job_sep    = [i for i in range(21)]        # 用户工作类别
user_gender_sep = ['F', 'M']                  # 用户性别类别

movie_title_dict, title_max_len = getMovieTitleDict(movie_data)   # 电影名字典, 电影名最大长度

## 2.2 一些处理

为了使得数据便于训练，我们将做如下处理（one-hot编码）：
1. 将性别映射为0/1
2. 将用户6个年龄段映射为0-5
3. 用户工作刚好为0-20，无需处理
4. 将电影名映射为词袋索引
5. 将电影类型映射为类型索引（由于电影类型不止一种，所以是一个列表）

新的数据将输出为一个新的`pandas.DataFrame`。

清洗后的数据字段有：

|字段名|描述 |
|:----:|:----:|
|user_id|用户id |
|movie_id|电影id|
|user_gender|用户性别(0/1)|
|user_age|用户年龄段(0-6)|
|user_job|用户工作(0-20)|
|movie_title|电影标题(不定长,用0填充)|
|movie_types|电影类型(不定长,用0填充)|
|rank|用户对电影的评分|

In [6]:
def padding_list(lis, padding_size):
    'padding数据...'
    for _ in range(padding_size-len(lis)):
        lis.append(0)
    return lis

dataset_header = ['user_id', 'movie_id', 'user_gender', 'user_age', 'user_job', 'movie_title', 'movie_types', 'rank']

user_id_series     = rating_data['用户id']
movie_id_series    = rating_data['电影id']
user_gender_series = user_id_series.map(lambda user_id: user_gender_sep.index(user_detail_dict[user_id]['gender']))
user_age_series    = user_id_series.map(lambda user_id: user_age_sep.index(user_detail_dict[user_id]['age']))
user_job_series    = user_id_series.map(lambda user_id: user_detail_dict[user_id]['job'])
movie_title_series = movie_id_series.map(lambda movie_id: padding_list([movie_title_dict[term] for term in movie_detail_dict[movie_id]['movie_title'].split(' ')], padding_size=title_max_len))
movie_types_series = movie_id_series.map(lambda movie_id: padding_list([movie_types.index(tp)+1 for tp in movie_detail_dict[movie_id]['movie_types']], padding_size=len(movie_types)))
rank_series        = rating_data['评分']

# 创建数据集
dataset = pd.concat(
    [
        user_id_series.rename('user_id'),
        movie_id_series.rename('movie_id'),
        user_gender_series.rename('user_gender'),
        user_age_series.rename('user_age'),
        user_job_series.rename('user_job'),
        movie_title_series.rename('movie_title'),
        movie_types_series.rename('movie_types'),
        rank_series.rename('rank')
    ],
    axis=1
)

# 保存数据到本地
import pickle as pkl
pkl.dump(dataset, open('./data.p', 'wb'))
dataset.head(n=100)   

Unnamed: 0,user_id,movie_id,user_gender,user_age,user_job,movie_title,movie_types,rank
0,1,1193,0,0,10,"[1164, 2139, 719, 13, 2140, 2141, 0, 0, 0, 0, ...","[16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",5
1,1,661,0,0,10,"[1270, 20, 13, 1271, 1272, 0, 0, 0, 0, 0, 0, 0...","[9, 5, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",3
2,1,914,0,0,10,"[573, 168, 1538, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[14, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",3
3,1,3408,0,0,10,"[4724, 4725, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",4
4,1,2355,0,0,10,"[3482, 1395, 460, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[9, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5
...,...,...,...,...,...,...,...,...
95,2,2490,1,6,16,"[3625, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",3
96,2,1834,1,6,16,"[2893, 2894, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[16, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",4
97,2,3471,1,6,16,"[330, 4778, 12, 13, 2171, 2086, 0, 0, 0, 0, 0,...","[16, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",5
98,2,589,1,6,16,"[1135, 155, 934, 1136, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 6, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",4


In [7]:
max(dataset['movie_title'])

[5290, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

## 2.3 PyTorch数据加载模型构建

数据使pytorch的 Dataset 进行动态加载，为此需要构建 Dataset类：

In [8]:
from torch.utils.data import Dataset
import torch

class MovieRatingDataset(Dataset):
    
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        
        # user data
        uid    = self.df.iloc[idx, 0]
        gender = self.df.iloc[idx, 2]
        age    = self.df.iloc[idx, 3]
        job    = self.df.iloc[idx, 4]
        
        # movie data
        mid    = self.df.iloc[idx, 1]
        mtitle = self.df.iloc[idx, 5]
        mtype  = self.df.iloc[idx, 6]
        
        rank   = torch.FloatTensor([self.df.iloc[idx, 7]])
        
        user_input = {
            'uid':    torch.LongTensor([uid]),
            'gender': torch.LongTensor([gender]),
            'age':    torch.LongTensor([age]),
            'job':    torch.LongTensor([job]),
        }
        
        movie_input = {
            'mid':    torch.LongTensor([mid]),
            'mtitle': torch.LongTensor([mtitle]),
            'mtype':  torch.LongTensor([mtype]).view(-1),
        }
        
        sample = {
            'user_input':  user_input,
            'movie_input': movie_input,
            'target':      rank
        }
        
        return sample

# 3. 模型构建

模型如图所示：

<img src=".\model.001.jpeg" alt="1" style="zoom:70%;" />

总体分为两个通道（channel）：
- 用户通道
- 电影通道

用户通道采用 嵌入层+全连接层 的形式  
电影通道采用 嵌入层+全连接层+文本卷积网络的形式

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RecModel(nn.Module):
    def __init__(
        self,
        n_user,               # 用户数
        n_movie,              # 电影数
        n_age,                # 年龄段数
        n_job,                # 工作数
        n_movie_type,         # 电影类型数目
        n_movie_title_dict,   # 电影名字典大小
        n_movie_title_maxlen, # 最长的电影名长度
        cnn_kernel_size_list, # 文本卷积核窗口宽度列表（文本卷积核长度默认等于embeding_size）
        cnn_kernel_num,       # 文本卷积核个数
        device,               # 设备
        embed_dim=32, 
        fc_size=200
    ):
        super().__init__()
        
        self.fc_size = fc_size
        self.cnn_kernel_num = cnn_kernel_num
        self.embed_dim = embed_dim
        
        # ----------------------------------------- 用户通道 ---------------------------------------
        # user embedding
        self.embedding_uid    = nn.Embedding(num_embeddings=n_user, embedding_dim=embed_dim)
        self.embedding_gender = nn.Embedding(num_embeddings=2, embedding_dim=embed_dim//2)
        self.embedding_age    = nn.Embedding(num_embeddings=n_age, embedding_dim=embed_dim//2)
        self.embedding_job    = nn.Embedding(num_embeddings=n_job, embedding_dim=embed_dim//2)
        
        # user embed to fc: the first fully-connected layers
        self.fc_uid    = nn.Linear(in_features=embed_dim, out_features=embed_dim)
        self.fc_gender = nn.Linear(in_features=embed_dim//2, out_features=embed_dim)
        self.fc_age    = nn.Linear(in_features=embed_dim//2, out_features=embed_dim)
        self.fc_job    = nn.Linear(in_features=embed_dim//2, out_features=embed_dim)
        
        
        # concated embeddings to fc: the second fully-connected layer (n_batch,128) => (n_batch,200)
        self.fc_user_combined = nn.Linear(in_features=4*embed_dim, out_features=fc_size)
        
        # ------------------------------------------ 电影通道 --------------------------------------
        # movie embeddings
        self.embedding_mid   = nn.Embedding(num_embeddings=n_movie, embedding_dim=embed_dim)
        self.embedding_mtype = nn.EmbeddingBag(
                                    num_embeddings=n_movie_type,
                                    embedding_dim=embed_dim,
                                    padding_idx=0
                               )
        
        self.fc_mid   = nn.Linear(in_features=embed_dim, out_features=embed_dim)
        self.fc_mtype = nn.Linear(in_features=embed_dim, out_features=embed_dim)
        
        # (n_batch, 64) => (n_batch,200)
        self.fc_mid_mtype_combined = nn.Linear(in_features=embed_dim*2, out_features=fc_size)
        
        # text CNN: 文本卷积网络
        self.embedding_mtitle = nn.Embedding(num_embeddings=n_movie_title_dict, embedding_dim=embed_dim, padding_idx=0)
        
        # 电影名向量化后维度: (n_batch, n_movie_titile_maxlen, embed_dim)
        self.movie_title_CNN = [
            nn.Sequential(
                nn.Conv2d(in_channels=1, out_channels=cnn_kernel_num, kernel_size=(k, embed_dim)),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=(n_movie_title_maxlen-k+1, 1), stride=(1,1))
            ).to(device) for k in cnn_kernel_size_list
        ]
        
        # movie channel concat
        self.fc_movie_combine = nn.Linear(embed_dim*2 + cnn_kernel_num * len(cnn_kernel_size_list), fc_size)
        
        
        self.output_fc = nn.Linear(fc_size*2, 1)
        
    def forward(self, user_input, movie_input):
        # unzip train_data
        uid    = user_input['uid']
        gender = user_input['gender']
        age    = user_input['age']
        job    = user_input['job']
        
        mid    = movie_input['mid']
        mtitle = movie_input['mtitle']
        mtype  = movie_input['mtype']
        
        uid, gender, age, job, mid, mtitle, mtype = \
            uid.to(device), gender.to(device), age.to(device), job.to(device), mid.to(device), mtitle.to(device), mtype.to(device)
            
        
        # user channel forward
        feature_uid    = F.relu(self.fc_uid(self.embedding_uid(uid)))
        feature_gender = F.relu(self.fc_gender(self.embedding_gender(gender)))
        feature_age    = F.relu(self.fc_age(self.embedding_age(age)))
        feature_job    = F.relu(self.fc_job(self.embedding_job(job)))
        
        # feature user (n_batch, 1, 200)
        
        feature_user = torch.tanh(
            self.fc_user_combined(
                torch.cat([feature_uid, feature_gender, feature_age, feature_job], 2)
            )
        ).view(-1, 1, 200)
        
        
        # movie channel forward
        feature_mid   = F.relu(self.fc_mid(self.embedding_mid(mid)))
        feature_mtype = self.embedding_mtype(mtype)
        
        # text CNN forward 文本卷积网络正向传播
        feature_img = self.embedding_mtitle(mtitle)
        flatten_tensors = []
        for conv in self.movie_title_CNN:
            flatten_tensors.append(conv(feature_img).view(-1,1,self.cnn_kernel_num))
            
        feature_flattern_dropout = F.dropout(torch.cat(flatten_tensors, 2), p=0.5)  # n_batch x embed_dim
        
        # feature_movie n_batch x 1 x 200
        feature_movie = torch.tanh(
            self.fc_movie_combine(
                torch.cat([feature_mid.view(-1,1,self.embed_dim), feature_mtype.view(-1,1,self.embed_dim), feature_flattern_dropout], 2)
            )
        )
        
        # feature user/movie combined
        output = self.output_fc(torch.cat([feature_user, feature_movie], 2)).view(-1,1)
        return output, feature_user, feature_movie
        
        
        
        

# 4. 模型训练

## 4.1 数据加载器和模型定义

In [20]:
from torch.utils.data import DataLoader

datasets = MovieRatingDataset(df=dataset)
dataloader = DataLoader(datasets, batch_size=32, shuffle=True)

device = torch.device("cuda:0")

model = RecModel(
        n_user=n_user+1,          
        n_movie=n_movie+1,         
        n_age=7,           
        n_job=21,             
        n_movie_type=18+1,         
        n_movie_title_dict=len(movie_title_dict)+1,   
        n_movie_title_maxlen=16, 
        cnn_kernel_size_list={2,3,4,5}, 
        cnn_kernel_num=8, 
        device=device
).to(device)

## 4.2 训练

In [23]:
from tensorboardX import SummaryWriter
import torch.optim as optim


if torch.cuda.is_available():
    torch.cuda.empty_cache()
    
    
learning_rate = 0.0001
n_epochs = 1

# 损失函数, 均方误差
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

losses = []
writer = SummaryWriter("log")

for epoch in range(n_epochs):
    loss_all = 0
    for i_batch, sample_batch in enumerate(dataloader):
        user_input  = sample_batch['user_input']
        movie_input = sample_batch['movie_input']
        target      = sample_batch['target'].to(device)
        model.zero_grad()
        
        tag_rank, _, _ = model(user_input, movie_input)
        loss = loss_function(tag_rank, target)
        if i_batch%20 == 0:
            writer.add_scalar('data/loss', loss, i_batch*20)
            print('loss: {}'.format(loss),end='\r')
        loss_all+=loss
        loss.backward()
        optimizer.step()
    print("Epoch {}:\t loss: {}".format(epoch, loss_all))

writer.export_scalars_to_json("./test.json")
writer.close()

Epoch 0:	 loss: 33757.4296875


In [129]:
device

device(type='cuda', index=0)

In [134]:
embeding = nn.Embedding(10, 3)
inputs = torch.LongTensor([[1,2,3], [4,5,6]])
embeding(inputs).shape

torch.Size([2, 3, 3])