# 商品冷启动：为老用户推荐新商品

## 实际思路
1. 计算新物品与老物品在特征上的相似程度
2. 找到用户喜欢的老物品
3. 找到与老物品相似的新物品，推荐给用户
4. 精度和召回：用户对新商品打分了之后，比较为用户推荐的新商品和用户对新商品的喜好程度

## 代码思路
1. 特征矩阵
2. 相似度矩阵
3. 为用户推荐新商品
4. 精度召回：用户对新商品打分了之后，比较为用户推荐的新商品和用户对新商品的喜好程度

In [2]:
import numpy as np
import pandas as pd

#### 读取原商品

In [3]:
df_movie_old = pd.read_csv('./data/movie_old.csv',usecols=[0,2])
df_movie_old.head()

Unnamed: 0,movie_id,genres
0,4,Comedy|Drama|Romance
1,8,Adventure|Children
2,9,Action
3,12,Comedy|Horror
4,13,Adventure|Animation|Children


### 统计原物品中所有的特征

In [4]:
# 新建一个set保存特征
total_genres = set()
# 遍历每一行的特征
for genres in df_movie_old['genres']:   
    # | 集合取并
    total_genres |= set(genres.split('|'))
# 集合转列表
total_genres = list(total_genres)
total_genres

['Sci-Fi',
 'Crime',
 'Action',
 'Adventure',
 'Western',
 'Thriller',
 'Animation',
 'Children',
 'IMAX',
 'Drama',
 'Romance',
 'Film-Noir',
 'Fantasy',
 'Mystery',
 'War',
 'Horror',
 'Comedy',
 'Documentary',
 'Musical']

In [5]:
len(total_genres)

19

#### 读取新物品

In [6]:
df_movie_new = pd.read_csv('./data/movie_new.csv',usecols=[0,2])
df_movie_new.head()

Unnamed: 0,movie_id,genres
0,14,Drama
1,18,Comedy
2,30,Crime|Drama
3,35,Drama|Romance
4,41,Drama|War


### 建立原物品和新物品的特征矩阵

In [7]:
# 原物品id-->index
movie_old_id_to_index_dict = {}
# 新物品id-->index
movie_new_id_to_index_dict = {}

# 初始化原物品和新物品的特征矩阵
movie_old_genres_array = np.zeros(
    shape=(len(df_movie_old),len(total_genres))
)
movie_new_genres_array = np.zeros(
    shape=(len(df_movie_new),len(total_genres))
)

#### 建立旧物品的特征矩阵
# 遍历每一个旧物品
for index in range(len(df_movie_old)):        
    # 根据索引取到movie_id
    movie_id = df_movie_old.iloc[index]['movie_id']
    # 保存{ movie_id ：index } 键值对
    movie_old_id_to_index_dict[movie_id] = index
    # 取每一个旧物品的所有特征
    genres = df_movie_old.iloc[index]['genres'].split('|')
    
    # 创建特征行向量
    line_data = np.zeros(shape=len(total_genres))
    for i in range(len(total_genres)):
        # 如果旧物品有某一特征
        if total_genres[i] in genres:
            # 索引列取1，表示旧物品特征矩阵中，该物品有该特征
            line_data[i] = 1
    # 该旧物品有的所有特征行向量，赋值给index行
    movie_old_genres_array[index] = line_data

    
#### 建立新物品的特征矩阵
# 遍历每一个新物品
for index in range(len(df_movie_new)):        
    # 根据索引取到movie_id
    movie_id = df_movie_new.iloc[index]['movie_id']
    # 保存{ movie_id ：index } 键值对
    movie_new_id_to_index_dict[movie_id] = index
    # 取每一个新物品的所有特征
    genres = df_movie_new.iloc[index]['genres'].split('|')
    
    # 创建特征行向量
    line_data = np.zeros(shape=len(total_genres))
    for i in range(len(total_genres)):
        # 如果新物品有某一特征
        if total_genres[i] in genres:
            # 索引列取1，表示新物品特征矩阵中，该物品有该特征
            line_data[i] = 1
    # 该新物品有的所有特征行向量，赋值给index行
    movie_new_genres_array[index] = line_data
    

### 原物品到新物品的相似度矩阵

In [8]:
# array 2610个旧物品，每个旧物品有多少个特征，和就为多少
movie_old_genres_column_sum_array = np.sum(movie_old_genres_array,axis=1)
# array 856个新物品，每个新物品有多少个特征，和就为多少
movie_new_genres_column_sum_array = np.sum(movie_new_genres_array,axis=1)

In [9]:
movie_old_genres_column_sum_array.shape

(2610,)

In [10]:
# 初始化相似度矩阵，行为旧物品数，列为新物品数
movie_sim_array = np.zeros(
    shape=(len(df_movie_old),len(df_movie_new))
)
for index in range(len(df_movie_old)):
    # 一个旧物品特征向量，与其他所有新物品的特征向量矩阵相乘
    # 特征数为19，新物品数为856
    # v1.shape = 1 x 856
    v1 = np.dot(
        # dot ( 1 x 19，(856 x 19)的矩阵转置 ) = 1 x 856
        movie_old_genres_array[index],movie_new_genres_array.T
    )
    
    # 一个物品特征向量和，与其他所有新物品的特征向量和构成的列表相乘
    # np.sqrt对每个元素开方
    # v2.shape = 1 x 856
    v2 = np.around(np.sqrt(
        movie_old_genres_column_sum_array[index] * movie_new_genres_column_sum_array),3)
    
    # v1 / v2对应的每个元素相除，v1 / v2 = 1 x 856
    movie_sim_array[index] = np.around(v1 / v2,2)

In [11]:
movie_sim_array

array([[0.58, 0.58, 0.41, ..., 0.41, 0.58, 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.71, 0.  , 0.  ],
       ...,
       [0.71, 0.  , 0.5 , ..., 0.5 , 0.71, 0.  ],
       [0.  , 0.  , 0.  , ..., 0.41, 0.  , 0.58],
       [0.71, 0.  , 0.5 , ..., 0.5 , 0.71, 0.  ]])

#### 导入原物品的打分

In [12]:
df_rating_old = pd.read_csv('./data/rating_old.csv')
df_rating_old.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1009,3.5
1,1,1243,3.0
2,1,1848,3.5
3,1,1920,3.5
4,1,2118,4.0


#### 变更id为index

In [13]:
df_rating_old['movie_id'] = df_rating_old['movie_id'].apply(lambda movie_id : movie_old_id_to_index_dict[movie_id])
df_rating_old.head()

Unnamed: 0,user_id,movie_id,rating
0,1,319,3.5
1,1,370,3.0
2,1,518,3.5
3,1,538,3.5
4,1,623,4.0


In [14]:
df_rating_old.columns = ['user_id','movie_index','rating']

### 根据用户喜欢的原物品，生成新物品的推荐

In [15]:
user_recommend = {}

for index,(user_id,groupby_userid) in enumerate(df_rating_old.groupby('user_id')):
    # 1.一个用户对物品的打分取均值并降序排序
    movies_rating = groupby_userid.groupby('movie_index')['rating'].mean().sort_values(ascending=False)
    # 2.取评分大等于4分的商品的索引
    user_fav = movies_rating[
        movies_rating >= 4
    ].index.tolist()
    
    # 3.取新旧物品相似度 >= 0.85 的新物品
    #   np.where返回一个稀疏矩阵二元组（两个列表：旧物品索引、新物品索引）
    #   [1]取到新物品的索引
    #   [:100]最多为用户推荐100个商品
    #   set用于去重
    user_recommend[user_id] = set(np.where(
        movie_sim_array[user_fav] >= 0.85)[1].tolist()[:100])
    # 每100次，打印执行用户索引
    if index % 100 == 0:print(index,end='..')
        

0..100..200..300..400..500..600..700..800..900..1000..1100..1200..1300..1400..1500..1600..1700..1800..1900..2000..2100..2200..2300..2400..2500..2600..2700..2800..2900..3000..3100..3200..3300..3400..3500..3600..3700..3800..3900..4000..4100..4200..4300..4400..4500..4600..4700..4800..4900..5000..5100..5200..5300..5400..5500..5600..5700..5800..5900..6000..6100..6200..6300..6400..6500..6600..6700..6800..6900..7000..7100..7200..7300..7400..7500..7600..7700..7800..7900..8000..8100..8200..8300..8400..8500..8600..8700..8800..8900..9000..9100..9200..9300..9400..9500..9600..9700..9800..9900..10000..10100..10200..10300..10400..10500..10600..10700..10800..10900..11000..11100..11200..11300..11400..11500..11600..11700..11800..11900..12000..12100..12200..12300..12400..12500..12600..12700..12800..12900..13000..13100..13200..13300..13400..13500..13600..13700..13800..13900..14000..14100..14200..14300..14400..14500..14600..14700..14800..14900..15000..15100..15200..15300..15400..15500..15600..15700..15800.

### 取用户对新物品的实际打分

In [16]:
df_rating_new = pd.read_csv('./data/rating_new.csv')
df_rating_new.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1217,3.5
1,1,1348,3.5
2,1,1350,3.5
3,1,2138,4.0
4,1,2143,4.0


#### id变index

In [17]:
df_rating_new['movie_id'] = df_rating_new['movie_id'].apply(lambda movie_id : movie_new_id_to_index_dict[movie_id])
df_rating_new.columns = ['user_id','movie_index','rating']
df_rating_new.head()

Unnamed: 0,user_id,movie_index,rating
0,1,148,3.5
1,1,168,3.5
2,1,169,3.5
3,1,259,4.0
4,1,261,4.0


### 得到用户真正喜欢的新物品

In [18]:
user_fav = {}

for index,(user_id,groupby_userid) in enumerate(df_rating_new.groupby('user_id')):
    movies_rating = groupby_userid.groupby('movie_index')['rating'].mean().sort_values(ascending=False)
    movie_indexs = set(movies_rating[
        movies_rating >= 3
    ].index.tolist())
    user_fav[user_id] = movie_indexs
    if index % 100 == 0:print(index,end='..')

0..100..200..300..400..500..600..700..800..900..1000..1100..1200..1300..1400..1500..1600..1700..1800..1900..2000..2100..2200..2300..2400..2500..2600..2700..2800..2900..3000..3100..3200..3300..3400..3500..3600..3700..3800..3900..4000..4100..4200..4300..4400..4500..4600..4700..4800..4900..5000..5100..5200..5300..5400..5500..5600..5700..5800..5900..6000..6100..6200..6300..6400..6500..6600..6700..6800..6900..7000..7100..7200..7300..7400..7500..7600..7700..7800..7900..8000..8100..8200..8300..8400..8500..8600..8700..8800..8900..9000..9100..9200..9300..9400..9500..9600..9700..9800..9900..10000..10100..10200..10300..10400..10500..10600..10700..10800..10900..11000..11100..11200..11300..11400..11500..11600..11700..11800..11900..12000..12100..12200..12300..12400..12500..12600..12700..12800..12900..13000..13100..13200..13300..13400..13500..13600..13700..13800..13900..14000..14100..14200..14300..14400..14500..14600..14700..14800..14900..15000..15100..15200..15300..15400..15500..15600..15700..15800.

### 比较为用户推荐的新商品和用户对新商品的打分
#### 计算准确率和召回率

In [19]:
union_quantity = 0      # 为用户推荐的，用户也喜欢的个数   
recommend_quantity = 0  # 为用户推荐的商品总数
fav_quantity = 0        # 用户喜欢的商品总数


for user_id in user_recommend.keys():
    if user_id in user_fav.keys():
        union_quantity += len(
            user_recommend[user_id] & user_fav[user_id]
        )
        recommend_quantity += len(user_recommend[user_id])
        fav_quantity += len(user_fav[user_id])

print('准确率',union_quantity / recommend_quantity)
print('召回率',union_quantity / fav_quantity)

准确率 0.0721781590351196
召回率 0.16878049831774863
