In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

## 1.数据提取&处理

In [11]:
train_data_file = './data/train.csv'
test_data_file = './data/test.csv'
train_data = pd.read_csv(train_data_file)
test_data = pd.read_csv(test_data_file)

In [12]:
train = train_data   # 备份数据

In [13]:
# 用户的平均评分
Mean = train.groupby('uid',as_index=False)['score'].mean()
train = pd.merge(train,Mean,on='uid')
# 用户的评分 - 其平均分：用户平衡不同用户打分喜好
train['adg_score'] = train['score_x'] - train['score_y']
# 更改列名
train.columns = ['uid','iid','score','time','mean_score','adg_score']

In [16]:
train.sample(n=5)

Unnamed: 0,uid,iid,score,time,mean_score,adg_score
34926,51,868,2,16,3.540984,-1.540984
99192,176,2035,4,1199,3.490741,0.509259
57129,97,9686,4,971,2.96181,1.03819
48059,83,1498,4,13,4.008403,-0.008403
95676,169,8550,2,1303,2.884584,-0.884584


## 2.数据处理

In [17]:
# 做数据透视，纵列为用户id，横列为商品id，表示用户与商品的交互，值为adg_score
final = pd.pivot_table(train,values='adg_score',index='uid',columns='iid')
final.head()

iid,0,1,2,3,5,6,7,8,9,10,...,14410,14437,14445,14454,14461,14514,14537,14556,14560,14709
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.867857,,,,,,,1.132143,,,...,,,,,,,,,,
1,,0.980392,,,,,,,,,...,,,,,,,,,,
2,,,0.359477,,,,,,,,...,,,,,,,,,,
3,,,-2.383408,,,,,,,,...,,,,,,,,,,
4,,,-1.054514,,,,,-1.054514,-1.054514,,...,,,,,,,,,,


In [57]:
# 对于其他用户没有交互的商品的评分采用整体的平均（商品、用户）简单处理
final_movie = final.fillna(final.mean(axis=0))   # 用户在行上的平均值
final_user = final.apply(lambda row: row.fillna(row.mean()), axis=1)   # 用户在列上的平均值

In [67]:
# 余弦相似度计算
# user similarity on replacing NAN by item(movie) avg
cosine1 = cosine_similarity(final_movie)
np.fill_diagonal(cosine1, 0)
similarity_with_movie = pd.DataFrame(cosine1, index=final_movie.index)
similarity_with_movie.columns = final_movie.index

# user similarity on replacing NAN by user avg
cosine2 = cosine_similarity(final_user)
np.fill_diagonal(cosine2, 0 )
similarity_with_user = pd.DataFrame(cosine2,index=final_user.index)
similarity_with_user.columns=final_user.index

In [69]:
similarity_with_user

uid,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000000,0.011713,0.029777,0.080816,-0.027263,0.079243,0.008387,-0.038534,0.039418,0.041351,...,0.109836,0.012404,0.033222,0.017901,0.042847,0.068592,0.022900,0.001582,0.018693,0.037317
1,0.011713,0.000000,-0.003631,-0.023649,0.044672,-0.002073,0.050443,-0.018509,0.033909,0.025426,...,0.033365,-0.010599,0.033200,-0.006696,-0.011527,0.006666,0.037662,0.002175,0.019993,0.007045
2,0.029777,-0.003631,0.000000,0.001049,0.063194,0.036072,0.110278,0.024360,0.026625,0.084658,...,-0.014687,-0.006120,0.064099,0.013963,0.035046,-0.033565,0.065656,0.026158,0.038996,-0.032495
3,0.080816,-0.023649,0.001049,0.000000,-0.003091,0.033521,0.040278,-0.003245,-0.026225,0.071727,...,0.186634,0.029626,0.056189,0.041483,0.013343,0.107541,0.023092,0.009935,0.048713,0.072807
4,-0.027263,0.044672,0.063194,-0.003091,0.000000,0.003940,0.093863,0.095673,0.071291,0.075405,...,0.005872,-0.038884,0.055699,0.030208,0.011240,-0.014824,-0.012448,0.018618,0.050475,-0.026133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,0.068592,0.006666,-0.033565,0.107541,-0.014824,0.050781,-0.025625,-0.019083,-0.014125,0.018468,...,0.103463,0.060995,0.041462,0.050963,0.045557,0.000000,0.047892,-0.002656,0.034219,0.064557
190,0.022900,0.037662,0.065656,0.023092,-0.012448,0.025414,-0.023948,-0.013328,-0.025716,0.061688,...,0.046684,0.026267,0.108570,0.011026,0.054791,0.047892,0.000000,0.039338,0.077027,0.049079
191,0.001582,0.002175,0.026158,0.009935,0.018618,0.007881,0.020018,0.033875,0.006785,0.020279,...,0.027511,0.012269,0.039713,0.050065,0.037058,-0.002656,0.039338,0.000000,0.078165,-0.005652
192,0.018693,0.019993,0.038996,0.048713,0.050475,-0.012482,0.038892,0.048243,0.034736,0.057872,...,0.009447,0.023969,0.087689,0.039338,0.011993,0.034219,0.077027,0.078165,0.000000,0.033159


In [72]:
# 检验相似度是否有效
def get_user_similar_movies(user1, user2 ):
    common_movies = train[train.uid == user1].merge(
    train[train.uid == user2],
    on = "iid",
    how = "inner" )
    return common_movies

a = get_user_similar_movies(0,1)
a[['iid','uid_x','score_x','mean_score_x','uid_y','score_y','mean_score_y']].head()

Unnamed: 0,iid,uid_x,score_x,mean_score_x,uid_y,score_y,mean_score_y
0,735,0,4,2.867857,1,3,3.019608
1,886,0,3,2.867857,1,4,3.019608
2,1140,0,1,2.867857,1,3,3.019608
3,1340,0,4,2.867857,1,3,3.019608
4,1353,0,2,2.867857,1,2,3.019608


In [81]:
similarity_with_user

uid,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000000,0.011713,0.029777,0.080816,-0.027263,0.079243,0.008387,-0.038534,0.039418,0.041351,...,0.109836,0.012404,0.033222,0.017901,0.042847,0.068592,0.022900,0.001582,0.018693,0.037317
1,0.011713,0.000000,-0.003631,-0.023649,0.044672,-0.002073,0.050443,-0.018509,0.033909,0.025426,...,0.033365,-0.010599,0.033200,-0.006696,-0.011527,0.006666,0.037662,0.002175,0.019993,0.007045
2,0.029777,-0.003631,0.000000,0.001049,0.063194,0.036072,0.110278,0.024360,0.026625,0.084658,...,-0.014687,-0.006120,0.064099,0.013963,0.035046,-0.033565,0.065656,0.026158,0.038996,-0.032495
3,0.080816,-0.023649,0.001049,0.000000,-0.003091,0.033521,0.040278,-0.003245,-0.026225,0.071727,...,0.186634,0.029626,0.056189,0.041483,0.013343,0.107541,0.023092,0.009935,0.048713,0.072807
4,-0.027263,0.044672,0.063194,-0.003091,0.000000,0.003940,0.093863,0.095673,0.071291,0.075405,...,0.005872,-0.038884,0.055699,0.030208,0.011240,-0.014824,-0.012448,0.018618,0.050475,-0.026133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,0.068592,0.006666,-0.033565,0.107541,-0.014824,0.050781,-0.025625,-0.019083,-0.014125,0.018468,...,0.103463,0.060995,0.041462,0.050963,0.045557,0.000000,0.047892,-0.002656,0.034219,0.064557
190,0.022900,0.037662,0.065656,0.023092,-0.012448,0.025414,-0.023948,-0.013328,-0.025716,0.061688,...,0.046684,0.026267,0.108570,0.011026,0.054791,0.047892,0.000000,0.039338,0.077027,0.049079
191,0.001582,0.002175,0.026158,0.009935,0.018618,0.007881,0.020018,0.033875,0.006785,0.020279,...,0.027511,0.012269,0.039713,0.050065,0.037058,-0.002656,0.039338,0.000000,0.078165,-0.005652
192,0.018693,0.019993,0.038996,0.048713,0.050475,-0.012482,0.038892,0.048243,0.034736,0.057872,...,0.009447,0.023969,0.087689,0.039338,0.011993,0.034219,0.077027,0.078165,0.000000,0.033159


In [115]:
# 采用相邻用户的思路，对于特定用户，只取K个类似用户的集合。
def find_n_neighbours(df,n):
    df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False).iloc[:n].index,index=['top{}'.format(i) for i in range(1, n+1)]), axis=1)
    return df

sim_user_30_m = find_n_neighbours(similarity_with_user,190)
sim_user_30_m.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top181,top182,top183,top184,top185,top186,top187,top188,top189,top190
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,128,22,178,46,123,156,149,175,60,94,...,140,10,4,96,109,7,72,125,176,77
1,86,135,54,142,139,144,102,171,50,106,...,7,65,84,23,3,53,182,130,83,13
2,105,152,6,162,67,155,135,87,140,145,...,163,45,18,83,169,149,63,74,193,189
3,184,123,46,122,162,178,183,156,108,25,...,65,137,166,15,134,59,96,125,171,77
4,113,145,87,155,152,10,27,173,144,59,...,43,23,146,0,178,123,185,182,183,35


In [126]:
# 用similarity_with_movie计算得分
def User_item_score(user,item):
    a = sim_user_30_m[sim_user_30_m.index==user].values
    b = a.squeeze().tolist()
    c = final_movie.loc[:,item]
    d = c[c.index.isin(b)]
    f = d[d.notnull()]
    avg_user = Mean.loc[Mean['uid'] == user,'score'].values[0]
    index = f.index.values.squeeze().tolist()
    corr = similarity_with_movie.loc[user,index]
    fin = pd.concat([f, corr], axis=1)
    fin.columns = ['adg_score','correlation']
    fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
    nume = fin['score'].sum()
    deno = fin['correlation'].sum()
    final_score = avg_user + (nume/deno)
    return final_score

score = User_item_score(0,3)
print("user:{} to item:{} score is {}".format(0,2,score))

user:0 to item:2 score is 3.3885106620155256


In [124]:
# 用similarity_with_user计算得分
def User_item_score(user,item):
    # 提取user的相似用户
    a = sim_user_30_m[sim_user_30_m.index==user].values
    b = a.squeeze().tolist()
    # 提取商品对应的评分
    c = final_user.loc[:,item]
    # 提取相似用户中是否有对商品的评分
    d = c[c.index.isin(b)]
    f = d[d.notnull()]
    avg_user = Mean.loc[Mean['uid'] == user,'score'].values[0]
    index = f.index.values.squeeze().tolist()
    corr = similarity_with_user.loc[user,index]
    fin = pd.concat([f, corr], axis=1)
    fin.columns = ['adg_score','correlation']
    fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
    nume = fin['score'].sum()
    deno = fin['correlation'].sum()
    final_score = avg_user + (nume/deno)
    return final_score

score = User_item_score(0,3)
print("user:{} to item:{} score is {}".format(0,2,score))

user:0 to item:2 score is 3.0324706988576433


In [127]:
user = train['uid'].unique().tolist()
item = train['iid'].unique().tolist()

## 3.生成提交文件

In [128]:
def User_item_score(user,item):
    # 提取user的相似用户
    a = sim_user_30_m[sim_user_30_m.index==user].values
    b = a.squeeze().tolist()
    # 提取商品对应的评分
    c = final_user.loc[:,item]
    # 提取相似用户中是否有对商品的评分
    d = c[c.index.isin(b)]
    f = d[d.notnull()]
    avg_user = Mean.loc[Mean['uid'] == user,'score'].values[0]
    index = f.index.values.squeeze().tolist()
    corr = similarity_with_user.loc[user,index]
    fin = pd.concat([f, corr], axis=1)
    fin.columns = ['adg_score','correlation']
    fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
    nume = fin['score'].sum()
    deno = fin['correlation'].sum()
    final_score = avg_user + (nume/deno)
    return final_score

def func(x):
    if x['iid'] in item and x['uid'] in user:
        score = User_item_score(x['uid'],x['iid'])
    else:
        score = 0
    return score

test_data['predict_score'] = test_data.apply(func, axis=1)

In [128]:
test_data.head()

Unnamed: 0,uid,iid,predict_score
0,0,12960,3.250776
1,1,12726,3.373618
2,1,11463,2.864437
3,1,10739,2.565086
4,1,3441,2.870126


In [131]:
test_data.columns = ['uid','iid','score']

In [132]:
test_data.to_csv('./submit_20210905_23.csv',index=False)