In [30]:
import pandas as pd
import numpy as np
from ast import literal_eval
import os
from scipy.spatial import distance

In [31]:
user_action_path = "./wechat_algo_data1/user_action.csv"
feed_info_modified_path = "./feed_info_modified1.csv"
test_data_path = "./wechat_algo_data1/test_a.csv"
feed_info_path = "./wechat_algo_data1/feed_info.csv"
feed_embedding_path = "./wechat_algo_data1/feed_embeddings.csv"

In [32]:
feed_info_modified_df = pd.read_csv(feed_info_modified_path)
feed_info_df = pd.read_csv(feed_info_path)
feed_embedding_df = pd.read_csv(feed_embedding_path)

In [33]:
feed_info_modified_df.columns

Index(['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id',
       'bgm_singer_id', 'author_n_feeds', 'videolength_bucket',
       'videolength_log', 'top5_keywords', 'top5_keywords_weights', 'tag_list',
       'tag_weights', 'feed_embedding'],
      dtype='object')

In [34]:
feed_info_df.columns

Index(['feedid', 'authorid', 'videoplayseconds', 'description', 'ocr', 'asr',
       'bgm_song_id', 'bgm_singer_id', 'manual_keyword_list',
       'machine_keyword_list', 'manual_tag_list', 'machine_tag_list',
       'description_char', 'ocr_char', 'asr_char'],
      dtype='object')

In [35]:
feed_embedding_df.head()

Unnamed: 0,feedid,feed_embedding
0,46022,-0.02032269 0.06095614 0.11057708 0.03385210 0...
1,73903,-0.07594238 0.01796364 -0.00135112 -0.00333468...
2,88646,-0.05067272 -0.08208735 -0.01929738 -0.0150182...
3,24381,-0.06976026 0.00218324 0.04416835 0.06146711 -...
4,41542,-0.04981736 -0.03523079 0.03022859 0.06672543 ...


In [7]:
feed_info_modified_df.shape

(106444, 13)

In [36]:
print(feed_info_modified_df.columns)

Index(['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id',
       'bgm_singer_id', 'author_n_feeds', 'videolength_bucket',
       'videolength_log', 'top5_keywords', 'top5_keywords_weights', 'tag_list',
       'tag_weights', 'feed_embedding'],
      dtype='object')


In [37]:
feed_info_df.shape

(106444, 15)

In [38]:
feed_embedding_df.shape

(106444, 2)

In [39]:
feed_info_modified_df.drop(columns=['top5_keywords', 'tag_list', 'top5_keywords_weights', 'tag_weights'], inplace=True)

In [40]:
feed_info_modified_df[['machine_tag_list', 'manual_tag_list', 'machine_keyword_list', 'manual_keyword_list']] = feed_info_df[['machine_tag_list', 'manual_tag_list', 'machine_keyword_list', 'manual_keyword_list']]

In [41]:
# 关于标签，用户打的标签和机器打的标签是不一样的
# 机器打的标签有置信度的信息，可以只将置信度足够高的进行处理，而对于置信度太低的则不处理
# 只将置信度高于 0.5 的作为新增的标签，而置信度低于 0.5 的直接舍弃掉
def get_machine_tag_list(feed_info):
  original_tags = feed_info['machine_tag_list'].values
  topn_machine_tags = []
  for i in range(len(original_tags)):
    if(type(original_tags[i]) == str):
      tags = []
      tag_with_weights = original_tags[i].split(';')
      for tag_weight in tag_with_weights:
        tag = tag_weight.split(' ')[0]
        weight = float(tag_weight.split(' ')[1])
        if weight >= 0.5:
          tags.append(tag)
      topn_machine_tags.append(tags)
    else:
      topn_machine_tags.append([])
  return topn_machine_tags

In [42]:
valid_machine_tags = get_machine_tag_list(feed_info_modified_df)
feed_info_modified_df['machine_tag_list'] = valid_machine_tags
#print(feed_info_modifeed_df.columns)

In [43]:
feed_info_modified_df['manual_tag_list'] = feed_info_modified_df['manual_tag_list'].apply(lambda x: x.split(';') if type(x) == str else [])
feed_info_modified_df['machine_keyword_list'] = feed_info_modified_df['machine_keyword_list'].apply(lambda x: x.split(';') if type(x) == str else [])
feed_info_modified_df['manual_keyword_list'] = feed_info_modified_df['manual_keyword_list'].apply(lambda x: x.split(';') if type(x) == str else [])

In [44]:
feed_info_modified_df['tag_list'] = feed_info_modified_df['manual_tag_list'] + feed_info_modified_df['machine_tag_list']
feed_info_modified_df['keyword_list'] = feed_info_modified_df['machine_keyword_list'] + feed_info_modified_df['manual_keyword_list']

In [45]:
feed_info_modified_df.drop(columns=['machine_tag_list', 'manual_tag_list', 'machine_keyword_list', 'manual_keyword_list'], inplace=True)

In [47]:
#去重
feed_info_modified_df['tag_list'] = feed_info_modified_df['tag_list'].apply(lambda x: list(set(x)))
feed_info_modified_df['keyword_list'] = feed_info_modified_df['keyword_list'].apply(lambda x: list(set(x)))
print(feed_info_modified_df.columns)

Index(['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id',
       'bgm_singer_id', 'author_n_feeds', 'videolength_bucket',
       'videolength_log', 'feed_embedding', 'tag_list', 'keyword_list'],
      dtype='object')


In [52]:
from ast import literal_eval
import numpy as np
#feed_info_modified_df['feed_embedding'] = feed_info_modified_df['feed_embedding'].apply(lambda x: literal_eval(x))
print(type(feed_info_modified_df.loc[0,'feed_embedding'][0]))

<class 'float'>


In [53]:
feed_info_modified_df['feed_embedding'] = feed_info_modified_df['feed_embedding'].apply(lambda x: np.array(x, np.float32))

In [54]:
feed_embeddings = []

for embedding in feed_info_modified_df['feed_embedding'].values:
  feed_embeddings.append(embedding.tolist())

feed_embeddings = np.array(feed_embeddings, np.float32)

In [55]:
# 使用 PCA 对embeddig进行降维，降到 32 维试试？     
# 结果发现降到32维度，方差占比太低，已经没有多少有用信息了，不过直接作为 lgb 的输入还是由提升

In [56]:
from sklearn.preprocessing import StandardScaler

In [57]:
feed_embeddings_scaled = StandardScaler().fit_transform(feed_embeddings)

In [58]:
from sklearn.decomposition import PCA

In [59]:
#对embedding简答做一下降维之后作为新的特征试试
pca_model = PCA(n_components=32)

pca_model.fit(feed_embeddings_scaled)

PCA(copy=True, iterated_power='auto', n_components=32, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [60]:
feed_embeddings_scaled = pca_model.transform(feed_embeddings_scaled)

In [61]:
for i in range(32):
  feed_info_modified_df['embedding_' + str(i)] = feed_embeddings_scaled[:, i]

In [62]:
feed_info_modified_df.drop(columns=['videolength_log'], inplace=True)

In [96]:
# 开始对用户的历史行为数据进行挖掘
# 可以选择用户全量的历史行为数据，挖掘用户的兴趣
print(feed_info_modified_df.columns)

Index(['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id',
       'bgm_singer_id', 'author_n_feeds', 'videolength_bucket',
       'feed_embedding', 'tag_list', 'keyword_list', 'embedding_0',
       'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4',
       'embedding_5', 'embedding_6', 'embedding_7', 'embedding_8',
       'embedding_9', 'embedding_10', 'embedding_11', 'embedding_12',
       'embedding_13', 'embedding_14', 'embedding_15', 'embedding_16',
       'embedding_17', 'embedding_18', 'embedding_19', 'embedding_20',
       'embedding_21', 'embedding_22', 'embedding_23', 'embedding_24',
       'embedding_25', 'embedding_26', 'embedding_27', 'embedding_28',
       'embedding_29', 'embedding_30', 'embedding_31'],
      dtype='object')


In [97]:
user_action_df = pd.read_csv(user_action_path)
print(user_action_df.columns)

Index(['userid', 'feedid', 'date_', 'device', 'read_comment', 'comment',
       'like', 'play', 'stay', 'click_avatar', 'forward', 'follow',
       'favorite'],
      dtype='object')


In [98]:
# 查看是否是用户的行为都是完播的
LABEL_COLUMNS = ['click_avatar', 'forward', 'follow', 'favorite', 'read_comment', 'comment', 'like']
ACTION_LIST = ['click_avatar', 'forward', 'follow', 'favorite', 'read_comment', 'comment', 'like', 'is_stay', 'is_finished', 'feedback']
user_action_df['feedback'] = (user_action_df[LABEL_COLUMNS].sum(axis=1) > 0).astype(np.int)

user_action_df = pd.merge(user_action_df, feed_info_modified_df, on='feedid')
print(user_action_df.columns)

Index(['userid', 'feedid', 'date_', 'device', 'read_comment', 'comment',
       'like', 'play', 'stay', 'click_avatar', 'forward', 'follow', 'favorite',
       'feedback', 'authorid', 'videoplayseconds', 'bgm_song_id',
       'bgm_singer_id', 'author_n_feeds', 'videolength_bucket',
       'feed_embedding', 'tag_list', 'keyword_list', 'embedding_0',
       'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4',
       'embedding_5', 'embedding_6', 'embedding_7', 'embedding_8',
       'embedding_9', 'embedding_10', 'embedding_11', 'embedding_12',
       'embedding_13', 'embedding_14', 'embedding_15', 'embedding_16',
       'embedding_17', 'embedding_18', 'embedding_19', 'embedding_20',
       'embedding_21', 'embedding_22', 'embedding_23', 'embedding_24',
       'embedding_25', 'embedding_26', 'embedding_27', 'embedding_28',
       'embedding_29', 'embedding_30', 'embedding_31'],
      dtype='object')


In [99]:
user_action_df['play'] = user_action_df['play'] / 1000.0
user_action_df['stay'] = user_action_df['stay'] / 1000.0

In [100]:
user_action_df['is_finished'] = (user_action_df['play'] >= user_action_df['videoplayseconds'] * 0.9).astype(np.int)
user_action_df['is_stay'] = (user_action_df['stay'] - user_action_df['videoplayseconds'] >= 10).astype(np.int)

In [101]:
user_action_df['interested'] = (user_action_df[['favorite', 'read_comment', 'comment', 'like']].sum(axis=1) > 0).astype(np.int)

In [102]:
for column in LABEL_COLUMNS + ['interested']:
  user_column_df = user_action_df[user_action_df[column] == 1]
  print("用户行为：", column)
  print("行为数量：", user_column_df.shape[0])
  print("完播数量：", user_column_df['is_finished'].sum(), "完播率：", user_column_df['is_finished'].sum() / user_column_df.shape[0])
  print("停留长时间的数量L: ", user_column_df['is_stay'].sum(), "长时间停留率：", user_column_df['is_stay'].sum() / user_column_df.shape[0])
  print("----------------------------------------------------")

用户行为： click_avatar
行为数量： 55128
完播数量： 21493 完播率： 0.38987447395153096
停留长时间的数量L:  18513 长时间停留率： 0.33581845885938183
----------------------------------------------------
用户行为： forward
行为数量： 27963
完播数量： 13326 完播率： 0.4765583091942925
停留长时间的数量L:  10436 长时间停留率： 0.37320745270536065
----------------------------------------------------
用户行为： follow
行为数量： 5277
完播数量： 1766 完播率： 0.33465984460867915
停留长时间的数量L:  1359 长时间停留率： 0.25753268902785675
----------------------------------------------------
用户行为： favorite
行为数量： 9824
完播数量： 6769 完播率： 0.6890268729641694
停留长时间的数量L:  4962 长时间停留率： 0.5050895765472313
----------------------------------------------------
用户行为： read_comment
行为数量： 256242
完播数量： 195418 完播率： 0.7626306382248031
停留长时间的数量L:  149980 长时间停留率： 0.585306077848284
----------------------------------------------------
用户行为： comment
行为数量： 2961
完播数量： 2258 完播率： 0.7625802093887201
停留长时间的数量L:  2429 长时间停留率： 0.8203309692671394
----------------------------------------------------
用户行为： like
行为数量： 188837
完播数量： 12

In [103]:
# 计算用户历史兴趣中的视频关键词tf-idf和tag的tf-idf,作为用户的兴趣表示（基于标签的冷启动策略）
# 但是实际的应用都是按照滑动窗口的情况去计算的
user_action_df['userid'].nunique()

20000

In [104]:
user_action_df[user_action_df['is_finished'] > 0]['userid'].nunique()

19975

In [106]:
user_action_df.columns

Index(['userid', 'feedid', 'date_', 'device', 'read_comment', 'comment',
       'like', 'play', 'stay', 'click_avatar', 'forward', 'follow', 'favorite',
       'feedback', 'authorid', 'videoplayseconds', 'bgm_song_id',
       'bgm_singer_id', 'author_n_feeds', 'videolength_bucket',
       'feed_embedding', 'tag_list', 'keyword_list', 'embedding_0',
       'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4',
       'embedding_5', 'embedding_6', 'embedding_7', 'embedding_8',
       'embedding_9', 'embedding_10', 'embedding_11', 'embedding_12',
       'embedding_13', 'embedding_14', 'embedding_15', 'embedding_16',
       'embedding_17', 'embedding_18', 'embedding_19', 'embedding_20',
       'embedding_21', 'embedding_22', 'embedding_23', 'embedding_24',
       'embedding_25', 'embedding_26', 'embedding_27', 'embedding_28',
       'embedding_29', 'embedding_30', 'embedding_31', 'is_finished',
       'is_stay', 'interested'],
      dtype='object')

In [107]:
user_interested_df = user_action_df[user_action_df['interested'] > 0].groupby('userid')[['keyword_list', 'tag_list']].sum().reset_index()

In [108]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

def create_feed_keyword_profile(feed_info):
  dataset = feed_info['keyword_list'].values
  from gensim.corpora import Dictionary
  dct = Dictionary(dataset)
  corpus = [dct.doc2bow(line) for line in dataset]
  model = TfidfModel(corpus)
  _feed_keywords = []
  _feed_keywords_weights = []
  for i in range(len(corpus)):
    vector = model[corpus[i]]
    feed_keywords = sorted(vector, key=lambda x: x[1], reverse=True)
    keywords_weights = dict(map(lambda x: (dct[x[0]], x[1]), feed_keywords))
    keywords = [i[0] for i in keywords_weights.items()]
    _feed_keywords.append(keywords)
    _feed_keywords_weights.append(keywords_weights)
  return _feed_keywords, _feed_keywords_weights



In [109]:
# 分别计算feed的各个tag_list的权重
def create_feed_tags_profile(feed_info):
  dataset = feed_info['tag_list'].values
  from gensim.corpora import Dictionary
  dct = Dictionary(dataset)
  corpus = [dct.doc2bow(line) for line in dataset]
  model = TfidfModel(corpus)
  _tags = []
  _tags_weights = []
  for i in range(len(corpus)):
    vector = model[corpus[i]]
    feed_tags = sorted(vector, key=lambda x: x[1], reverse=True)
    tags_weights = dict(map(lambda x: (dct[x[0]], x[1]), feed_tags))
    tags = [i[0] for i in tags_weights.items()]
    _tags.append(tags)
    _tags_weights.append(tags_weights)
  return _tags, _tags_weights

In [110]:
_feed_keywords, _feed_keywords_weights = create_feed_keyword_profile(user_interested_df)

In [111]:
user_interested_df['hist_keywords'] = _feed_keywords
user_interested_df['hist_keywords_weights'] = _feed_keywords_weights

In [115]:
tags, tags_weights = create_feed_tags_profile(user_interested_df)
user_interested_df['tag_list'] = tags
user_interested_df['tag_weights'] = tags_weights

In [116]:
user_interested_df

Unnamed: 0,userid,keyword_list,tag_list,hist_keywords,hist_keywords_weights,tag_weights
0,8,"[7978, 9680, 17918, 24390, 131, 22816, 14546, ...","[339, 28, 157, 100, 132, 41, 101, 127, 271, 31...","[17918, 22816, 19660, 131, 7978, 12224, 11696,...","{'17918': 0.3770519071810268, '22816': 0.35289...","{'339': 0.4666739187470669, '28': 0.2877431080..."
1,12,"[7095, 6969, 9709, 6344, 18863, 13338, 8120, 1...","[42, 309, 243, 217, 53, 38, 292, 66, 199, 100,...","[2786, 15902, 23340, 6224, 21455, 18659, 24277...","{'2786': 0.34337765959725347, '15902': 0.24686...","{'42': 0.26569851068585965, '309': 0.263572362..."
2,13,"[6934, 6969, 11411, 2451, 13152, 7377, 18863, ...","[302, 115, 343, 310, 53, 22, 150, 308, 247, 32...","[6969, 15770, 18863, 7377, 21020, 7094, 5618, ...","{'6969': 0.23972037974277463, '15770': 0.20679...","{'302': 0.30492043900991217, '115': 0.28752518..."
3,25,"[15615, 7043, 14535, 17370, 15133, 7094, 4881,...","[156, 317, 125, 101, 192, 221, 220, 328, 170, ...","[4881, 24208, 17969, 15615, 19652, 15133, 7043...","{'4881': 0.30548319466768925, '24208': 0.29837...","{'156': 0.49653632099255307, '317': 0.38988590..."
4,71,"[21358, 17582, 5004, 25007, 24346, 5062, 22312...","[64, 88, 82, 8, 19, 269, 241, 212, 207, 10, 15...","[5004, 21358, 1802, 24346, 5062, 17582, 22312,...","{'5004': 0.47923891424426807, '21358': 0.40477...","{'64': 0.4944578167192278, '88': 0.44175065243..."
5,76,"[25209, 10257, 16288, 19105, 4676, 27166, 1663...","[29, 289, 271, 81, 8, 234, 269, 118, 49, 235, ...","[4676, 9347, 21229, 16634, 27166, 16105, 19760...","{'4676': 0.3141158787154243, '9347': 0.2808933...","{'29': 0.5081459864141487, '289': 0.3866033015..."
6,96,"[4045, 25007, 24346, 940, 22312, 4218, 23139, ...","[69, 11, 142, 241, 17, 219, 159, 6]","[13963, 14859, 23139, 940, 24346, 19686, 5062,...","{'13963': 0.5451843583182836, '14859': 0.39204...","{'69': 0.5517487833771282, '11': 0.54824390003..."
7,99,"[24676, 3963, 10814, 27257, 999, 22548, 3436, ...","[119, 38, 273, 265, 28, 179, 125, 222, 67, 82,...","[16041, 22611, 7206, 14053, 22258, 1369, 1531,...","{'16041': 0.2881098855028171, '22611': 0.27423...","{'119': 0.29297376653059876, '38': 0.277802350..."
8,129,"[13662, 20696, 1371, 22376, 4045, 10215, 10207...","[214, 11, 142, 98, 37, 49, 340, 17, 212, 219, ...","[20348, 13662, 20696, 10207, 22376, 7263, 2294...","{'20348': 0.475670222692859, '13662': 0.435444...","{'214': 0.48960772731668667, '11': 0.471198259..."
9,130,"[26617, 10350, 23733, 11450, 22984, 20613, 12834]","[98, 332, 16, 207, 267, 10, 159, 6]","[11450, 22984, 12834, 23733, 10350, 20613, 26617]","{'11450': 0.4909129908390098, '22984': 0.47798...","{'98': 0.6460351038440703, '332': 0.4363612866..."


In [117]:
user_id_unique_df = user_action_df.groupby('userid')['device'].min().reset_index()
print(user_id_unique_df.columns)

Index(['userid', 'device'], dtype='object')


In [118]:
user_interest_df = pd.merge(user_id_unique_df, user_interested_df, on='userid', how='left')

In [None]:
user_interest_df

Unnamed: 0,userid,device,keyword_list,tag_list,hist_keywords,hist_keywords_weights,tag_weights
0,8,1,"[9680, 7978, 6355, 131, 14546, 22816, 17918, 2...","[41, 228, 12, 202, 266, 100, 23, 157, 340, 339...","[17918, 22816, 19660, 131, 7978, 12224, 11696,...","{'17918': 0.3770519071810268, '22816': 0.35289...","{'41': 0.47123872627268903, '228': 0.455873984..."
1,12,2,"[7095, 8120, 18863, 6344, 6969, 9709, 13338, 1...","[328, 13, 66, 239, 170, 16, 169, 267, 212, 191...","[2786, 15902, 23340, 6224, 21455, 18659, 24277...","{'2786': 0.34337765959725347, '15902': 0.24686...","{'328': 0.5025362696536249, '13': 0.4173037037..."
2,13,2,"[3963, 7377, 18863, 412, 6969, 2451, 6934, 131...","[170, 239, 169, 328, 213, 13, 192, 267, 191, 1...","[6969, 15770, 18863, 7377, 21020, 7094, 5618, ...","{'6969': 0.23972037974277463, '15770': 0.20679...","{'170': 0.4383352051520778, '239': 0.436404453..."
3,25,2,"[17370, 7043, 15615, 15133, 14535, 7094, 24208...","[156, 192, 317, 125, 101, 221, 9, 220, 328, 17...","[4881, 24208, 17969, 15615, 19652, 15133, 7043...","{'4881': 0.30548319466768925, '24208': 0.29837...","{'156': 0.4434659960939803, '192': 0.434737723..."
4,37,1,,,,,
...,...,...,...,...,...,...,...
19995,250207,2,"[7095, 8120, 18863, 6344, 6969, 9709, 13338, 1...","[328, 13, 100, 192, 61, 267, 14, 23, 202, 191,...","[10869, 11813, 21455, 26140, 14412, 23473, 248...","{'10869': 0.5127883136860724, '11813': 0.38604...","{'328': 0.6407291365906322, '13': 0.5320583964..."
19996,250221,2,"[20736, 10350, 17098, 2429, 7732, 23733, 7095,...","[170, 169, 239, 212, 98, 49, 340, 10, 235, 332...","[767, 5187, 2643, 7134, 8593, 13691, 23626, 24...","{'767': 0.7234395642423621, '5187': 0.37856555...","{'170': 0.5257610111061418, '169': 0.512445982..."
19997,250224,2,"[3027, 19320, 5004, 17582, 21358, 8800, 1371, ...","[98, 267, 11, 239, 49, 169, 88, 207, 45, 19, 1...","[20613, 19764, 15476, 18116, 9959, 23763, 2545...","{'20613': 0.20093403829587853, '19764': 0.1607...","{'98': 0.371188774554157, '267': 0.24168053824..."
19998,250229,1,,,,,


In [119]:
user_interest_df.drop(columns=['keyword_list'], inplace=True)

In [120]:
user_interest_df.rename(columns={'tag_list': 'hist_tag', 'tag_weights': 'hist_tag_weights'}, inplace=True)

In [121]:
user_interest_df.drop(columns=['device'], inplace=True)

In [122]:
user_interest_df['hist_tag'] = user_interest_df['hist_tag'].apply(lambda x: x if type(x) != float else [])

In [123]:
user_interest_df['hist_keywords'] = user_interest_df['hist_keywords'].apply(lambda x: x if type(x) != float else [])

In [124]:
user_interest_df['hist_keywords_weights'] = user_interest_df['hist_keywords_weights'].apply(lambda x: x if type(x) != float else {})

In [125]:
user_interest_df['hist_tag_weights'] = user_interest_df['hist_tag_weights'].apply(lambda x: x if type(x) != float else {})

In [128]:
print(user_interest_df.columns)

Index(['userid', 'hist_tag', 'hist_keywords', 'hist_keywords_weights',
       'hist_tag_weights'],
      dtype='object')


In [129]:
import pickle
with open("user_interest.pkl",'wb') as file:
	pickle.dump(user_interest_df,file)

In [130]:
with open('user_interest.pkl', 'rb') as file:
	new_df =pickle.load(file)

In [131]:
feed_info_modified_df.columns

Index(['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id',
       'bgm_singer_id', 'author_n_feeds', 'videolength_bucket',
       'feed_embedding', 'tag_list', 'keyword_list', 'embedding_0',
       'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4',
       'embedding_5', 'embedding_6', 'embedding_7', 'embedding_8',
       'embedding_9', 'embedding_10', 'embedding_11', 'embedding_12',
       'embedding_13', 'embedding_14', 'embedding_15', 'embedding_16',
       'embedding_17', 'embedding_18', 'embedding_19', 'embedding_20',
       'embedding_21', 'embedding_22', 'embedding_23', 'embedding_24',
       'embedding_25', 'embedding_26', 'embedding_27', 'embedding_28',
       'embedding_29', 'embedding_30', 'embedding_31'],
      dtype='object')

In [132]:
feed_info_modified_df['videolength_bucket'] = feed_info_modified_df['videolength_bucket'].apply(lambda x: x if x < 8 else 7)

In [133]:
feed_info_modified_df['author_n_feeds'].hist(bins=200)

<matplotlib.axes._subplots.AxesSubplot at 0x19dac016a58>

In [134]:
import seaborn as sns
from matplotlib import pyplot as plt

feed_info_modified_df['author_n_feeds_bucket'] = round(np.log(feed_info_modified_df['author_n_feeds']))
sns.displot(feed_info_modified_df['author_n_feeds_bucket'])
plt.title('author_n_feed distribution')

AttributeError: module 'seaborn' has no attribute 'displot'

In [135]:
with open("feed_info_modified2.pkl",'wb') as file:
	pickle.dump(feed_info_modified_df, file)

In [136]:
user_watch_sum_df = user_action_df.groupby('userid')['feedid'].nunique().reset_index()

In [137]:
user_stay_sum_df = user_action_df.groupby('userid')['stay'].sum().reset_index()

In [138]:
# 用户总的观看数目，是不是可以用来分桶处理呢
sns.displot(user_watch_sum_df['feedid'])
plt.title('user_watch_sum distribution')

AttributeError: module 'seaborn' has no attribute 'displot'

In [139]:
sns.displot(1.2**(user_watch_sum_df['feedid'].max() + 1 - user_watch_sum_df['feedid']))
plt.title('user_watch_log_sum distribution')

AttributeError: module 'seaborn' has no attribute 'displot'

In [140]:
# 用户停留时长总数，是不是也可以用来分桶呢
sns.displot(user_stay_sum_df['stay'])
plt.title('user_stay_sum distribution')

AttributeError: module 'seaborn' has no attribute 'displot'

In [141]:
sns.displot(np.log(user_stay_sum_df['stay']))
plt.title('user_stay_log_sum distribution')

AttributeError: module 'seaborn' has no attribute 'displot'

In [142]:
user_stay_sum_df

Unnamed: 0,userid,stay
0,8,1414.126
1,12,36224.720
2,13,6862.555
3,25,1098.564
4,37,18121.411
5,71,3618.812
6,76,10509.285
7,96,2805.687
8,99,7078.888
9,108,3973.201


In [143]:
feed_info_modified_df['videolength_bucket']

0         4
1         7
2         2
3         2
4         2
5         5
6         2
7         3
8         7
9         2
10        6
11        1
12        2
13        7
14        2
15        3
16        4
17        4
18        3
19        3
20        7
21        7
22        7
23        3
24        7
25        2
26        7
27        4
28        2
29        1
         ..
106414    3
106415    7
106416    7
106417    7
106418    7
106419    3
106420    4
106421    2
106422    4
106423    2
106424    4
106425    6
106426    1
106427    2
106428    6
106429    3
106430    2
106431    2
106432    1
106433    2
106434    1
106435    2
106436    3
106437    2
106438    3
106439    2
106440    2
106441    3
106442    5
106443    4
Name: videolength_bucket, Length: 106444, dtype: int64