In [1]:
import pandas as pd
import os
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

In [2]:
# Dataset locations
DATA_PATH = r"C:/Users/xqy16/Desktop/多模态视频推荐/MGTV_AI_Challenge_Video_Recommend_Rank_12th-main/Data_A/"


def _under_data_root(relative_path):
    """Return ``relative_path`` joined onto DATA_PATH via os.path.join."""
    return os.path.join(DATA_PATH, relative_path)


# User history logs (names translate to: click / play / exposure data).
CLICK_PATH = _under_data_root("用户历史点击数据")
PLAY_PATH = _under_data_root("用户历史播放数据")
SHOW_PATH = _under_data_root("用户历史曝光数据")

# Leaderboard-A test inputs: dids to predict and their exposure records.
TEST_PRED_PATH = _under_data_root("./A榜待预测的did/testA_pred_did.csv")
TEST_SHOW_PATH = _under_data_root("./A榜用户曝光数据/testA_did_show.csv")

# Side tables for videos (vid) and devices (did).
VID_INFO_PATH = _under_data_root("./vid_info/vid_info_table.csv")
DID_FEATURES_PATH = _under_data_root("did_features/did_features_table.csv")

# Bullet-comment (danmu) text data.
DANMU_PATH = _under_data_root("弹幕文本数据")

In [3]:
# Memory-tuning parameters
CHUNK_SIZE = 500000  # default number of CSV rows parsed per chunk
PARALLEL_JOBS = max(1, cpu_count() - 2)  # keep two cores free, but never drop below one job


# Chunked CSV loading
def load_csv_chunks(file_path, day, dtype=None, chunk_size=None):
    """Read a CSV in chunks, tag every row with ``day`` and return one DataFrame.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.
        day: Value written to a new ``day`` column on every row.
        dtype: Optional dtype specification forwarded to ``pd.read_csv``.
        chunk_size: Rows per chunk. ``None`` (the default) uses the
            module-level ``CHUNK_SIZE``, which is the original behaviour.

    Returns:
        The concatenated chunks with a fresh 0..n-1 index, or an empty
        DataFrame when the reader yields no chunks.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE

    chunks = []
    # The context manager closes the underlying file handle even if parsing
    # a later chunk raises.
    with pd.read_csv(file_path, chunksize=chunk_size, dtype=dtype) as reader:
        for chunk in reader:
            chunk['day'] = day
            chunks.append(chunk)
    return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

In [4]:
# Directory holding the pre-built parquet files read later in this notebook
# (the folder name translates to "user history logs, with self-added features").
DATA_PARQUET_PATH = r"C:\Users\xqy16\Desktop\多模态视频推荐\用户历史日志_自加特征"

### 弹幕特征分析

In [5]:
pd.set_option('display.max_columns', None)  # show every column when a DataFrame is displayed

In [6]:
# Load the bullet-comment (danmu) feature table from bullet_fea.parquet.
danmu_parquet_df = pd.read_parquet(os.path.join(DATA_PARQUET_PATH, "bullet_fea.parquet"))

In [7]:
# Preview the first 10 rows of the danmu feature table.
danmu_parquet_df.head(10)

Unnamed: 0,vid,content,emotion,bullet_cnt,bullet_len
0,562941,2025.4.15.23点03。好几年都没看过了，来回忆回忆。2025.4.15.23点03...,1,5,62
1,609881,2025。2025。2025年4月11日又来打卡。2025年哦都2025年了哦。2025年4...,1,5,54
2,610769,笑死我了。笑死我了,1,2,9
3,823055,2025.4.15打卡,1,1,11
4,836386,赵英俊！。2025.4.21,1,2,14
5,851923,好可爱的小女孩。2025四月二刷。2025了。看个开头已经开始想哭了😭。2025.4.13二...,1,12,114
6,852959,字幕呢。没字幕啊。很讨厌电影里讲方言。受不了了！我得去找粤语版的看。尬不尬。方言听的真难受。...,0,7,60
7,852961,好牛。怎么没有粤语的,1,2,10
8,860138,电影把徐若瑄的画面全删了。为啥没字幕,0,2,18
9,861277,2055.4.21。wa ～前面的朋友，我2025年4月17日打卡。你们难道不是山里的吗？。...,1,47,588


In [8]:
# Summary statistics for the numeric columns of the danmu feature table.
danmu_parquet_df.describe()

Unnamed: 0,vid,emotion,bullet_cnt,bullet_len
count,7991.0,7991.0,7991.0,7991.0
mean,15379660.0,0.651733,189.340758,2030.318
std,5901000.0,0.476451,3641.430137,36219.46
min,562941.0,0.0,1.0,1.0
25%,10694940.0,0.0,2.0,25.0
50%,16415730.0,1.0,8.0,91.0
75%,20631300.0,1.0,31.0,376.5
max,23336560.0,1.0,236407.0,2282155.0


In [9]:
danmu_parquet_df.to_excel("danmu.xlsx")    # write the table to an Excel file (relative path "danmu.xlsx")

### baseline代码给出的对弹幕特征的处理

In [10]:
# Read the bullet-comment (danmu) Excel files
def read_danmu_files(danmu_dir=None, file_ids=range(1, 19)):
    """Load the numbered danmu workbooks and stack them into one DataFrame.

    Args:
        danmu_dir: Folder containing the ``<id>.xlsx`` files. ``None`` (the
            default) uses the module-level ``DANMU_PATH``.
        file_ids: Iterable of file numbers to read. The default, 1 through 18,
            matches the previously hard-coded range.

    Returns:
        All workbooks concatenated with a fresh 0..n-1 index, or an empty
        DataFrame when ``file_ids`` yields nothing.

    Raises:
        Any exception raised by ``pd.read_excel`` (for example when a file
        is missing) is propagated unchanged.
    """
    if danmu_dir is None:
        danmu_dir = DANMU_PATH

    frames = [
        pd.read_excel(os.path.join(danmu_dir, f"{i}.xlsx"))
        for i in tqdm(file_ids, desc="读取弹幕数据文件")
    ]
    print("弹幕数据读取完成...")

    # pd.concat raises ValueError on an empty list, so return an empty frame
    # instead; callers can then test `.empty`.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [11]:
# Load the danmu data and rename the video-id column
def load_danmu_data():
    """Read every danmu file and expose the video id under the name ``vid``.

    Returns:
        The frame produced by ``read_danmu_files``. When it is non-empty its
        ``videoid`` column is renamed to ``vid``; an empty frame is returned
        as-is after printing a notice.
    """
    danmu_raw = read_danmu_files()

    # Nothing loaded: report it and hand the empty frame back untouched.
    if danmu_raw.empty:
        print("未加载到弹幕数据")
        return danmu_raw

    danmu_renamed = danmu_raw.rename(columns={'videoid': 'vid'})
    print(f"弹幕数据加载完成，记录总数: {len(danmu_renamed)}")
    return danmu_renamed

In [None]:
# Load the raw danmu records (one read_excel call per source file, so this cell is slow).
df_danmu = load_danmu_data()

读取弹幕数据文件:  39%|███▉      | 7/18 [00:50<01:24,  7.70s/it]

新特征：

1. 每个vid的弹幕量

2. 每个vid的弹幕密度

3. 每个vid的所有弹幕的情感分析

4. 高质量弹幕比例：暂定弹幕长度 > 10 个字符即视为高质量弹幕，该阈值后续可以再调整

返回一个dict：键为特征名（vid_danmu_count、vid_danmu_density、vid_emotion_judgement、high_quality_ratio），每个键对应一个 vid → 特征值 的字典

In [None]:
# Build per-video bullet-comment (danmu) features
def calculate_danmu_features(danmu_parquet_df, df_danmu, df_vid_info=None,
                             default_duration=2766, high_quality_length=10):
    """Compute per-video danmu features as a dict of ``vid -> value`` mappings.

    Args:
        danmu_parquet_df: Per-video table with columns ``vid``, ``bullet_cnt``
            and ``emotion``.
        df_danmu: Raw danmu records with columns ``vid`` and ``content``.
            This frame is not modified.
        df_vid_info: Optional table with columns ``vid`` and
            ``item_duration``. When ``None`` (the default) it is read from
            ``vid_info/vid_info_table.csv`` under ``DATA_PATH``.
        default_duration: Duration used when a video has no usable duration
            (absent from the table, NaN, or not positive). The default 2766
            is the value the original author noted as the mean duration.
        high_quality_length: A comment counts as "high quality" when its
            character length is strictly greater than this value.

    Returns:
        ``{}`` when ``danmu_parquet_df`` is empty; otherwise a dict with keys
        ``vid_danmu_count``, ``vid_danmu_density``, ``vid_emotion_judgement``
        and ``high_quality_ratio``, each mapping vid to the feature value.
    """
    print("计算视频弹幕热度")
    if danmu_parquet_df.empty:
        print("弹幕数据为空...")
        return {}

    per_vid = danmu_parquet_df.set_index('vid')

    # Number of comments per video
    vid_danmu_counts = per_vid['bullet_cnt'].to_dict()

    # Comment density = comment count / video duration
    if df_vid_info is None:
        df_vid_info = pd.read_csv(os.path.join(DATA_PATH,"vid_info/vid_info_table.csv"))
    vid_to_duration = df_vid_info.set_index('vid')['item_duration'].to_dict()

    vid_density = {}
    for vid, counts in vid_danmu_counts.items():
        duration = vid_to_duration.get(vid)
        # Fall back for every unusable duration, not only for missing vids:
        # NaN would propagate into the feature and 0 would divide by zero.
        if duration is None or pd.isna(duration) or duration <= 0:
            duration = default_duration
        vid_density[vid] = counts / duration

    # Per-video emotion label
    print("计算视频情感分析")
    emotion_judgement = per_vid['emotion'].to_dict()

    # Share of comments longer than the threshold, per video. The length
    # series stays local so the caller's df_danmu gains no extra column.
    print("计算视频高质量弹幕比例")
    is_high_quality = df_danmu['content'].str.len() > high_quality_length
    high_quality_ratio = is_high_quality.groupby(df_danmu['vid']).mean().to_dict()

    return {
        'vid_danmu_count': vid_danmu_counts,
        'vid_danmu_density': vid_density,
        'vid_emotion_judgement': emotion_judgement,
        'high_quality_ratio': high_quality_ratio
    }

In [None]:
# Build the feature dict from the per-video parquet table and the raw danmu records.
danmu_features = calculate_danmu_features(danmu_parquet_df, df_danmu)

计算视频弹幕热度
计算视频情感分析
计算视频高质量弹幕比例


In [None]:
danmu_features['high_quality_ratio']  # inspect the per-vid high-quality danmu ratio

{562941: 0.6,
 562942: 0.3333333333333333,
 562943: 0.5,
 562944: 0.3333333333333333,
 563512: 0.25,
 563513: 0.3333333333333333,
 563514: 1.0,
 563515: 0.0,
 563516: 1.0,
 563517: 0.5,
 563519: 0.0,
 563520: 1.0,
 563521: 0.0,
 563524: 0.0,
 563526: 0.0,
 563527: 0.0,
 563530: 0.25,
 563531: 0.5,
 563532: 0.0,
 563536: 0.0,
 563537: 0.0,
 563538: 0.5,
 563539: 0.0,
 563541: 0.5,
 563542: 0.5,
 563543: 0.6666666666666666,
 563544: 0.0,
 563546: 0.0,
 563547: 0.25,
 563548: 0.5,
 563567: 0.0,
 563568: 0.6666666666666666,
 563569: 0.5,
 563903: 1.0,
 563905: 1.0,
 563921: 0.0,
 563944: 1.0,
 563946: 0.0,
 563948: 0.0,
 563976: 0.2857142857142857,
 563981: 0.14285714285714285,
 563982: 1.0,
 564023: 0.5,
 564025: 0.0,
 564028: 1.0,
 564030: 0.0,
 564558: 0.0,
 564642: 0.4766355140186916,
 564643: 0.5757575757575758,
 564644: 0.4144736842105263,
 564645: 0.42528735632183906,
 564646: 0.47058823529411764,
 564647: 0.7755102040816326,
 564648: 0.5492957746478874,
 564649: 0.6419753086419753,

文本分析：对每个 vid 的全部弹幕做综合情感判定（TODO：补充返回值的说明）

### 用户历史日志特征分析

In [None]:
# Load one user-history parquet file (user_history_day02.parquet) as a sample to explore.
df_demo = pd.read_parquet(os.path.join(DATA_PARQUET_PATH, "user_history_day02.parquet"))

In [None]:
# Preview the first 10 rows of the sample user-history table.
df_demo.head(10)

Unnamed: 0,did,vid,click_time,play_time,item_cid,item_type,item_duration,item_assetSource,item_classify,item_isIntact,item_serialno,sid,stype,show_sum,click_sum,ctr,ptr,item_cid_click_sum,item_cid_ctr,item_cid_ptr,item_type_click_sum,item_type_ctr,item_type_ptr,item_assetSource_click_sum,item_assetSource_ctr,item_assetSource_ptr,item_classify_click_sum,item_classify_ctr,item_classify_ptr,item_isIntact_click_sum,item_isIntact_ctr,item_isIntact_ptr,sid_click_sum,sid_ctr,sid_ptr,stype_click_sum,stype_ctr,stype_ptr
0,000098cabe490a5bd6773009400a9a92,17620959,2025-03-24 12:26:52,2819.0,1389675,0,5358,0,1,1,5,17119196,0,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
1,000098cabe490a5bd6773009400a9a92,20161959,,,1482548,0,6467,0,1,1,35,19606404,0,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
2,000098cabe490a5bd6773009400a9a92,20290603,,,1499162,2,7726,0,1,1,1,10719952,2,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
3,000098cabe490a5bd6773009400a9a92,23172217,,,1658049,2,7337,0,1,1,25,10998600,2,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
4,000098cabe490a5bd6773009400a9a92,23173979,,,1563214,0,2770,0,2,1,40,22633828,0,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
5,000098cabe490a5bd6773009400a9a92,23176271,,,1671676,0,9806,0,1,1,2,22625776,0,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
6,000098cabe490a5bd6773009400a9a92,23188671,,,1650171,0,5400,0,1,1,33,22633324,0,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
7,0000a93d6cd3a41fdab4239444621b5b,16524164,,,1342781,0,5576,0,1,1,29,15970136,0,10,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
8,0000a93d6cd3a41fdab4239444621b5b,21429953,2025-03-24 22:38:28,3298.0,1545506,0,6586,0,1,1,38,20876194,0,10,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
9,0000a93d6cd3a41fdab4239444621b5b,21457467,2025-03-24 23:13:57,2459.0,1545506,0,5993,0,1,1,40,20902038,0,10,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}


In [None]:
# Summary statistics for the numeric columns of the sample user-history table.
df_demo.describe()

Unnamed: 0,vid,play_time,item_cid,item_type,item_duration,item_assetSource,item_classify,item_isIntact,item_serialno,sid,stype,show_sum,click_sum,ctr,ptr
count,1346235.0,135532.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0
mean,21777310.0,1023.369677,1573323.0,0.283169,5262.463,0.01439496,1.623791,0.9999302,20.08543,19618510.0,0.283169,3.118155,0.3009831,0.04108089,0.07230594
std,3081467.0,970.831498,112311.5,0.6972472,2513.594,0.4647624,3.417269,0.008355809,15.25194,4678840.0,0.6972472,4.430264,0.5819513,0.08226001,0.1883416
min,609881.0,11.0,931765.0,0.0,41.0,0.0,0.0,0.0,0.0,194.0,0.0,0.0,0.0,0.0,0.0
25%,22386020.0,97.0,1547193.0,0.0,2770.0,0.0,1.0,1.0,3.0,18099110.0,0.0,0.0,0.0,0.0,0.0
50%,23173980.0,779.0,1627943.0,0.0,5400.0,0.0,1.0,1.0,22.0,22625780.0,0.0,0.0,0.0,0.0,0.0
75%,23181180.0,1750.0,1658049.0,0.0,7337.0,0.0,2.0,1.0,33.0,22633320.0,0.0,7.0,0.0,0.0,0.0
max,23192560.0,3599.0,1673412.0,2.0,19016.0,34.0,50.0,1.0,437.0,22701810.0,2.0,59.0,9.0,1.0,1.056575


In [None]:
# List the distinct values of item_cid_click_sum; in the output below they appear
# to be JSON-like strings mapping an id to a count.
df_demo['item_cid_click_sum'].unique()

array(['{}', '{"1650171": 1}', '{"1563214": 1}', ...,
       '{"1519449": 1, "1525220": 1}',
       '{"1598138": 1, "1650171": 1, "1563214": 1}',
       '{"1296229": 1, "1368420": 3}'], dtype=object)