In [1]:
import os
import json
import random
import torch
from transformers import AutoTokenizer
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DataArguments:
    """Static configuration shared by the preprocessing cells of this notebook.

    The notebook passes the class itself (not an instance) to
    ``preprocess_function``, which reads these values as class attributes.
    """
    # Name of the example column holding the user text that gets tokenized.
    text_column_name = "cot_result"
    # Name of the example column holding the target (positive) movie name.
    label_column_name = "target_movie"
    # Truncation length passed as ``max_length`` when tokenizing user text.
    max_seq_length = 1536
    # Truncation length passed as ``max_length`` when tokenizing movie text.
    item_max_length = 192
    # Number of negative movies sampled for every example.
    num_negative_samples = 7
    # CSV file whose "movieName" column supplies the candidate movie list.
    movie_name_path = "dataset/qwen/filtered_movies_qwen.csv"
    # JSON file loaded into ``movie_info_dict`` (movie name -> metadata).
    movie_info_path = "dataset/redial/matched_dict_info_strip.json"

sample_data = [{"cot_result": "Based on the user interaction records provided, let's analyze the user's movie preferences step by step:\n\n1. **Childhood Preference:**\n   - The user mentions liking horror movies like \"Misery\" (1990) when they were a kid. This indicates an early interest in the horror genre, specifically psychological thrillers or horror with a strong narrative focus.\n\n2. **Current Preference:**\n   - The user now prefers movies like \"The Martian\" (2015), which is a science fiction film that focuses on survival and problem-solving in a challenging environment. This suggests a current preference for intelligent, engaging, and thought-provoking content, particularly within the science fiction genre.\n   - The user also mentions liking \"interesting content like that,\" which implies a general preference for movies that are intellectually stimulating and engaging.\n\n3. **Influence of a Friend:**\n   - The user has a friend who loves comedies like \"Bad Moms\" (2016) and tries to get the user to watch similar movies. However, the user does not explicitly state that they enjoy these comedies, only that their friend does. This suggests that while the user might occasionally watch comedies, they are not a primary genre of interest for the user.\n\n**Inference:**\n- The user's current movie preferences lean towards intelligent, engaging, and thought-provoking content, particularly within the science fiction genre. They seem to have outgrown their childhood preference for horror and now prefer movies that are intellectually stimulating and engaging, such as \"The Martian\" (2015).\n- While the user might watch comedies occasionally due to their friend's influence, it does not appear to be a primary genre of interest for them.\n\n**Conclusion:**\nThe user prefers intelligent, engaging, and thought-provoking content, especially within the science fiction genre, as exemplified by their enjoyment of \"The Martian\" (2015). 
They have moved away from their childhood preference for horror and do not seem to have a strong preference for comedies, despite their friend's influence.",
               "target_movie": "The Princess Bride  (1987)"},
               {"target_movie": "The Princess Bride  (1987)", 
                "cot_result": "Based on the user interaction records, we can infer several preferences and patterns in the user's movie tastes:\n\n1. **Action and Thriller Movies**: The user has a strong interest in action and thriller movies. This is evident from their mentions of \"John Wick (2014)\", \"Die Hard (1988)\", \"The Matrix (1999)\", \"Split (2017)\", \"Taken (2008)\", \"Taken 2 (2012)\", \"Ip Man (2008)\", and \"Mad Max: Fury Road (2015)\". They also mentioned enjoying \"Raiders of the Lost Ark (1981)\" and \"True Lies (1994)\" which are action and thriller films.\n\n2. **Comedy Movies**: The user enjoys comedy movies, as seen from their mentions of \"Liar Liar (1997)\", \"Ride Along 2 (2016)\", \"Central Intelligence (2016)\", \"Wedding Crashers (2005)\", \"Step Brothers (2008)\", \"The Hangover (2009)\", \"Bad Moms (2016)\", \"Anchorman: The Legend of Ron Burgundy (2004)\", \"21 Jump Street (2012)\", \"Elf (2003)\", \"The 40-Year-Old Virgin (2005)\", and \"Meet the Parents (2000)\".\n\n3. **Superhero and Fantasy Movies**: The user has an interest in superhero and fantasy movies, as indicated by their mentions of \"Wonder Woman (2017)\", \"Deadpool (2016)\", \"Guardians of the Galaxy (2014)\", \"Spider-Man: Homecoming (2017)\", \"The Avengers (2012)\", \"Justice League (2017)\", \"Harry Potter and the Goblet of Fire (2005)\", \"Stardust (2007)\", and \"Maleficent (2014)\".\n\n4. **Animated Movies**: The user enjoys animated movies, particularly those that are family-friendly and have a comedic element. This is shown by their mentions of \"The Secret Life of Pets (2016)\", \"The Lego Batman Movie (2017)\", \"Inside Out (2015)\", \"The Boss Baby (2017)\", \"Trolls (2016)\", \"Penguins of Madagascar (2014)\", and \"Tangled (2010)\".\n\n5. **Crime and Mystery Movies**: The user has an interest in crime and mystery movies, as seen from their mentions of \"Identity (2003)\" and \"Jigsaw (2017)\".\n\n6. 
**Romantic and Drama Movies**: The user has shown interest in romantic and drama movies, as indicated by their mentions of \"Steel Magnolias (1989)\" and \"The Break-Up (2006)\".\n\n7. **Interest in Sequels and Franchises**: The user seems to follow franchises and sequels, as seen from their mentions of \"Taken 2 (2012)\", \"Ride Along 2 (2016)\", and \"Wonder Woman (2017)\".\n\n8. **Interest in Recent Releases**: The user is interested in recent releases, as seen from their mentions of \"Wonder Woman (2017)\", \"Bright (2017)\", \"Girls Trip (2017)\", \"The House (2017)\", and \"Rough Night (2017)\".\n\nBased on these observations, the user's movie preferences can be summarized as a mix of action, comedy, superhero, fantasy, animated, crime, and drama movies, with a particular interest in recent releases and franchises."}
               ,{"initiatorWorkerId": 147, "conversationId": "16160", "messageId": 161002, "messages": ["Hi, Have you seen the new Jumanji  (2017)", "Yes, I've seen Night at the Museum: Secret of the Tomb (2014) too. Not as good as the first.", "I've heard about that. The Wonder  (2016) maybe a good one with kids.", "Jumanji  (2017)", "Logan  (2017)", "Jumanji  (2017)", "Wind River  (2017)", "The Hurt Locker (2008)", "She also loved Mr. Popper's Penguins ", "Yes! Cute movie. As good as Finding Nemo (2003)", "She also loved Beauty and the Beast  (2017)", "Oh, I don't think She's seem the Monsters, Inc. (2001)", "Never seen Hoot  (2006) .", "I love The Princess Bride  (1987)", "Ohhh! Loved Because of Winn-Dixie  (2005) ! She's never seen it. What a good idea!", "Jumanji  (2017)", "Jumanji (1995)", "Logan  (2017)", "X-Men ", "Jumanji  (2017)", "The Wonder  (2016)", "The Outsiders  (1983)", "Alien  (1979)", "Logan  (2017)", "X-Men  (2000)", "John Wick (2014)", "I hven't seen it. Was it as good as the Jumanji (1995) first one?", "Wasn't there a new Star Wars movie out?", "LOVE The Sandlot (1993)", "Angels in the Outfield  (1994)", "Jumanji (1995)", "The Outsiders  (1983)", "Wonder  (2017)", "Trolls  (2016)", "The Boss Baby (2017)", "Frozen  (2010)", "Star Wars: Episode VIII – The Last Jedi (2017)", "Star Wars: Episode VIII – The Last Jedi (2017)", "The Outsiders  (1983)", "The Last Picture Show (1971)", "Stand by Me  (1986)", "I do like old movie. We say Homeward Bound: The Incredible Journey (1993)", "Also The Sandlot (1993)", "I love animal movies...like Fly Away Home (1996)", "I would look for Kedi  (2010) . Thanks!", "Anything family friendly. We do like animal moves like Homeward Bound: The Incredible Journey (1993)", "We also like sports movies. 
Something like The Sandlot (1993)", "and Matilda  (1996)", "Coco  (2017)", "Homeward Bound: The Incredible Journey (1993)", "Aladdin  (1992)", "Beethoven  (1992)", "My grand daughter loved Homeward Bound: The Incredible Journey (1993)", "I think she will like The Avengers  (2012)", "I love some older moveis...like The Outsiders  (1983)", "Or Fly Away Home (1996)", "Because of Winn-Dixie  (2005)", "Homeward Bound: The Incredible Journey (1993)", "Fly Away Home (1996)", "Because of Winn-Dixie  (2005)", "All Dogs Go to Heaven 2 (1996)", "The Rescuers Down Under", "Older ones are fine, too. We just watched Air Bud (1997)", "Another one I still like is Lilo & Stitch", "Also love The Parent Trap  (1961)", "How about The Sandlot (1993) ?", "Homeward Bound: The Incredible Journey (1993)", "Maleficent  (2014)", "Star Wars", "Cars  (2006)", "The Lego Batman Movie (2017)", "The Outsider  (1981)", "My Cousin Vinny (1992)", "Four Weddings and a Funeral (1994)", "Spaceballs (1987)", "I just watched Fly Away Home (1996)", "I loved the first The Incredibles (2004)", "Oh, yes! The Parent Trap  (1998) with the Lindsey Lohan was good, too.", "I did like Maleficent  (2014) though.", "We watched The Sandlot (1993) recently and loved it.", "I do like Aladdin  (1992) !", "We don't mind older ones. We like movies like Homeward Bound: The Incredible Journey (1993)", "Love The Lion King (1994) ! Haven't watched in a long time!", "We also liked The Parent Trap  (1998)", "Also the older one The Parent Trap  (1961)", "Benji: Off the Leash! 
(2004)", "Fly Away Home (1996)", "Air Bud (1997)", "I liked Tarzan  (1999)", "We like old movies, too, like Homeward Bound: The Incredible Journey (1993)", "I liked Air Bud (1997)", "We liked Moana  (2016)", "Oh, Loved Mulan (1998)", "Lilo & Stitch is one of my favorite as we lived in Hawaii...", "We also like older movies like The Parent Trap  (1961)", "Out of Africa  (1985)", "Fat, Sick and Nearly Dead (2010)", "My all time favorite is The Outsiders  (1983)", "Another movie I liked is 50 First Dates (2004)", "The Patriots  (1994) was good, too.", "The Outsider  (1981)", "It  (2017)", "50 First Dates (2004)", "Maybe something long and boring like Out of Africa  (1985)", "Zoolander (2001) was funny too.", "Oh, I loved Dances with Wolves (1990) !", "Either. We did like Homeward Bound: The Incredible Journey (1993)", "Also enjoyed Moana  (2016)", "I liked Beethoven  (1992) but I don't think I've seen the other.", "I also like the original The Rescuers", "An American Tail: Fievel Goes West (1991) is another I liked.", "I like ones based on true events, like Hidden Figures (2016)", "Doesn't have to be true story...I liked Dances with Wolves (1990)", "The Boy in the Striped Pyjamas  (2008) was too sad...", "I think I'll look for The Pianist  (2002)", "Recently watched Hidden Figures (2016) and loved it."], "target_movie": "The Outsiders  (1983)", "cot_result": "Based on the user interaction records, we can infer several preferences and patterns in the user's movie choices:\n\n1. **Family-Friendly Movies**: The user frequently mentions movies that are suitable for children and families, such as \"Homeward Bound: The Incredible Journey,\" \"The Sandlot,\" \"Fly Away Home,\" \"Lilo & Stitch,\" \"The Parent Trap,\" and \"Aladdin.\" This suggests a preference for movies that can be enjoyed by both children and adults.\n\n2. 
**Animal-Themed Movies**: There is a notable interest in movies that feature animals prominently, such as \"Homeward Bound: The Incredible Journey,\" \"Fly Away Home,\" \"Beethoven,\" and \"All Dogs Go to Heaven 2.\" This indicates a fondness for stories centered around animals.\n\n3. **Classic and Older Movies**: The user often refers to older movies, such as \"The Outsiders,\" \"The Parent Trap (1961),\" \"The Lion King,\" and \"Aladdin.\" This suggests a preference for classic films and a willingness to watch movies from different eras.\n\n4. **Adventure and Action Movies**: There are mentions of action and adventure movies like \"Jumanji (2017),\" \"Logan,\" \"The Hurt Locker,\" and \"John Wick.\" This indicates an interest in more intense and action-packed films.\n\n5. **Animated Movies**: Animated films are frequently mentioned, including \"Frozen,\" \"Trolls,\" \"Coco,\" \"The Incredibles,\" and \"Moana.\" This shows a preference for animated content, which can be enjoyed by a wide audience.\n\n6. **Movies Based on True Stories**: The user mentions enjoying movies based on true events, such as \"Hidden Figures\" and \"Dances with Wolves.\" This suggests an interest in films that are grounded in real-life stories.\n\n7. **Sports Movies**: There is a mention of liking sports movies, specifically \"The Sandlot,\" indicating a preference for films that incorporate sports themes.\n\n8. **Fantasy and Sci-Fi**: The user has shown interest in fantasy and science fiction movies, such as \"Star Wars: Episode VIII – The Last Jedi,\" \"The Princess Bride,\" and \"Beauty and the Beast (2017).\"\n\nBased on these observations, the user seems to have a diverse taste in movies, with a strong inclination towards family-friendly, animal-themed, and classic films, while also enjoying action, adventure, and animated content.", "id": 4459}
]

# Load the candidate movie names. The CSV contains repeated names (the
# duplicate check further down in this notebook reports two), and a repeated
# name would occupy two indices of movie_list: preprocess_function would then
# label the movie with one index and could still sample the other index as a
# "negative" for the very same movie. dict.fromkeys drops the repeats while
# keeping the first-seen CSV order, so every name owns exactly one stable index.
movie_df = pd.read_csv(DataArguments.movie_name_path)
movie_list = list(dict.fromkeys(name.strip() for name in movie_df["movieName"]))

# Read the metadata with an explicit encoding so the result does not depend on
# the platform's default locale (the later cell reads the same file as utf-8).
with open(DataArguments.movie_info_path, encoding="utf-8") as f:
    movie_info_dict = json.load(f)

# NOTE(review): absolute, machine-specific model path -- consider moving it
# into the configuration so the notebook can run on another machine.
tokenizer = AutoTokenizer.from_pretrained("/mnt/wangxiaolei/model/Qwen/gte-Qwen2-7B-instruct")




In [3]:
def create_movie_text(movie_name, movie_info_dict):
    """Render one movie's metadata as a newline-terminated ``key: value`` block.

    Args:
        movie_name: Movie name; surrounding whitespace is stripped before lookup.
        movie_info_dict: Mapping from stripped movie name to a metadata mapping
            with the keys ``title``, ``year``, ``genre``, ``director``,
            ``writer``, ``star`` and ``plot``. The ``genre``, ``director``,
            ``writer`` and ``star`` values are joined with ``", "``.

    Returns:
        A string with one ``key: value`` line per field, each ending in ``\\n``.

    Raises:
        AssertionError: If the stripped name is not a key of ``movie_info_dict``.
    """
    movie_name = movie_name.strip()
    assert movie_name in movie_info_dict, f"Movie name {movie_name} not found in movie info dict"

    info = movie_info_dict[movie_name]
    # Collect the lines in their output order, then join once.
    lines = [
        f"movie: {movie_name}",
        f"title: {info['title']}",
        f"year: {info['year']}",
        f"genre: {', '.join(info['genre'])}",
        f"director: {', '.join(info['director'])}",
        f"writer: {', '.join(info['writer'])}",
        f"star: {', '.join(info['star'])}",
        f"plot: {info['plot']}",
    ]
    # Every line, including the last one, is terminated by a newline.
    return "\n".join(lines) + "\n"

In [4]:
def preprocess_function(examples, tokenizer, movie_list, movie_info_dict, DataArguments):
    """Tokenize a batch of user texts plus one positive and K negative movies each.

    For every example the target movie is looked up in ``movie_list``, K other
    movies are sampled at random as negatives, and all K+1 movies are rendered
    with ``create_movie_text`` (positive first) and tokenized together.

    Args:
        examples: Batch mapping of column name -> list of values. Must contain
            the columns named by ``DataArguments.text_column_name`` and
            ``DataArguments.label_column_name``.
        tokenizer: Tokenizer called with a list of strings; its result must
            expose ``"input_ids"`` and ``"attention_mask"``.
        movie_list: Sequence of movie names; a movie's position is its label.
        movie_info_dict: Mapping of stripped movie name -> metadata, forwarded
            to ``create_movie_text``.
        DataArguments: Configuration object providing ``text_column_name``,
            ``label_column_name``, ``max_seq_length``, ``item_max_length`` and
            ``num_negative_samples``.

    Returns:
        dict with
          * ``user_input_ids`` / ``user_attention_mask``: the tokenizer output
            for the user texts, unpadded;
          * ``item_input_ids`` / ``item_attention_mask``: item encodings
            reshaped to ``[B, K+1, L]`` with the positive at position 0 of the
            second axis;
          * ``labels``: for each example, the index of its target movie in
            ``movie_list``.

    Raises:
        KeyError: If a target movie name is not present in ``movie_list``.
        ValueError: If fewer than ``num_negative_samples`` other movies exist.
    """
    user_texts = examples[DataArguments.text_column_name]
    user_encodings = tokenizer(
        user_texts,
        padding=False,  # left unpadded here; per the original note, padding happens later in the data collator
        max_length=DataArguments.max_seq_length,
        truncation=True,
    )

    # Strip once up front instead of once per lookup / per sample.
    stripped_names = [name.strip() for name in movie_list]
    # If a name occurs more than once, the last occurrence wins (as before).
    movie_name_to_idx = {name: i for i, name in enumerate(stripped_names)}
    num_negatives = DataArguments.num_negative_samples
    target_names = examples[DataArguments.label_column_name]
    batch_size = len(target_names)

    # Flat list of item texts: K+1 consecutive entries per example.
    all_item_texts = []
    movie_indices = []

    for raw_name in target_names:
        # ========== positive sample ==========
        movie_name = raw_name.strip()
        if movie_name not in movie_name_to_idx:
            raise KeyError(f"Target movie {movie_name!r} not found in movie_list")
        pos_idx = movie_name_to_idx[movie_name]
        movie_indices.append(pos_idx)

        # ========== negative sampling ==========
        # Exclude every index carrying the positive's name, not just pos_idx:
        # if movie_list contains the same name twice, the other copy must not
        # be drawn as a "negative" for its own movie.
        candidate_indices = [
            idx for idx, name in enumerate(stripped_names) if name != movie_name
        ]
        if num_negatives > len(candidate_indices):
            raise ValueError(
                f"Cannot sample {num_negatives} negatives: only "
                f"{len(candidate_indices)} other movies are available"
            )
        neg_indices = random.sample(candidate_indices, num_negatives)

        # ========== build positive / negative texts ==========
        pos_text = create_movie_text(movie_name, movie_info_dict)
        neg_texts = [create_movie_text(stripped_names[idx], movie_info_dict) for idx in neg_indices]
        all_item_texts.extend([pos_text] + neg_texts)  # positive first, then its negatives

    # ========== tokenize all items in one call ==========
    item_encodings = tokenizer(
        all_item_texts,
        # Padding must happen here (not in the collator): the flat batch is
        # returned as a single tensor and reshaped below, so all rows need the
        # same length.
        padding=True,
        truncation=True,
        max_length=DataArguments.item_max_length,
        return_tensors="pt",
    )

    num_items = 1 + num_negatives
    # Rebuild the three-dimensional structure [B, K+1, L] from the flat batch.
    return {
        "user_input_ids": user_encodings["input_ids"],
        "user_attention_mask": user_encodings["attention_mask"],
        "item_input_ids": item_encodings["input_ids"].view(batch_size, num_items, -1),
        "item_attention_mask": item_encodings["attention_mask"].view(batch_size, num_items, -1),
        "labels": movie_indices,
    }

In [12]:
from datasets import Dataset

# Build a small in-memory dataset from the hand-written samples, keeping only
# the two columns that preprocess_function reads.
sample_dataset = Dataset.from_dict({
    DataArguments.text_column_name: [d["cot_result"] for d in sample_data],
    DataArguments.label_column_name: [d["target_movie"] for d in sample_data],
})

# Slicing a Dataset yields a dict of column -> list, the batch format that
# preprocess_function indexes into.
processed_output = preprocess_function(sample_dataset[:3], tokenizer, movie_list, movie_info_dict, DataArguments)

# ========== 5. Inspect the results ==========
print("用户输入 IDs：", processed_output["user_input_ids"])
# The next two lines used to share one label ("... IDs shape") although they
# report different things: the number of samples and the token count of the
# first sample.
print("用户输入样本数：", len(processed_output["user_input_ids"]))
print("首条用户输入 token 数：", len(processed_output["user_input_ids"][0]))
print("用户掩码：", processed_output["user_attention_mask"])
print("物品输入 IDs：", processed_output["item_input_ids"])
print("物品掩码：", processed_output["item_attention_mask"])
print("标签：", processed_output["labels"])

用户输入 IDs： [[28715, 389, 279, 1196, 16230, 7424, 3897, 11, 1077, 594, 23643, 279, 1196, 594, 5700, 19322, 3019, 553, 3019, 1447, 16, 13, 3070, 3652, 5694, 48805, 25, 1019, 256, 481, 576, 1196, 33845, 48737, 21452, 9508, 1075, 330, 83159, 722, 1, 320, 16, 24, 24, 15, 8, 979, 807, 1033, 264, 10369, 13, 1096, 14807, 458, 4124, 2734, 304, 279, 21452, 17328, 11, 11689, 23187, 53804, 388, 476, 21452, 448, 264, 3746, 19221, 5244, 382, 17, 13, 3070, 5405, 48805, 25, 1019, 256, 481, 576, 1196, 1431, 53964, 9508, 1075, 330, 785, 81028, 1, 320, 17, 15, 16, 20, 701, 892, 374, 264, 8038, 16989, 4531, 429, 23497, 389, 19661, 323, 3491, 98146, 304, 264, 17003, 4573, 13, 1096, 13230, 264, 1482, 21933, 369, 24514, 11, 22570, 11, 323, 3381, 9838, 85, 10746, 2213, 11, 7945, 2878, 279, 8038, 16989, 17328, 624, 256, 481, 576, 1196, 1083, 33845, 48737, 330, 87557, 2213, 1075, 429, 1335, 892, 23945, 264, 4586, 21933, 369, 9508, 429, 525, 93291, 64692, 323, 22570, 382, 18, 13, 3070, 641, 40016, 315, 264, 11587

In [16]:
import pandas as pd
import json

# Reload the movie names and drop repeated names.
movie_df = pd.read_csv("dataset/qwen/filtered_movies_qwen.csv")
movie_list_raw = [name.strip() for name in movie_df["movieName"]]
movie_set = set()      # names seen so far (membership test only)
duplicates = []        # every repeated occurrence, in CSV order
movie_list = []        # unique names in first-seen CSV order
for movie in movie_list_raw:
    if movie in movie_set:
        duplicates.append(movie)
    else:
        movie_set.add(movie)
        # Build the list here instead of list(movie_set): set iteration order
        # for strings depends on per-process hash randomization, so indices
        # derived from it would change on every kernel restart and labels
        # would not be reproducible.
        movie_list.append(movie)
if duplicates:
    print(f'Found {len(duplicates)} duplicate movies: {duplicates}')
print(f"{len(movie_list)} movies has been loaded")

with open("dataset/redial/matched_dict_info_strip.json", 'r', encoding='utf-8') as f:
    movie_info_dict = json.load(f)
print(f"Finish loading {len(movie_info_dict)} movie info")

Found 2 duplicate movies: ['Burlesque  (2010)', 'The Bridge  (2006)']
4205 movies has been loaded
Finish loading 4205 movie info


In [14]:
# Index every movie name (whitespace-stripped) by its position in movie_list.
movie_name_to_idx = dict((name.strip(), i) for i, name in enumerate(movie_list))

# Smallest index stored in movie_name_to_idx.
min_value = min(movie_name_to_idx.values())
print(f'min_value: {min_value}')

# Largest index stored in movie_name_to_idx.
max_value = max(movie_name_to_idx.values())
print(f'max_value: {max_value}')

# Movie name that holds the smallest index.
min_key = min(movie_name_to_idx.items(), key=lambda entry: entry[1])[0]
print(f'min_key: {min_key}')

# Movie name that holds the largest index.
max_key = max(movie_name_to_idx.items(), key=lambda entry: entry[1])[0]
print(f'max_key: {max_key}')


min_value: 0
max_value: 4204
min_key: The 'Burbs (1989)
max_key: Rogue One: A Star Wars Story (2016)


In [5]:
import pandas as pd
from collections import defaultdict

def find_duplicate_movies(csv_path):
    """Print every movie name that appears on more than one row of a CSV file.

    Names are compared after stripping surrounding whitespace. For each
    repeated name the function prints the name, the ``movieId`` of every row
    carrying it (in row order) and a separator line; if no name repeats it
    prints a single message saying so.

    Args:
        csv_path: Path of a CSV file with ``movieName`` and ``movieId`` columns.

    Returns:
        None. The report is written to standard output.
    """
    # Read the CSV file.
    df = pd.read_csv(csv_path)

    # Stripped movie name -> movieId of every row with that name, in row order.
    ids_by_name = {}
    for _, row in df.iterrows():
        ids_by_name.setdefault(row['movieName'].strip(), []).append(row['movieId'])

    # Keep only the names that occur more than once.
    repeated = {name: ids for name, ids in ids_by_name.items() if len(ids) > 1}

    # Nothing repeated: report that and stop.
    if not repeated:
        print("没有发现重复的电影名称。")
        return

    print("发现以下重复的电影名称：")
    for name, ids in repeated.items():
        id_text = ', '.join(str(movie_id) for movie_id in ids)
        print(f"电影名称: '{name}'")
        print(f"对应的movieId: {id_text}")
        print("-" * 50)

# Run the duplicate check on the filtered movie CSV.
csv_path = "dataset/qwen/filtered_movies_qwen.csv"
find_duplicate_movies(csv_path)

发现以下重复的电影名称：
电影名称: 'Burlesque  (2010)'
对应的movieId: 99066, 164798
--------------------------------------------------
电影名称: 'The Bridge  (2006)'
对应的movieId: 122411, 191956
--------------------------------------------------
