In [None]:
import pandas as pd
from datasets import load_dataset  # Requires transformers>=4.51.0
from sentence_transformers import SentenceTransformer

In [None]:
# Load the MovieSum dataset by its identifier; the result is indexed by split name.
MOVIESUM_DATASET_ID = "rohitsaxena/MovieSum"
ds = load_dataset(MOVIESUM_DATASET_ID)

In [None]:
# Pull out the three dataset splits and convert each one to a pandas DataFrame.
# `Dataset.to_pandas()` converts the split's backing table in one step, instead
# of letting `pd.DataFrame(...)` iterate over the dataset one row at a time.
train = ds["train"]
validation = ds["validation"]
test = ds["test"]
dftr = train.to_pandas()
dfva = validation.to_pandas()
dfte = test.to_pandas()

In [None]:
# Stack train, validation and test into a single frame. The original row
# labels of each split are kept, so index values may repeat across splits.
split_frames = [dftr, dfva, dfte]
df = pd.concat(split_frames)
df

In [None]:
# Give the combined frame a fresh 0..n-1 index. `drop=True` discards the old
# row labels directly rather than first materialising them as an "index"
# column and then dropping it (which would also remove a genuine data column
# that happened to be called "index").
df2 = df.reset_index(drop=True)
df2.head(30)

In [None]:
# Embedding model used for both the queries and the movie summaries.
# TODO: maybe try a reranker as well?
EMBEDDING_MODEL_ID = "Qwen/Qwen3-Embedding-0.6B"
model = SentenceTransformer(EMBEDDING_MODEL_ID)

# Optional alternative (kept disabled): the upstream recommendation is to enable
# flash_attention_2 for better acceleration and memory saving, together with
# setting `padding_side` to "left":
# model = SentenceTransformer(
#     "Qwen/Qwen3-Embedding-0.6B",
#     model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
#     tokenizer_kwargs={"padding_side": "left"},
# )


In [None]:
# Instruction text passed as the `prompt` argument when encoding search queries.
MODELPROMPT = (
    "Instruct: Given a movie search query, retrieve relevant movies "
    "that match the query in mood, genre, and plot\n"
    "Query:"
)

In [None]:
# Search queries to embed. The first two are short free-text requests; the
# third is a full plot synopsis used as a long-form query.
# The synopsis is written as adjacent string literals joined by explicit "\n"
# paragraph breaks, because a plain double-quoted literal cannot span several
# physical source lines.
queries = [
    "I want a sci-fi movie where Earth is destroyed and other planets are explored. It should involve wormholes and time dilation. The main character should use black holes to communicate with the past.",
    "I want a sci-fi movie where the world is a simulation. The main character is a programmer who one day is presented with a choice to remain in the simulation or break free from it.",
    (
        "Private investigator Tom Welles is contacted by Daniel Longdale, attorney for wealthy widow Mrs. Christian. While clearing out her recently deceased husband's safe, she and Longdale find an 8mm movie which appears to depict a real murder of a girl, but Mrs. Christian wants to know for certain. Welles is instructed by Longdale and Mrs. Christian not to reproduce the film in any way, and to keep the investigation secret. After looking through missing persons files, Welles discovers the girl is Mary Ann Mathews and visits her mother, Janet, in North Carolina. She allows Welles to search her house, and he finds Mary Ann's diary, in which she says she relocated to Hollywood to become a film star. Mrs. Mathews demands that he find out what happened to her daughter. In Hollywood, aided by adult video store clerk Max California, Welles infiltrates the world of underground fetish pornography. He attempts to find out if snuff films are real, or if anyone was connected to this film, but finds no evidence that a snuff film industry exists. Contact with sleazy talent scout Eddie Poole leads Welles and Max to director Dino Velvet; Velvet's violently pornographic films often feature a masked man named \"Machine\", who not only brutalizes and tortures women, but also resembles the man featured in Christian's snuff film. To gain more evidence, Welles and California travel to New York and pose as clients interested in commissioning a hardcore BDSM film to be directed by Velvet and starring Machine. Velvet appears to agree, and schedules a meeting. The meeting turns out to be an ambush, as Longdale and Poole appear and hold Welles at gunpoint. The film is revealed as authentic; Mr. Christian contacted Longdale to procure a snuff film, and being unable to find one, Longdale commissioned Velvet and Poole to make one. Velvet and Machine produce a bound and beaten Max, whom they abducted to force Welles to bring them the only surviving copy of the film. \n"
        "As Longdale and Welles go to Welles's car to retrieve the film, Longdale admits he never thought Welles would get as far as he did, and just wanted to placate Mrs. Christian with the investigation. Once Welles delivers the film, Longdale and Velvet burn it and kill Max. As they are about to kill Welles, he confesses Mr. Christian paid $1 million for the film; Velvet, Poole, and Machine received $50,000 and Longdale kept the major portion. In an ensuing fight, Velvet and Longdale are killed; Welles wounds Machine and escapes. Subsequently, he informs Mrs. Christian regarding his findings and recommends she contact the police. Arriving at her estate, he is shocked to hear that she committed suicide immediately after receiving the phone call and notices envelopes that she left behind both for the Mathews family and for him; his envelope contains the rest of his payment and a note reading, \"Try to forget us.\" He instructs his wife Amy and their infant daughter Cindy to escape from the impending danger to a secret location, donating his half of the money to her. Deciding to avenge Mary Ann's death by killing the remaining people involved, Welles returns to Hollywood, tracks down Poole, and takes him to the shooting location, but stops short of killing him. He calls Mrs. Mathews to tell her about her daughter and asks for her permission to punish those responsible. Mrs. Mathews breaks down once presented with the truth, but affirms that she loved her daughter and permits him to proceed. With that, Welles pistol whips Poole to death with his gun, burning his body and the pornography from his car. He then attacks Machine at his home and unmasks him, revealing a bespectacled man named George Anthony Higgins, who admits that his sadistic actions are simply done out of pleasure, and are unrelated to any personal trauma. Welles then kills him in the ensuing struggle. \n"
        "After returning to his family, Welles breaks down in front of Amy, attempting to process all the evil things that he had witnessed throughout the investigation. Months later, he receives a letter from Mrs. Mathews, thanking him and relating her gratitude for the fact that, despite everything, they were the only two people who really cared about Mary Ann."
    ),
]
# Documents to embed: the "summary" column of the combined frame as a plain list.
summaries = df2["summary"].tolist()


In [None]:
# Embed the queries with the custom instruction prompt defined in MODELPROMPT
# (passed explicitly via `prompt`, not looked up by name in `model.prompts`).
query_embeddings = model.encode(queries, prompt=MODELPROMPT)

# Embed the movie summaries (the documents) without any prompt.
summary_encode_options = {
    "batch_size": 32,
    "precision": "float32",
    "show_progress_bar": True,
}
summaries_embeddings = model.encode(summaries, **summary_encode_options)

# Score every query against every summary with the model's similarity function
# (cosine, per the original note). Expected result: one row per query and one
# column per summary — TODO confirm the shape on a real run.
similarity = model.similarity(query_embeddings, summaries_embeddings)
print(similarity)

In [None]:
# Inspect the named prompts stored on the loaded model (presumably including the
# built-in "query" prompt mentioned in the encoding cell — confirm in the output).
model.prompts

In [None]:
# Sanity-check the size of the summary embeddings — presumably
# (number of summaries, embedding dimension); confirm against the output.
summaries_embeddings.shape

In [None]:
# Persist the summary embeddings so they do not have to be recomputed.
# NOTE(review): if `summaries_embeddings` is a NumPy array (presumably the
# encoder's default output — confirm), `.dump` writes a pickle file; it must be
# read back with pickle / `np.load(..., allow_pickle=True)` and only from a
# trusted location.
EMBEDDINGS_DUMP_PATH = "summaries_embeddings"
summaries_embeddings.dump(EMBEDDINGS_DUMP_PATH)