In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

# --- Notebook Setup ---
# Seed every RNG the notebook relies on so sampling is reproducible.
# pandas' DataFrame.sample() draws from NumPy's global random state when no
# random_state is passed, so seeding only the stdlib `random` module would
# leave the failure-case sampling in later cells non-reproducible.
random.seed(42)
np.random.seed(42)

# Set pandas display options for better text viewing
# (long story texts are cut off only after 300 characters)
pd.set_option('display.max_colwidth', 300)

In [None]:
# Load the datasets
# Both files are read as JSON Lines (one JSON record per line), hence lines=True.
try:
    df_track_a = pd.read_json('../data/dev_track_a.jsonl', lines=True)
    df_track_b = pd.read_json('../data/dev_track_b.jsonl', lines=True)
except FileNotFoundError:
    # Every later cell needs both frames. Report the location the code actually
    # reads from and stop here, rather than continuing into a confusing
    # NameError on df_track_a / df_track_b further down.
    print("Error: Make sure the data files are in the '../data/' directory.")
    raise
else:
    print("Data loaded successfully.")
    print(f"Track A data shape: {df_track_a.shape}")
    print(f"Track B data shape: {df_track_b.shape}")

Data loaded successfully.
Track A data shape: (200, 4)
Track B data shape: (479, 1)


In [4]:
# Corpus = every distinct story text found in Track B (duplicates dropped)
corpus = df_track_b['text'].unique()

# Build the TF-IDF vectorizer and learn its vocabulary / IDF weights in one
# step; fit() hands back the fitted estimator itself.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=2).fit(corpus)

# Report how many terms survived the stop-word and document-frequency filters
vocabulary_size = len(vectorizer.get_feature_names_out())
print(f"Vectorizer fitted on a vocabulary of {vocabulary_size} words.")

Vectorizer fitted on a vocabulary of 3740 words.


In [7]:
# --- Vectorize and Predict ---

def calculate_similarity_and_predict(row):
    """
    Score one example with the module-level fitted TF-IDF `vectorizer`.

    Reads 'anchor_text', 'text_a' and 'text_b' from `row`, vectorizes them,
    and compares the anchor against each candidate by cosine similarity.

    Returns a pd.Series of three values, in order:
    whether text A scored strictly higher than text B, the anchor/A
    similarity, and the anchor/B similarity. A tie counts as "A is not closer".
    """
    # Vectorize all three texts in a single call:
    # row 0 is the anchor, rows 1 and 2 are candidates A and B.
    tfidf_rows = vectorizer.transform(
        [row['anchor_text'], row['text_a'], row['text_b']]
    )

    # Anchor (1 row) against both candidates (2 rows) gives a 1x2 matrix;
    # its single row unpacks into the two similarity scores.
    sim_a, sim_b = cosine_similarity(tfidf_rows[:1], tfidf_rows[1:])[0]

    # Prediction first, then the raw scores for later inspection
    return pd.Series([sim_a > sim_b, sim_a, sim_b])

# Score every Track A example; apply(axis=1) calls the function once per row
# and stacks the returned Series into a three-column frame.
prediction_columns = ['predicted_text_a_is_closer', 'similarity_a', 'similarity_b']
row_scores = df_track_a.apply(calculate_similarity_and_predict, axis=1)

# Attach the results to the Track A frame under their final column names
df_track_a[prediction_columns] = row_scores

print("Predictions made based on cosine similarity.")
print("Displaying the first 5 rows with new prediction and similarity columns:")
df_track_a.head()

Predictions made based on cosine similarity.
Displaying the first 5 rows with new prediction and similarity columns:


Unnamed: 0,anchor_text,text_a,text_b,text_a_is_closer,predicted_text_a_is_closer,similarity_a,similarity_b
0,"The book follows an international organization named the Ministry for the Future in its mission to act as an advocate for the world's future generations of citizens as if their rights were as valid as the present generation's. Beginning in 2025, the organization, established as a subsidiary body...",The old grandmother Tina arrives in town to attend the wedding of his nephew Alberto with his girlfriend Ileana.\nUpon arrival she discovers that she has been stolen of a medallion that her late husband had given her.\nHe goes to the police station to file a complaint and get the dear object bac...,The nano-plague that poisoned Earth's water supply has reached its 60-year critical mass. The Unlight enemy forced the first exodus to the moon where the outlawed banished population was supposed to die. But now the Unlights have launched from Earth and are amassing on the south-west sector of t...,False,False,0.0,0.014354
1,"Glenn Tyler (Elvis Presley), a childish 25-year old, gets into a fight with and badly injures his drunken brother. A court releases him on probation into the care of his uncle in a small town, appointing Irene Sperry (Hope Lange) to give him psychological counselling. Marked as a trouble-maker, ...","Bill Babbitt supported the death penalty, until it came knocking at his door. Bill fondly recalls early life with his brother Manny, but a childhood car accident leaves Manny forever changed. Two tours in Vietnam only compound Manny's mental health issues. After the war, bouts of paranoia leave ...","A white-collar suburban father Kyle (Fran Kranz) is surprised at his office by long-lost college buddy Zack (Adam Goldberg). Zack is as wild and crazy as ever, brimming with excitement about the self-actualization program he's just finished called Rebirth. He talks Kyle into going on a weekend-l...",True,True,0.057168,0.032455
2,"Signaller Charles Plumpick (Bates) is a kilt-wearing French-born Scottish soldier caring for war pigeons, who is sent by his commanding officer to disarm a bomb placed in the town square by the retreating Germans.\nAfter the townspeople learn about the booby trap, its inhabitants—including those...","Sid, Russ and Jerry are three wannabe criminals looking for easy money to break out of their nowhere lives. Despite a bungled jewelry store heist that exposes their incompetence, they are convinced they can pull off an armored-truck robbery. While plotting their caper, their dysfunctional famili...","Brendan Byers III is a rich playboy who enlists to fight in the war against the Axis powers, but is classified 4-F. He really wants to fight, so he enlists other 4-Fs and some loyal volunteers from his own service staff and forms his own army, financing their training and equipment. Once comple...",False,False,0.012828,0.02505
3,Barbara is married to the distinguished professor of medicine Georg Bertram who once saved her father's life. When they have a mentally handicapped child together his clinical coldness comes to the fore and he wants to commit euthanasia on the child. She stops him and takes the child away to Bri...,"Eddie Quinn's unruly wife Maureen drinks and smokes to excess, even though she is pregnant. Eddie has troubles of his own, disappearing for days at a time. When she is physically and sexually assaulted by Kiefer, a neighbor, it is more than Eddie can handle. He shoots someone and lands in a psyc...","Jerome Littlefield is an orderly at a hospital. His dream is to be a doctor, but he has a problem that prevents it from becoming a reality: when he hears of a problem that a patient is having, psychosomatically he begins to suffer those symptoms as well.\nSusan Andrews, an old high school frien...",False,False,0.012732,0.04548
4,"A wealthy widower locks up his two grown-up children, afraid that they will go mad, as did his wife. He then invites a doctor of dubious reputation to supervise their mental health and cure them of the unnatural attraction they have for each other. Meanwhile, in the vicinity of the mansion, murd...",Barbara is married to the distinguished professor of medicine Georg Bertram who once saved her father's life. When they have a mentally handicapped child together his clinical coldness comes to the fore and he wants to commit euthanasia on the child. She stops him and takes the child away to Bri...,"Stefano (Lino Capolicchio) arrives in a village of the Valli di Comacchio area where he has been employed to restore a fresco depicting what appears to be the martyrdom of Saint Sebastian, which has been painted on a rotting wall of the local church by a mysterious, long-dead artist named Legnan...",False,False,0.037961,0.043657


In [8]:
# Compare our prediction with the ground truth label once and reuse the mask
# for both the accuracy and the failure extraction.
is_correct = df_track_a['predicted_text_a_is_closer'] == df_track_a['text_a_is_closer']

# Accuracy is the fraction of rows where prediction and label agree
accuracy = is_correct.mean()

print(f"TF-IDF Baseline Accuracy: {accuracy:.4f}")

# Identify the rows where the prediction was incorrect
# (.copy() so later edits to the failures do not touch df_track_a)
failures_df = df_track_a[~is_correct].copy()

print(f"\nIdentified {len(failures_df)} failure cases out of {len(df_track_a)} total examples.")

# Save failures to JSONL file for later analysis
failures_df.to_json('failures.jsonl', orient='records', lines=True)

print("Saved all failure cases to 'failures.jsonl'.")

print("\n--- Random Sample of Failure Cases ---")
# random_state pins the draw so the displayed rows are the same on every run;
# min() avoids a ValueError when there are fewer than 5 failures.
failures_df.sample(min(5, len(failures_df)), random_state=42)

TF-IDF Baseline Accuracy: 0.5250

Identified 95 failure cases out of 200 total examples.
Saved all failure cases to 'failures.jsonl'.

--- Random Sample of Failure Cases ---


Unnamed: 0,anchor_text,text_a,text_b,text_a_is_closer,predicted_text_a_is_closer,similarity_a,similarity_b
41,"The French detective superintendent Christophe Vade (Patrick Bruel) investigates the murder of an elderly French couple in the French Alps. In the course of his investigation he encounters Jeanne Gardella (Mathilda May), the wife of the businessman Antoine Gardella (Jacques Dutronc), a member of...","British newlywed Regina Lambert lives in Paris with her husband Charles. She returns home following a short vacation, determined to divorce Charles only to discover their apartment has been stripped bare and that her husband has been murdered. The French police are in her apartment. Charles ha...","Simon, a retired police inspector, does not appreciate being placed in a retirement home. Fortunately, he quickly becomes friends with Alfred, another resident. When Alfred dies under strange circumstances, the management, supported by the gendarmerie, declare it an accident. Simon, meanwhile, i...",True,False,0.041325,0.056401
37,"Simon Spier is a closeted, gay, 16-year-old student in his junior year of high school with a fondness for musical theater who lives in a suburb of Atlanta, Georgia. Unbeknownst to his family and friends, Simon has been sending e-mails to a person going by the name of ""Blue"", Simon himself using ...","Daniel is an odd guy who lives with his endlessly quarrelling parents uncomplaining about his destiny. He keeps a distance from other people, he has no friends, nobody understands him, he is different. He will be turning nineteen and the last thing he would spend his time on is a preparation for...","Shuji Ito (Yoshinori Okada) is a shy boy in the top class at secondary school. He feels attracted to his classmate and best friend Yoshida (Kōta Kusano), who is not aware of Ito's intimate feelings. The two spend time with Tōru Kanbara (Kōji Yamaguchi), whose comic actions hide his sensitive nat...",False,True,0.044231,0.04086
170,"Dragon (Jackie Chan) is the son of a Chinese aristocrat who is always getting in trouble, and likes to skip his lessons. \nDragon tries to send a love note to the girl he likes via a kite, but the kite gets away. Dragon tries to get the kite and letter back which have landed on the roof of the h...","As the film opens Ahmad (Babak Ahmadpour), a grade schooler, watches as his teacher (Khodabakhsh Defai) berates a fellow student, Mohammad Reza, for repeatedly failing to use his notebook for his homework, threatening expulsion on the next offense. When Ahmad returns home, he realizes he's accid...","The son of the Countess Mensdorf runs away when he can no longer stand her relationship with the Baron Von Mallock. The son becomes the famous trapeze artist Frattani, and after many years he returns home and meets Madeleine, a young dancer. They fall in love and he wants to give up the circus a...",True,False,0.0,0.031623
166,"Four unrelated shorts by four different directors. ""Queen Sabina"" chronicles the sexual misadventures of a teenage girl on the road home. ""Queen Armenia"" centers on a self-saving opportunistic gypsy babysitter who uses her employer's kids for her own gain. The third episode, ""Queen Elena"" center...","The story follows the decadent heir Henri de Marsay, who becomes enamored of the beautiful Paquita Valdes, and his plan to seduce her. He succeeds but becomes disillusioned when he discovers she is involved with another lover, and so he plots to murder her. When he arrives to kill her, he discov...","In 1835 Paris, Ryno de Marigny (Fu'ad Aït Aattou), before marrying the young and innocent Hermangarde (Roxanne Mesquida), makes a last visit to La Vellini (Asia Argento), his Spanish mistress, to bid goodbye in an act of lovemaking. His liaison with La Vellini is the subject of Parisian gossip, ...",False,True,0.010612,0.0
52,"Bassi and Edwin are the two laziest, most irresponsible construction workers in their workers' brigade. When they fall in love with their neighbors Thea and Susi, they pretend to be reliable young men and accompany their friends to a meeting of the local chapter house. To keep up appearances, th...","Tilla Morland is a major operetta star. Celebrating with friends at a fancy restaurant, she is asked to sing the hit song from her new triumph. To her outrage one of the customers gets up and leaves during her performance. A few days later the same man, an ex-army officer, turns up as her new pr...","This is the story of a young clerk who has failed at everything he has tried in his life so far. He enters a six-day bicycle race to impress his girlfriend and hilarious hijinks ensue. He eventually wins this race, marries his girlfriend and they live happily ever after.",False,True,0.101134,0.029038


In [9]:
# Select up to 25 random samples from the failures dataframe for analysis.
# random_state pins the draw so the exported cases are identical on every run
# (the qualitative write-up refers to specific case indices); min() avoids a
# ValueError when fewer than 25 failures exist.
analysis_sample = failures_df.sample(min(25, len(failures_df)), random_state=42)

Created 'musaab_error_analysis.md' with 25 failure cases ready for your analysis.

Please open this file in a text editor to complete your qualitative analysis.


In [11]:
# Write the sampled failure cases into a markdown template for manual analysis.
# The accuracy and the case count are taken from the computed values
# (`accuracy`, `analysis_sample`) so the report cannot drift from the results.
report_parts = [f"""# Assignment 1: TF-IDF Baseline Error Analysis

**Team Member:** Muhammad Musaab ul Haq
**Baseline Accuracy:** {accuracy:.4f}

---

## Failure Case Analysis

"""]

for index, row in analysis_sample.iterrows():
    # Translate the boolean label / prediction into the choice letter
    correct_choice = 'A' if row['text_a_is_closer'] else 'B'
    model_choice = 'A' if row['predicted_text_a_is_closer'] else 'B'

    # Case header: the dataframe index identifies the example
    report_parts.append(f"### Analysis of Case {index}\n\n")
    report_parts.append(f"**Truth:** {correct_choice} | **Predicted:** {model_choice} ")
    report_parts.append(f"(A: {row['similarity_a']:.3f}, B: {row['similarity_b']:.3f})\n\n")

    # Story texts go in a collapsible <details> section to keep the file skimmable
    report_parts.append("<details>\n<summary>Story Texts</summary>\n\n")
    report_parts.append(f"**Anchor:** {row['anchor_text']}\n\n")
    report_parts.append(f"**Choice A:** {row['text_a']}\n\n")
    report_parts.append(f"**Choice B:** {row['text_b']}\n\n")
    report_parts.append("</details>\n\n")

    # Empty prompts to be filled in by hand during the qualitative analysis
    report_parts.append("**Why the Human was right:** \n\n")
    report_parts.append("**Why the TF-IDF Model failed:** \n\n")
    report_parts.append("**Error Category:** ``\n\n")
    report_parts.append("---\n\n")

# Join once at the end instead of growing a string inside the loop
markdown_template = "".join(report_parts)

with open('A1_Musaab_error_text.md', 'w', encoding='utf-8') as f:
    f.write(markdown_template)

print(f"Created 'A1_Musaab_error_text.md' with {len(analysis_sample)} failure cases.")

Created 'A1_Musaab_error_text.md' with 25 failure cases.
