In [1]:
import pandas as pd
import sklearn
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


# Load the dataset
original_df = pd.read_csv('top_10000_popular_movies_tmdb.csv')

# Select the relevant columns
df = original_df[['title', 'release_date', 'genres', 'overview', 'popularity', 'revenue']]

# Create a sentinel row whose fields are all "unknown" (NaN / "[]" / 0) so that
# every missing-value branch of the sentence template is exercised at least once.
null_row = pd.DataFrame([{
    'title': 'Null movie',
    'release_date': float('nan'),
    'genres': "[]",
    'overview': float('nan'),
    'popularity': float('nan'),
    'revenue': 0
}])

# The sentinel goes first; ignore_index renumbers the rows from 0.
df = pd.concat([null_row, df], ignore_index=True)


def describe_movie(row):
    """Render one movie row as the English sentence stored in ``formatted_string``.

    Parameters
    ----------
    row : pandas.Series
        A row with the keys 'title', 'release_date', 'genres', 'overview',
        'popularity' and 'revenue'.

    Returns
    -------
    str
        A description in which each missing field is replaced by an
        "unknown ..." phrase instead of leaking ``nan`` into the text.
    """
    release = 'an unknown date' if pd.isnull(row['release_date']) else row['release_date']

    # A NaN genre used to crash on `nan + ' movie'`; treat it like the empty list "[]".
    genres = row['genres']
    if pd.isnull(genres) or genres == '[]':
        genre_phrase = 'movie with unknown genre(s)'
    else:
        genre_phrase = genres + ' movie'

    # The "unknown" literal keeps its own trailing period so that strings the
    # original template already produced stay byte-identical.
    overview = row['overview']
    plot = 'movie’s plot is unknown.' if pd.isnull(overview) else 'that is about ' + overview

    popularity = 'an unknown amount' if pd.isnull(row['popularity']) else row['popularity']

    # NaN never compares equal to 0, so a missing revenue used to be rendered
    # as "nan USD"; treat it as unknown, the same as a revenue of 0.
    revenue = row['revenue']
    if pd.isnull(revenue) or revenue == 0:
        revenue_phrase = 'an unknown amount'
    else:
        revenue_phrase = str(revenue) + ' USD'

    return (
        f"{row['title']}, released on {release}, is a {genre_phrase} with a plot of: {plot}. "
        f"It has a popularity score of {popularity}, assigned to the movie by TMDB based on user engagement. "
        f"It generated {revenue_phrase} in revenue."
    )


df['formatted_string'] = df.apply(describe_movie, axis=1)

In [2]:
# Load the models:
#   model_BAAI - sentence-embedding model used for retrieval
#   model_flan / tokenizer - seq2seq model and its tokenizer used to generate answers
model_BAAI = SentenceTransformer('BAAI/bge-small-en')
model_flan = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

# Encode the formatted strings in one batched call. Calling encode() once per
# row (via Series.apply) runs a separate forward pass for every movie and is
# far slower than letting the model batch the sentences itself.
movie_texts = df["formatted_string"].astype(str).tolist()
embedding_matrix = model_BAAI.encode(movie_texts, batch_size=64, show_progress_bar=True)

# Keep `embeddings` as a Series holding one vector per row, aligned with df's index.
embeddings = pd.Series(list(embedding_matrix), index=df.index)
df["embedding"] = embeddings

In [3]:
# List of questions
questions = [
    "What is the plot of 'The Shawshank Redemption'?",
    "Which movie has the highest revenue?",
    "Tell me about the movie released in 1994 with Tim Robbins.",
    "What genre is 'Inception'?",
    "How much revenue did 'Avatar' generate?",
    "Describe a horror movie in the dataset.",
    "Which movie was released first, 'Titanic' or 'The Matrix'?",
    "What is the most popular animated movie?",
    "Give me the plot of 'The Super Mario Bros. Movie'.",
    "Find a movie about space exploration.",
    "Which movie made the least revenue?",
    "What are some comedy movies released after 2010?",
    "Tell me about a movie with no known genre.",
    "What’s the popularity score of 'The Dark Knight'?",
    "Describe a movie with a missing release date.",
    "Which movie has the longest title?",
    "Tell me about a musical movie.",
    "Find a movie that mentions time travel.",
    "What’s the earliest release date in the dataset?",
    "Which movie has the highest popularity score?",
    "Describe a romance movie with high revenue.",
    "Find a documentary in the dataset.",
    "Tell me about a movie with a futuristic plot.",
    "What movie has the word 'dragon' in the title?",
    "Describe a movie with no overview available.",
    "Which movie has the smallest popularity score?",
    "What is the revenue of the oldest movie?",
    "Tell me about a war movie in the dataset.",
    "Find a movie about artificial intelligence.",
    "Describe a movie that has both action and sci-fi genres."
]

# Number of retrieved matches recorded per question.
TOP_K = 2
# Token limit applied to the generator's input (same 512 threshold the
# original length check used).
MAX_INPUT_TOKENS = 512

# Build the retrieval corpus once instead of once per question.
# Rows with identical formatted strings are collapsed to their first occurrence,
# so the top matches are always distinct descriptions.
corpus = df.drop_duplicates(subset="formatted_string", keep="first")
corpus_texts = corpus["formatted_string"].tolist()
corpus_embeddings = corpus["embedding"].tolist()


def build_model_inputs(query, context, max_tokens=MAX_INPUT_TOKENS):
    """Tokenize the question/context prompt so it fits within ``max_tokens``.

    When the full prompt is too long, only the retrieved context is shortened
    (by tokens, not characters), so the question and the trailing "Answer:"
    cue always survive.

    Parameters
    ----------
    query : str
        The user question.
    context : str
        The retrieved movie description placed after "Top Match:".
    max_tokens : int
        Maximum number of input tokens.

    Returns
    -------
    The tokenizer output (PyTorch tensors) ready to be passed to ``generate``.
    """
    prompt = f"Question: {query}\nTop Match: {context}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt")
    if inputs['input_ids'].shape[1] <= max_tokens:
        return inputs

    # Tokens consumed by everything except the context.
    scaffold = f"Question: {query}\nTop Match: \nAnswer:"
    scaffold_length = tokenizer(scaffold, return_tensors="pt")['input_ids'].shape[1]
    context_budget = max(max_tokens - scaffold_length, 1)

    # Trim the context to its token budget and turn it back into text.
    context_ids = tokenizer(
        context, add_special_tokens=False, truncation=True, max_length=context_budget
    )['input_ids']
    shortened_context = tokenizer.decode(context_ids, skip_special_tokens=True)

    prompt = f"Question: {query}\nTop Match: {shortened_context}\nAnswer:"
    # Re-tokenizing decoded text can differ by a token or two, so keep a hard
    # cap as a safety net.
    return tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_tokens)


# Process each question
results = []

for query in questions:
    query_embedding = model_BAAI.encode(query)

    # One vectorized cosine-similarity call against the whole corpus
    # (result has shape (n_movies, 1)), instead of one call per row.
    scores = sklearn.metrics.pairwise.cosine_similarity(
        corpus_embeddings, [query_embedding]
    )[:, 0]

    # Indices of the TOP_K highest scores; the stable sort on negated scores
    # keeps the earlier row first when scores tie.
    top_indices = (-scores).argsort(kind="stable")[:TOP_K]
    top_k_strings = [(corpus_texts[i], scores[i]) for i in top_indices]

    # Only the best match is given to the generator as context.
    inputs = build_model_inputs(query, top_k_strings[0][0])

    # Generate response
    outputs = model_flan.generate(**inputs, max_length=150, num_return_sequences=1)

    # Decode the output (batch_decode returns a list of strings; take the only one)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Store the result
    results.append({
        "Question": query,
        "Top Match 1": top_k_strings[0][0],
        "Cosine Similarity 1": top_k_strings[0][1],
        "Top Match 2": top_k_strings[1][0] if len(top_k_strings) > 1 else None,
        "Cosine Similarity 2": top_k_strings[1][1] if len(top_k_strings) > 1 else None,
        "Generated Response": response
    })

# Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('qa_pairs_results.csv', index=False)

print("Q-A pairs saved to 'qa_pairs_results.csv'!")

Q-A pairs saved to 'qa_pairs_results.csv'!


In [4]:
# Load the graded Q-A results. This file name differs from the
# 'qa_pairs_results.csv' written earlier in the notebook; presumably it is a
# manually labeled copy of that file - confirm where it comes from.
labeled_results_path = "qa_pairs_results - labeled.csv"
df1 = pd.read_csv(labeled_results_path)

In [5]:
# Preview the first five labeled rows.
df1.head(n=5)

Unnamed: 0,Question,Top Match 1,Cosine Similarity 1,Top Match 2,Cosine Similarity 2,Generated Response,Ground Truth,Retrived_Score,Response_Score
0,What is the plot of 'The Shawshank Redemption'?,"The Shawshank Redemption, released on 1994-09-...",0.890571,"24: Redemption, released on 2008-11-23, is a [...",0.828072,Andy Dufresne begins a new life at the Shawsha...,"""The Shawshank Redemption"" follows Andy Dufres...",1,1
1,Which movie has the highest revenue?,"Everest, released on 2015-09-10, is a ['Advent...",0.837667,"The Jewel of the Nile, released on 1985-08-01,...",0.837623,Top Match: Everest,"As of recent data, ""Avatar"" holds the record f...",0,0
2,Tell me about the movie released in 1994 with ...,"The Getaway, released on 1994-02-11, is a ['Ac...",0.841428,"Annie, released on 1999-11-07, is a ['Music', ...",0.8377,It has a popularity score of 18.11,"""The Shawshank Redemption"" (1994), starring Ti...",0,0
3,What genre is 'Inception'?,"Inception, released on 2010-07-15, is a ['Acti...",0.867426,"Coma, released on 2019-11-19, is a ['Science F...",0.832511,science fiction,"""Inception"" (2010) is a sci-fi, action, and th...",1,1
4,How much revenue did 'Avatar' generate?,"Avatar, released on 2009-12-15, is a ['Action'...",0.879301,"Avatar: Creating the World of Pandora, release...",0.86017,2923706026.0 USD,"""Avatar"" (2009) has grossed over $2.8 billion ...",1,1


In [7]:
# A question counts as fully correct only when both the retrieval label and
# the response label equal 1; store that as a 0/1 integer column.
retrieval_ok = df1['Retrived_Score'].eq(1)
response_ok = df1['Response_Score'].eq(1)
df1['Combined_Correct'] = (retrieval_ok & response_ok).astype(int)

In [8]:
# Preview the first five rows again, now including 'Combined_Correct'.
df1.head(n=5)

Unnamed: 0,Question,Top Match 1,Cosine Similarity 1,Top Match 2,Cosine Similarity 2,Generated Response,Ground Truth,Retrived_Score,Response_Score,Combined_Correct
0,What is the plot of 'The Shawshank Redemption'?,"The Shawshank Redemption, released on 1994-09-...",0.890571,"24: Redemption, released on 2008-11-23, is a [...",0.828072,Andy Dufresne begins a new life at the Shawsha...,"""The Shawshank Redemption"" follows Andy Dufres...",1,1,1
1,Which movie has the highest revenue?,"Everest, released on 2015-09-10, is a ['Advent...",0.837667,"The Jewel of the Nile, released on 1985-08-01,...",0.837623,Top Match: Everest,"As of recent data, ""Avatar"" holds the record f...",0,0,0
2,Tell me about the movie released in 1994 with ...,"The Getaway, released on 1994-02-11, is a ['Ac...",0.841428,"Annie, released on 1999-11-07, is a ['Music', ...",0.8377,It has a popularity score of 18.11,"""The Shawshank Redemption"" (1994), starring Ti...",0,0,0
3,What genre is 'Inception'?,"Inception, released on 2010-07-15, is a ['Acti...",0.867426,"Coma, released on 2019-11-19, is a ['Science F...",0.832511,science fiction,"""Inception"" (2010) is a sci-fi, action, and th...",1,1,1
4,How much revenue did 'Avatar' generate?,"Avatar, released on 2009-12-15, is a ['Action'...",0.879301,"Avatar: Creating the World of Pandora, release...",0.86017,2923706026.0 USD,"""Avatar"" (2009) has grossed over $2.8 billion ...",1,1,1


In [10]:
# Each accuracy is the mean of a score column in df1.
# The mapping is ordered so the lines print as Combined, Retrieval, Response.
accuracy_columns = {
    "Combined": 'Combined_Correct',
    "Retrieval": 'Retrived_Score',
    "Response": 'Response_Score',
}

accuracies = {}
for label, column in accuracy_columns.items():
    accuracies[label] = df1[column].mean()
    print(f"{label} Accuracy: {accuracies[label]}")

# Keep the individual names available for later use.
combined_accuracy = accuracies["Combined"]
retrieval_accuracy = accuracies["Retrieval"]
response_accuracy = accuracies["Response"]

Combined Accuracy: 0.3333333333333333
Retrieval Accuracy: 0.5
Response Accuracy: 0.3333333333333333
