In [107]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import sklearn

# Read the TMDB export and keep only the fields used to describe each movie
original_df = pd.read_csv('top_10000_popular_movies_tmdb.csv')
wanted_columns = ['title', 'release_date', 'genres', 'overview', 'popularity', 'revenue']
df = original_df[wanted_columns]

# Build a one-row frame for a placeholder movie: every field is either NaN,
# an empty genre list, or a revenue of 0
nan = float('nan')
null_row = pd.DataFrame([
    dict(
        title='Null movie',
        release_date=nan,
        genres="[]",
        overview=nan,
        popularity=nan,
        revenue=0,
    )
])

# Put the placeholder first and renumber the index from 0
df = pd.concat([null_row, df], ignore_index=True)

# Report, per column, how many rows have a missing value
for feature in df.columns:
    n_missing = int(df[feature].isnull().sum())
    print(f"Number of missing values in {feature}: {n_missing}")

def _describe_movie(row):
    """Render one movie row as a natural-language description.

    Parameters
    ----------
    row : pandas.Series
        A row with the keys 'title', 'release_date', 'genres', 'overview',
        'popularity' and 'revenue'.

    Returns
    -------
    str
        A description in which every missing field is replaced by an
        "unknown" phrase instead of a literal "nan".
    """
    release = 'an unknown date' if pd.isnull(row['release_date']) else row['release_date']
    # genres is compared as text: the literal string "[]" means no genres
    genre_part = 'movie with unknown genre(s)' if row['genres'] == '[]' else f"{row['genres']} movie"
    plot = "movie's plot is unknown." if pd.isnull(row['overview']) else f"that is about {row['overview']}"
    popularity = 'an unknown amount' if pd.isnull(row['popularity']) else row['popularity']

    # Revenue can be missing in two ways: as 0 or as NaN. Checking only for 0
    # would render NaN revenues as "nan USD".
    revenue = row['revenue']
    revenue_part = 'an unknown amount' if pd.isnull(revenue) or revenue == 0 else f"{revenue} USD"

    return (
        f"{row['title']}, released on {release}, is a {genre_part} with a plot of: {plot} "
        f"It has a popularity score of {popularity}, assigned to the movie by TMDB based on user engagement. "
        f"It generated {revenue_part} in revenue."
    )


# Convert each row to a formatted string and store in a new column
df['formatted_string'] = df.apply(_describe_movie, axis=1)


Number of missing values in title: 0
Number of missing values in release_date: 24
Number of missing values in genres: 0
Number of missing values in overview: 78
Number of missing values in popularity: 2
Number of missing values in revenue: 2


In [108]:
# Load the sentence-embedding model
model_BAAI = SentenceTransformer('BAAI/bge-small-en')

# Encode all formatted strings in a single call so the model can process them
# in batches; calling encode() once per row through Series.apply is far slower
# on a dataset of this size.
formatted_strings = df["formatted_string"].astype(str).tolist()
embedding_matrix = model_BAAI.encode(formatted_strings, show_progress_bar=True)

# Add the embeddings to the DataFrame: one 1-D vector per row, in row order
df["embedding"] = list(embedding_matrix)

# Keep `embeddings` available as a Series of per-row vectors, as before
embeddings = df["embedding"]


In [153]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics.pairwise` has been loaded.
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_flan = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

# Define the query
query = "Give me the plot of the super mario bros movie."
#query = "Which movie was released first? the Dark Knight or Crater"
#query = "how much revenue did the movie the dark knight make?"

query_embedding = model_BAAI.encode(query)

# Compute the cosine similarity of every movie embedding to the query in one
# vectorized call instead of one call per row.
similarities = cosine_similarity(list(df["embedding"]), [query_embedding])[:, 0]

# Map each formatted string to its similarity (identical strings share one entry)
similarity_dict = dict(zip(df["formatted_string"], similarities))

# Sort the similarities in descending order and get the top k formatted strings
k = 2
top_k_strings = sorted(similarity_dict.items(), key=lambda item: item[1], reverse=True)[:k]

# Print the top k formatted strings with their cosine similarities
print(f"Top {k} formatted strings with the highest cosine similarities:")
for formatted_string, similarity in top_k_strings:
    print(f"Movie: {formatted_string}")
    print(f"Cosine similarity: {similarity}\n")

# Build the prompt context from the movie descriptions only. Interpolating
# `top_k_strings` directly would put the repr of a list of (text, score)
# tuples, including the similarity floats, into the prompt.
context = " ".join(formatted_string for formatted_string, _ in top_k_strings)
instruction_prompt = (
    f"Based on the following information, {query}: {context} "
    "Don't make up any new information, just use the information given."
)

inputs = tokenizer(instruction_prompt, return_tensors="pt")
# Set an explicit generation budget; without one, generate() falls back to the
# default length limit (20 tokens in transformers unless the model's generation
# config overrides it), which can cut answers short.
outputs = model_flan.generate(**inputs, max_new_tokens=128)
print(f"{query}:")
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Top 2 formatted strings with the highest cosine similarities:
Movie: The Super Mario Bros. Movie, released on 2023-04-05, is a ['Animation', 'Family', 'Adventure', 'Fantasy', 'Comedy'] movie with a plot of: that is about While working underground to fix a water main, Brooklyn plumbers—and brothers—Mario and Luigi are transported down a mysterious pipe and wander into a magical new world. But when the brothers are separated, Mario embarks on an epic quest to find Luigi. It has a popularity score of 3394.458, assigned to the movie by TMDB based on user engagement. It generated 1308766975.0 USD in revenue.
Cosine similarity: 0.870617151260376

Movie: Super Mario Bros., released on 1993-05-28, is a ['Adventure', 'Fantasy', 'Comedy', 'Family', 'Science Fiction'] movie with a plot of: that is about Mario and Luigi, plumbers from Brooklyn, find themselves in an alternate universe where evolved dinosaurs live in hi-tech squalor. They're the only hope to save our universe from invasion by the d