# Data Preprocessing

In [None]:
from datasets import load_dataset
import pandas as pd

# Fetch the embedded-movies dataset.
# Dataset card: https://huggingface.co/datasets/MongoDB/embedded_movies
dataset = load_dataset("MongoDB/embedded_movies")

# Turn the "train" split into a pandas dataframe for the rest of the notebook
train_split = dataset["train"]
dataset_df = pd.DataFrame(train_split)

# Preview the first five rows
dataset_df.head(n=5)

In [None]:
# Discard every row that has no "fullplot" value
dataset_df = dataset_df.dropna(subset=["fullplot"])

# Report how many nulls remain in each column
null_counts = dataset_df.isnull().sum()
print("\nNumber of missing values in each column after removal:")
print(null_counts)

# The dataset's own plot_embedding column is discarded: new embeddings are
# created with an open source embedding model from Hugging Face instead
dataset_df = dataset_df.drop(columns=["plot_embedding"])
dataset_df.head(n=5)

In [None]:
from sentence_transformers import SentenceTransformer

# Identifier of the pre-trained sentence-embedding model
EMBEDDING_MODEL_NAME = "thenlper/gte-large"

# Instantiate the pre-trained model (and its tokenizer) once for the whole notebook
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)


def get_embedding(text: str) -> list[float]:
    """Encode ``text`` with the module-level ``embedding_model``.

    Empty or whitespace-only input is never sent to the model: a notice is
    printed and an empty list is returned instead.

    Args:
        text: The string to embed.

    Returns:
        The model output converted with ``tolist()``, or ``[]`` for blank
        input.
    """
    has_content = bool(text.strip())
    if has_content:
        vector = embedding_model.encode(text)
        return vector.tolist()

    # Blank input: report it and hand back an empty embedding
    print("Attempted to get embedding for empty text.")
    return []


# Embed each movie's full plot and store the result in a new "embedding" column
plot_embeddings = dataset_df["fullplot"].apply(get_embedding)
dataset_df["embedding"] = plot_embeddings

dataset_df.head()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def query_system(query: str, df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the ``top_n`` rows of ``df`` most similar to ``query``.

    The query is embedded with ``get_embedding`` and compared, by cosine
    similarity, against the vectors stored in ``df["embedding"]``.

    Args:
        query: Free-text search query.
        df: Dataframe with an "embedding" column holding one vector per row.
        top_n: Number of best-matching rows to return.

    Returns:
        A new dataframe containing the ``top_n`` highest-scoring rows, sorted
        by descending similarity, with the scores in a "similarity" column.
        ``df`` itself is left unmodified.

    Raises:
        ValueError: If ``query`` is empty or whitespace-only, because
            ``get_embedding`` then returns no vector to compare against.
    """
    # Get the embedding of the query
    query_embedding = get_embedding(query)
    if not query_embedding:
        # Fail with a clear message rather than an opaque shape error from cosine_similarity
        raise ValueError("Cannot run a similarity search for an empty query.")

    # Compute the cosine similarity between the query and all embeddings in the dataframe
    similarities = cosine_similarity([query_embedding], df["embedding"].tolist())[0]

    # assign() returns a new dataframe, so the caller's dataframe does not get
    # a "similarity" column as a hidden side effect of running a query
    scored = df.assign(similarity=similarities)

    # Sort by similarity and return the top results
    return scored.sort_values(by="similarity", ascending=False).head(top_n)


# Example question used to exercise the retrieval step
query = "What are some good romantic movies to watch?"

# Retrieve the best-matching movies from the embedded dataset
top_results = query_system(query=query, df=dataset_df)

top_results

# LLM querying

In [None]:
from openai import OpenAI


def _build_movie_context(top_docs: pd.DataFrame) -> str:
    """Render one "Title / Plot / Genres" line per row of ``top_docs``."""
    rows = zip(
        top_docs["title"].tolist(),
        top_docs["fullplot"].tolist(),
        top_docs["genres"].tolist(),
    )
    return "".join(
        f"Title:{title},  Plot: {plot}, Genres: {genres}\n"
        for title, plot, genres in rows
    )


def query_gemma(
    original_query: str,
    top_docs: pd.DataFrame,
    *,
    base_url: str = "http://localhost:1234/v1",
    api_key: str = "lm-studio",
    model: str = "local-model",
    temperature: float = 0.7,
):
    """Ask a chat model to answer ``original_query`` using ``top_docs`` as context.

    The "title", "fullplot" and "genres" columns of ``top_docs`` are rendered
    into a text context that is embedded in the system message; the original
    query is sent as the user message. The context is also printed.

    NOTE(review): the defaults point at an OpenAI-compatible server on
    localhost:1234 -- presumably LM Studio serving a Gemma model, going by
    the function name and API key; confirm against the local setup.

    Args:
        original_query: The user's question.
        top_docs: Retrieved movies; must have "title", "fullplot" and
            "genres" columns.
        base_url: Base URL of the OpenAI-compatible endpoint.
        api_key: API key passed to the client.
        model: Model name sent with the request.
        temperature: Sampling temperature sent with the request.

    Returns:
        The completion object returned by ``client.chat.completions.create``.
    """
    context = _build_movie_context(top_docs)

    # Echo the retrieved context so it can be inspected next to the model's answer
    print(context)

    client = OpenAI(base_url=base_url, api_key=api_key)
    return client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": f"Given this csv about movies in my system {context} answer the following query",
            },
            {"role": "user", "content": original_query},
        ],
        temperature=temperature,
    )

In [None]:
# Send the example query to the model, with the retrieved movies as context
temp = query_gemma(query, top_results)

In [None]:
# Display the completion object returned by query_gemma
temp