In [None]:
!pip install datasets pandas pymongo sentence_transformers
!pip install -U transformers
# Install below if using GPU
!pip install accelerate

In [None]:
!pip install --upgrade datasets fsspec

## 1. Load the movie dataset from Hugging Face

In [14]:
# Pull the movie dataset from the Hugging Face Hub
from datasets import load_dataset
import pandas as pd

# Dataset card: https://huggingface.co/datasets/MongoDB/embedded_movies
dataset = load_dataset("MongoDB/embedded_movies")

# Materialise the "train" split as a pandas DataFrame
dataset_df = pd.DataFrame(dataset["train"])

# Preview the first five rows (five is the default for head())
dataset_df.head()

Unnamed: 0,plot,runtime,genres,fullplot,directors,writers,countries,poster,languages,cast,title,num_mflix_comments,rated,imdb,awards,type,metacritic,plot_embedding
0,Young Pauline is left a lot of money when her ...,199.0,[Action],Young Pauline is left a lot of money when her ...,"[Louis J. Gasnier, Donald MacKenzie]","[Charles W. Goddard (screenplay), Basil Dickey...",[USA],https://m.media-amazon.com/images/M/MV5BMzgxOD...,[English],"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",The Perils of Pauline,0,,"{'id': 4465, 'rating': 7.6, 'votes': 744}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.0007293965299999999, -0.026834568000000003,..."
1,A penniless young man tries to save an heiress...,22.0,"[Comedy, Short, Action]",As a penniless man worries about how he will m...,"[Alfred J. Goulding, Hal Roach]",[H.M. Walker (titles)],[USA],https://m.media-amazon.com/images/M/MV5BNzE1OW...,[English],"[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",From Hand to Mouth,0,TV-G,"{'id': 10146, 'rating': 7.0, 'votes': 639}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.022837115, -0.022941574000000003, 0.014937..."
2,"Michael ""Beau"" Geste leaves England in disgrac...",101.0,"[Action, Adventure, Drama]","Michael ""Beau"" Geste leaves England in disgrac...",[Herbert Brenon],"[Herbert Brenon (adaptation), John Russell (ad...",[USA],,[English],"[Ronald Colman, Neil Hamilton, Ralph Forbes, A...",Beau Geste,0,,"{'id': 16634, 'rating': 6.9, 'votes': 222}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.00023330492999999998, -0.028511643000000003..."
3,"Seeking revenge, an athletic young man joins t...",88.0,"[Adventure, Action]",A nobleman vows to avenge the death of his fat...,[Albert Parker],"[Douglas Fairbanks (story), Jack Cunningham (a...",[USA],https://m.media-amazon.com/images/M/MV5BMzU0ND...,,"[Billie Dove, Tempe Pigott, Donald Crisp, Sam ...",The Black Pirate,1,,"{'id': 16654, 'rating': 7.2, 'votes': 1146}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[-0.005927917, -0.033394486, 0.0015323418, -0...."
4,An irresponsible young millionaire changes his...,58.0,"[Action, Comedy, Romance]","The Uptown Boy, J. Harold Manners (Lloyd) is a...",[Sam Taylor],"[Ted Wilde (story), John Grey (story), Clyde B...",[USA],https://m.media-amazon.com/images/M/MV5BMTcxMT...,[English],"[Harold Lloyd, Jobyna Ralston, Noah Young, Jim...",For Heaven's Sake,0,PASSED,"{'id': 16895, 'rating': 7.6, 'votes': 918}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.0059373598, -0.026604708, -0.0070914757000..."


In [15]:
# (rows, columns) of the freshly loaded DataFrame
dataset_df.shape

(1500, 18)

In [16]:
# Keep only rows that have a full plot. Resetting the index keeps label-based
# and positional access in agreement after rows have been removed.
dataset_df = dataset_df.dropna(subset=['fullplot']).reset_index(drop=True)
# Show the number of missing values in each column
print(dataset_df.isna().sum())

# Drop the dataset's pre-computed plot_embedding column.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
dataset_df = dataset_df.drop(columns=['plot_embedding'], errors='ignore')
dataset_df.columns

plot                    0
runtime                14
genres                  0
fullplot                0
directors              12
writers                13
countries               0
poster                 78
languages               1
cast                    1
title                   0
num_mflix_comments      0
rated                 279
imdb                    0
awards                  0
type                    0
metacritic            893
plot_embedding          1
dtype: int64


Index(['plot', 'runtime', 'genres', 'fullplot', 'directors', 'writers',
       'countries', 'poster', 'languages', 'cast', 'title',
       'num_mflix_comments', 'rated', 'imdb', 'awards', 'type', 'metacritic'],
      dtype='object')

In [17]:
# (rows, columns) after removing rows without `fullplot` and the `plot_embedding` column
dataset_df.shape

(1452, 17)

## 2. Load the BERT-based embedding model thenlper/gte-large from Hugging Face and embed `fullplot`

In [18]:
# Load the sentence-embedding model used to embed `fullplot` into a new dataset column
from sentence_transformers import SentenceTransformer
# Model card: https://huggingface.co/thenlper/gte-large
embedding_model = SentenceTransformer('thenlper/gte-large')

# Return the embedding as a plain list of floats
def get_embedding(text:str)-> list[float]:
  """Embed `text` with the module-level `embedding_model`.

  Args:
    text: The text to embed.

  Returns:
    The embedding as a plain list of floats, or an empty list when `text`
    is not a string or contains only whitespace.
  """
  # Non-string values (e.g. None or NaN coming from a DataFrame) have no
  # .strip(); treat them like empty text instead of raising AttributeError.
  if not isinstance(text, str) or not text.strip():
    print("Attempted to get embedding for empty text.")
    return []
  return embedding_model.encode(text).tolist()

# Embed every plot element-wise and store the vectors in a new column
dataset_df['fullplot_embedding'] = dataset_df['fullplot'].map(get_embedding)
dataset_df.head()

Unnamed: 0,plot,runtime,genres,fullplot,directors,writers,countries,poster,languages,cast,title,num_mflix_comments,rated,imdb,awards,type,metacritic,fullplot_embedding
0,Young Pauline is left a lot of money when her ...,199.0,[Action],Young Pauline is left a lot of money when her ...,"[Louis J. Gasnier, Donald MacKenzie]","[Charles W. Goddard (screenplay), Basil Dickey...",[USA],https://m.media-amazon.com/images/M/MV5BMzgxOD...,[English],"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",The Perils of Pauline,0,,"{'id': 4465, 'rating': 7.6, 'votes': 744}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[-0.009285837411880493, -0.005062109790742397,..."
1,A penniless young man tries to save an heiress...,22.0,"[Comedy, Short, Action]",As a penniless man worries about how he will m...,"[Alfred J. Goulding, Hal Roach]",[H.M. Walker (titles)],[USA],https://m.media-amazon.com/images/M/MV5BNzE1OW...,[English],"[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",From Hand to Mouth,0,TV-G,"{'id': 10146, 'rating': 7.0, 'votes': 639}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.0024393806234002113, 0.02309592440724373, ..."
2,"Michael ""Beau"" Geste leaves England in disgrac...",101.0,"[Action, Adventure, Drama]","Michael ""Beau"" Geste leaves England in disgrac...",[Herbert Brenon],"[Herbert Brenon (adaptation), John Russell (ad...",[USA],,[English],"[Ronald Colman, Neil Hamilton, Ralph Forbes, A...",Beau Geste,0,,"{'id': 16634, 'rating': 6.9, 'votes': 222}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.012204288505017757, -0.011455747298896313, ..."
3,"Seeking revenge, an athletic young man joins t...",88.0,"[Adventure, Action]",A nobleman vows to avenge the death of his fat...,[Albert Parker],"[Douglas Fairbanks (story), Jack Cunningham (a...",[USA],https://m.media-amazon.com/images/M/MV5BMzU0ND...,,"[Billie Dove, Tempe Pigott, Donald Crisp, Sam ...",The Black Pirate,1,,"{'id': 16654, 'rating': 7.2, 'votes': 1146}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.00454134214669466, -0.0006100605824030936, ..."
4,An irresponsible young millionaire changes his...,58.0,"[Action, Comedy, Romance]","The Uptown Boy, J. Harold Manners (Lloyd) is a...",[Sam Taylor],"[Ted Wilde (story), John Grey (story), Clyde B...",[USA],https://m.media-amazon.com/images/M/MV5BMTcxMT...,[English],"[Harold Lloyd, Jobyna Ralston, Noah Young, Jim...",For Heaven's Sake,0,PASSED,"{'id': 16895, 'rating': 7.6, 'votes': 918}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.00222560903057456, 0.011567801237106323, -..."


In [19]:
import numpy as np

# Inspect the dimensionality of the first embedding.
# .iloc selects by position, so this still works when the row labelled 0
# has been removed by an earlier dropna.
print(np.array(dataset_df['fullplot_embedding'].iloc[0]).shape)

(1024,)


## 3. Connect to MongoDB Atlas from Google Colab and store the dataset
- MongoDB Atlas vector index on `fullplot_embedding`
- Similarity: cosine

In [20]:
import pymongo
from google.colab import userdata
# Connect to MongoDB

# Connect to MongoDB Atlas
def get_mongo_client(mongo_uri):
  """Connect to MongoDB and verify that the deployment is reachable.

  Args:
    mongo_uri: MongoDB connection string.

  Returns:
    A `pymongo.MongoClient` that answered a ping, or None when the
    connection could not be established.
  """
  client = None
  try:
    client = pymongo.MongoClient(mongo_uri, appname="devrel.content.python")
    # MongoClient connects lazily, so constructing it proves nothing.
    # A ping forces a round trip and surfaces network/auth problems here.
    client.admin.command("ping")
    print("Connection to MongoDB successful")
    return client
  except pymongo.errors.PyMongoError as e:
    # Covers ConnectionFailure (incl. server selection timeouts) as well as
    # configuration and authentication errors raised by the driver.
    print(f"Connection failed: {e}")
    if client is not None:
      client.close()  # do not leak the half-initialised client
    return None

# Read the connection string from Colab secrets so credentials never live in
# the notebook source.
# SECURITY: a password that was ever saved inside a notebook must be treated
# as leaked and rotated in Atlas.
mongo_uri = userdata.get("MONGO_URI")
if not mongo_uri:
  # Fail fast: continuing without a URI would only fail later and less clearly.
  raise ValueError("MONGO_URI not set in Colab secrets")

mongo_client = get_mongo_client(mongo_uri)

Connection to MongoDB successful


In [21]:
# Ingest data into MongoDB
db = mongo_client['DataBase']
collection = db['Movie']

In [22]:
# Number of documents currently stored in the Movie collection
collection.count_documents({})

1452

In [23]:
# Delete any existing records in the collection
collection.delete_many({})

DeleteResult({'n': 1452, 'electionId': ObjectId('7fffffff0000000000000027'), 'opTime': {'ts': Timestamp(1750614360, 146), 't': 39}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1750614360, 146), 'signature': {'hash': b"\xe8\xbf\xcc\x0e'\xaa\x86\x02+\x83S\xc7/W<\x0b\x146\xc5\xcc", 'keyId': 7491280248286216197}}, 'operationTime': Timestamp(1750614360, 146)}, acknowledged=True)

In [24]:
# Turn each DataFrame row into a dict and bulk-insert them into the collection
documents = dataset_df.to_dict(orient="records")
collection.insert_many(documents)

print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [25]:
# Number of documents in the Movie collection after ingestion
collection.count_documents({})

1452

In [26]:
def vector_search(user_query, collection, limit=4, num_candidates=150):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    limit (int): Maximum number of documents to return (default 4).
    num_candidates (int): Number of candidate matches considered by the
        search before the top `limit` are returned (default 150).

    Returns:
    list: A list of matching documents, each holding `title`, `genres`,
    `fullplot` and the similarity `score`. An empty list is returned when
    the query is empty or no embedding could be generated.
    """

    # Reject unusable queries before doing any embedding or database work.
    if not isinstance(user_query, str) or not user_query.strip():
        return []

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding signals failure with an empty list, not None. Always
    # return a list so callers can iterate over the result unconditionally.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    vector_search_stage = {
        "$vectorSearch": {
            "index": "vector_index",
            "queryVector": query_embedding,
            "path": "fullplot_embedding",
            "numCandidates": num_candidates,  # Number of candidate matches to consider
            "limit": limit  # Return the top `limit` matches
        }
    }

    # An inclusion-style $project emits only the listed fields, so the
    # embedding vector is dropped without needing a separate $unset stage.
    project_stage = {
        "$project": {
            "_id": 0,  # Exclude the _id field
            "fullplot": 1,  # Include the fullplot field
            "title": 1,  # Include the title field
            "genres": 1,  # Include the genres field
            "score": {
                "$meta": "vectorSearchScore"  # Include the search score
            }
        }
    }

    pipeline = [vector_search_stage, project_stage]

    # Execute the search
    return list(collection.aggregate(pipeline))

In [27]:
def get_search_result(query, collection):
    """Run a vector search for `query` and format the hits as prompt context.

    Every hit is printed for inspection and rendered as one
    "Title: ..., Plot: ..." line; the lines are concatenated into a single
    string, which is returned.
    """
    formatted_hits = []
    for hit in vector_search(query, collection):
        print('---result', hit)
        title = hit.get('title', 'N/A')
        plot = hit.get('fullplot', 'N/A')
        formatted_hits.append(f"Title: {title}, Plot: {plot}\n")

    return "".join(formatted_hits)

## 4. Vector search in MongoDB Atlas (`$vectorSearch`)

In [28]:
%%time
# Run the query and retrieve the source documents that will serve as context
query = "What is the worst action movie to watch and why?. Pick one of 4 below:"
source_information = get_search_result(query, collection)
source_information

---result {'genres': ['Action', 'Comedy', 'Romance'], 'fullplot': 'A movie about a good-for-nothing stuntman called Mike Gaucher who can\'t do one single job without something going wrong. His fiancèe wants to kill him, the film studios don\'t want to give him any more work and, as King Kong for a promotion in a supermarket, he winds up destroying some display shelves. The only role he plays well is pretending not to be able to hear, speak and walk in order to get money from the government. Then along comes the "beautiful" actor called Bruno Ferrari. Supposedly a fearless stuntman, Ferrari turns out to be afraid of heights once his feet leave the ground. So a stuntman is needed to prevent a scandal. And who is the man who looks most (or exactly) like Bruno? You guessed it, good-for-nothing Mike...', 'title': 'Animal', 'score': 0.9146993160247803}
---result {'genres': ['Action', 'Comedy', 'Crime'], 'fullplot': "Orin Boyd (Seagal) is a Detroit cop who doesn't follow rules. After he saved

'Title: Animal, Plot: A movie about a good-for-nothing stuntman called Mike Gaucher who can\'t do one single job without something going wrong. His fiancèe wants to kill him, the film studios don\'t want to give him any more work and, as King Kong for a promotion in a supermarket, he winds up destroying some display shelves. The only role he plays well is pretending not to be able to hear, speak and walk in order to get money from the government. Then along comes the "beautiful" actor called Bruno Ferrari. Supposedly a fearless stuntman, Ferrari turns out to be afraid of heights once his feet leave the ground. So a stuntman is needed to prevent a scandal. And who is the man who looks most (or exactly) like Bruno? You guessed it, good-for-nothing Mike...\nTitle: Exit Wounds, Plot: Orin Boyd (Seagal) is a Detroit cop who doesn\'t follow rules. After he saved the Vice President by violating every order he received he is transferred to one of the worst precincts in the city. There he quick

In [29]:
# Build the LLM prompt: the user query followed by the retrieved titles and plots
combined_information = f"Query: {query} \n {source_information}."

print(combined_information)

Query: What is the worst action movie to watch and why?. Pick one of 4 below: 
 Title: Animal, Plot: A movie about a good-for-nothing stuntman called Mike Gaucher who can't do one single job without something going wrong. His fiancèe wants to kill him, the film studios don't want to give him any more work and, as King Kong for a promotion in a supermarket, he winds up destroying some display shelves. The only role he plays well is pretending not to be able to hear, speak and walk in order to get money from the government. Then along comes the "beautiful" actor called Bruno Ferrari. Supposedly a fearless stuntman, Ferrari turns out to be afraid of heights once his feet leave the ground. So a stuntman is needed to prevent a scandal. And who is the man who looks most (or exactly) like Bruno? You guessed it, good-for-nothing Mike...
Title: Exit Wounds, Plot: Orin Boyd (Seagal) is a Detroit cop who doesn't follow rules. After he saved the Vice President by violating every order he received 

## 5. Log in and load the LLM from Hugging Face (Google Gemma)

In [17]:
# Authenticate with the Hugging Face Hub; login() shows an interactive prompt
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and model are both loaded from the google/gemma-2b-it checkpoint
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
# CPU only: uncomment the line below and use it instead of the GPU variant
# model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
# GPU: load with device_map="auto"
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

## 6. Generate

In [31]:
# Moving tensors to GPU
%%time
input_ids = tokenizer(combined_information, return_tensors="pt").to("cuda")
response = model.generate(**input_ids, max_new_tokens=500)
print(tokenizer.decode(response[0]))

<bos>Query: What is the worst action movie to watch and why?. Pick one of 4 below: 
 Title: Animal, Plot: A movie about a good-for-nothing stuntman called Mike Gaucher who can't do one single job without something going wrong. His fiancèe wants to kill him, the film studios don't want to give him any more work and, as King Kong for a promotion in a supermarket, he winds up destroying some display shelves. The only role he plays well is pretending not to be able to hear, speak and walk in order to get money from the government. Then along comes the "beautiful" actor called Bruno Ferrari. Supposedly a fearless stuntman, Ferrari turns out to be afraid of heights once his feet leave the ground. So a stuntman is needed to prevent a scandal. And who is the man who looks most (or exactly) like Bruno? You guessed it, good-for-nothing Mike...
Title: Exit Wounds, Plot: Orin Boyd (Seagal) is a Detroit cop who doesn't follow rules. After he saved the Vice President by violating every order he rece