# Create ChromaDB embeddings Model. 

### Installing Dependencies
Pip install libraries as required

In [None]:
import pandas as pd
import ast
import chromadb

## Fetch Cleaned Dataset into dataframe

In [17]:
df_original = pd.read_csv('./../cleaned_database/cleaned_final_dataset3.csv')

save a copy incase you modify the df.

In [6]:
df = df_original.copy()

## Seperate IDs, Documents, and Metadata

In [7]:
ids = df["id"].astype(str).tolist()
documents = df["docs"].astype(str).tolist()
metadata = df.drop(columns=["id", "docs"]).to_dict(orient="records")

### print and check some entries to ensure data has been fetched correctly

In [8]:
for i in range(min(5, len(ids))):
    print(f"\nRecord {i+1}")
    print(f"ID: {ids[i]}")
    print(f"Document: {documents[i]}...")
    print(f"Metadata keys: {list(metadata[i].keys())}")
    print(f"Metadata sample: {metadata[i]}...") 



Record 1
ID: tt0073195
Document: jaws massive killer shark unleashes chaos beach community long island local sheriff marine biologist old seafarer hunt beast royscheider robertshaw richarddreyfuss lorrainegary murrayhamilton carlgottlieb jeffreykramer susanbacklinie jonathanfilley tedgrossman stevenspielberg monsterhorror seaadventure survival adventure drama horror thriller zanuckbrownproductions universalpictures water street edgartown marthas vineyard massachusetts usa english unitedstates...
Metadata keys: ['title', 'year', 'duration', 'MPA', 'rating', 'votes', 'meta_score', 'description', 'Movie_Link', 'writers', 'directors', 'stars', 'budget', 'opening_weekend_gross', 'gross_worldwide', 'gross_us_canada', 'release_date', 'countries_origin', 'filming_locations', 'production_companies', 'awards_content', 'genres', 'languages']
Metadata sample: {'title': 'Jaws', 'year': 1975, 'duration': 124, 'MPA': 'PG', 'rating': 8.1, 'votes': '690K', 'meta_score': 87.0, 'description': "When a ma

## ChromaDB 


### Declare a local chromaDB client with path in this repository

In [None]:
chroma_client = chromadb.PersistentClient(path="./../chromadb_client")

### Create a collection

In [14]:
collection = chroma_client.create_collection(name="best_movies_database")

### Incase you have alreay created a collection eariler, you can also fetch a collection

In [None]:
collection = chroma_client.get_collection(name="best_movies_database")



### Insert into collection using (.add) function 

In [None]:
def insert_in_batches(batch_size=100):
    for i in range(0, len(ids), batch_size):
        batch_ids = ids[i:i+batch_size]
        batch_docs = documents[i:i+batch_size]
        batch_meta = metadata[i:i+batch_size]
        
        collection.add(
            ids=batch_ids,
            documents=batch_docs,
            metadatas=batch_meta
        )
        print(f"Inserted batch {i // batch_size + 1}")

In [None]:
insert_in_batches(batch_size=1000)

Inserted batch 1
Inserted batch 2
Inserted batch 3
Inserted batch 4
Inserted batch 5
Inserted batch 6
Inserted batch 7
Inserted batch 8
Inserted batch 9
Inserted batch 10
Inserted batch 11
Inserted batch 12
Inserted batch 13
Inserted batch 14
Inserted batch 15
Inserted batch 16
Inserted batch 17
Inserted batch 18
Inserted batch 19
Inserted batch 20
Inserted batch 21
Inserted batch 22
Inserted batch 23
Inserted batch 24
Inserted batch 25
Inserted batch 26
Inserted batch 27
Inserted batch 28
Inserted batch 29
Inserted batch 30
Inserted batch 31
Inserted batch 32
Inserted batch 33
Inserted batch 34
Inserted batch 35
Inserted batch 36
Inserted batch 37
Inserted batch 38
Inserted batch 39
Inserted batch 40
Inserted batch 41
Inserted batch 42
Inserted batch 43
Inserted batch 44
Inserted batch 45
Inserted batch 46
Inserted batch 47
Inserted batch 48
Inserted batch 49
Inserted batch 50
Inserted batch 51
Inserted batch 52
Inserted batch 53
Inserted batch 54
Inserted batch 55
Inserted batch 56
I

# Delete collection if you want to re-do the embeddings training. 

In [None]:
chroma_client.delete_collection(name="best_movies_database")