In [8]:
%load_ext dotenv
%dotenv

In [1]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
files = pd.read_csv("course_section_descriptions.csv", encoding = "ANSI")

In [3]:
files["unique_id"] = files["course_id"].astype(str) + '-' + files["section_id"].astype(str)

In [4]:
files["metadata"] = files.apply(lambda row: {
    "course_name": row["course_name"],
    "section_name": row["section_name"],
    "section_description": row["section_description"],
}, axis = 1)

In [5]:
def create_embeddings(row):
    combined_text = f'''{row["course_name"]} {row["course_technology"]}
                        {row["course_description"]} {row["section_name"]}{row["section_description"]}'''
    return model.encode(combined_text, show_progress_bar = False)

In [6]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
files["embedding"] = files.apply(create_embeddings, axis = 1)

## Upserting data to Pinecone

In [9]:
load_dotenv(find_dotenv(), override = True)

True

In [10]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [11]:
index_name = "my-index"
dimension = 384
metric = "cosine"

In [12]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} succesfully deleted.")
else:
     print(f"{index_name} not in index list.")

my-index succesfully deleted.


In [13]:
pc.create_index(
    name = index_name, 
    dimension = dimension, 
    metric = metric, 
    spec = ServerlessSpec(
        cloud = "aws", 
        region = "us-east-1")
    )

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-9ukxalk.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "access-control-allow-origin": "*",
            "vary": "access-control-request-headers",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-10",
            "x-cloud-trace-context": "6

In [14]:
index = pc.Index(index_name)

In [24]:
vectors_to_upsert = [(row["unique_id"], row["embedding"].tolist(), row["metadata"]) for index, row in files.iterrows()]

In [25]:
index.upsert(vectors = vectors_to_upsert)
print("Data succesfully upserted to Pinecone index")

Data succesfully upserted to Pinecone index


In [26]:
query = "clustering"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

In [27]:
query_results = index.query(
    vector = [query_embedding],
    top_k = 12,
    include_metadata=True
)

In [28]:
score_threshold = 0.3

In [29]:
# Assuming query_results are fetched and include metadata
for match in query_results['matches']:
    if match['score'] >= score_threshold:
        course_details = match.get('metadata', {})
        course_name = course_details.get('course_name', 'N/A')
        section_name = course_details.get('section_name', 'N/A')
        section_description = course_details.get('section_description', 'No description available')
        
        print(f"Matched item ID: {match['id']}, Score: {match['score']}")
        print(f"Course: {course_name} \nSection: {section_name} \nDescription: {section_description}")

Matched item ID: 51-469, Score: 0.561393678
Course: Machine Learning in Excel 
Section: Cluster Analysis 
Description: Cluster analysis is the most intuitive and important example of unsupervised learning. However, to be able to understand cluster analysis, you must first become familiar with the mathematics behind it. Here we will explore the fundamentals of cluster analysis and have a look at the differences between clustering and classification.
Matched item ID: 37-374, Score: 0.543558061
Course: Machine Learning in Python 
Section: Other Types of Clustering 
Description: In previous sections, we focus extensively on k-means clustering, as it is the fastest and most efficient method for clustering. In this section, we explore other approaches that are less common.
Matched item ID: 51-470, Score: 0.509141862
Course: Machine Learning in Excel 
Section: K-means Clustering 
Description: Master K-means clustering in Excel by learning how to choose the number of clusters in your analysis 