In [1]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
files = pd.read_csv("course_descriptions.csv", encoding = "ANSI")

In [3]:
def create_course_description(row):
    return f'''The course name is {row["course_name"]}, the slug is {row["course_slug"]},
            the technology is {row["course_technology"]} and the course topic is {row["course_topic"]}'''

In [4]:
pd.set_option('display.max_rows', 106)
files['course_description_new'] = files.apply(create_course_description, axis = 1)
print(files["course_description_new"])

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
5      The course name is Data Cleaning and Preproces...
6      The course name is Introduction to Business An...
7      The course name is Data Analysis with Excel Pi...
8      The course name is SQL, the slug is sql,\n    ...
9      The course name is Credit Risk Modeling in Pyt...
10     The course name is Python Programmer Bootcamp,...
11     The course name is SQL + Tableau + Python, the...
12     The course name is Introduction to Jupyter, th...
13     The course name is Statistics, the slug is sta...
14     The course name is Mathematics, the slug is ma...
15     The course name is Introduction to Excel, the ...
16     The course name is Probability, the slug is pr...
17     The course name is Start

In [5]:
# %load_ext dotenv
# %dotenv

In [6]:
load_dotenv(find_dotenv(), override = True)

True

In [7]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [8]:
index_name = "my-index"
dimension = 384
metric = "cosine"

In [9]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} succesfully deleted.")
else:
     print(f"{index_name} not in index list.")

my-index succesfully deleted.


In [10]:
pc.create_index(
    name = index_name, 
    dimension = dimension, 
    metric = metric, 
    spec = ServerlessSpec(
        cloud = "aws", 
        region = "us-east-1")
    )

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-9ukxalk.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "access-control-allow-origin": "*",
            "vary": "access-control-request-headers",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-10",
            "x-cloud-trace-context": "3

In [11]:
index = pc.Index(index_name)

## Embedding the data

In [12]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [13]:
def create_embeddings(row):
    combined_text = ' '.join([str(row[field]) for field in ['course_description', 'course_description_new', 'course_description_short']])
    embedding = model.encode(combined_text, show_progress_bar = False)
    return embedding

In [14]:
files["embedding"] = files.apply(create_embeddings, axis = 1)

In [15]:
vectors_to_upsert = [(str(row["course_name"]), row["embedding"].tolist()) for _, row in files.iterrows()]
index.upsert(vectors = vectors_to_upsert)

print("Data upserted to Pinecone index")

Data upserted to Pinecone index


## Semantic search

In [16]:
query = "clustering"
query_embedding = model.encode(query, show_progress_bar = False).tolist()

In [21]:
query_results = index.query(
    vector = [query_embedding],
    top_k = 12, 
    include_values = True
)

In [22]:
query_results

QueryResponse(matches=[{'id': 'Machine Learning in Excel',
 'score': 0.354953736,
 'values': [-0.0183002315,
            -0.0279485881,
            -0.0253203437,
            -0.0126938531,
            -0.0240366571,
            -0.0219840594,
            -0.051123742,
            -0.0535799824,
            0.009976523,
            0.0282286424,
            -0.0408324972,
            -0.0362686291,
            0.0683277175,
            -0.0348471142,
            -0.00728514744,
            0.036666315,
            -0.00331014814,
            -0.00411818549,
            -4.75488814e-05,
            -0.0627968833,
            0.0846960172,
            0.0300104544,
            -0.0528305136,
            -0.0244681183,
            0.0403409749,
            0.0308714788,
            0.0611491948,
            0.00181211636,
            -0.0445213802,
            -0.0367099494,
            -0.0457854867,
            0.00930389855,
            0.0167219806,
            0.0564011596,
         

In [23]:
for match in query_results["matches"]:
    print(f"Matched item ID: {match['id']}, score: {match['score']}")

Matched item ID: Machine Learning in Excel, score: 0.354953736
Matched item ID: Machine Learning with K-Nearest Neighbors, score: 0.31413886
Matched item ID: Machine Learning in Python, score: 0.282951325
Matched item ID: Customer Churn Analysis with SQL and Tableau, score: 0.281316757
Matched item ID: Growth Analysis with SQL, Python, and Tableau  , score: 0.259746522
Matched item ID: Linear Algebra and Feature Selection, score: 0.259037
Matched item ID: Customer Engagement Analysis with SQL and Tableau, score: 0.234294862
Matched item ID: Fashion Analytics with Tableau, score: 0.233250618
Matched item ID: Machine Learning with Naive Bayes, score: 0.22774601
Matched item ID: Machine Learning with Support Vector Machines, score: 0.225615472
Matched item ID: Data Preprocessing with NumPy, score: 0.219470948
Matched item ID: Data Analysis with Excel Pivot Tables, score: 0.216791108


In [24]:
score_threshold = 0.3
for match in query_results["matches"]:
    if match['score'] >= score_threshold:
        print(f"Matched item ID: {match['id']}, score: {match['score']}")

Matched item ID: Machine Learning in Excel, score: 0.354953736
Matched item ID: Machine Learning with K-Nearest Neighbors, score: 0.31413886
