In [1]:
!pip install pinecone



In [2]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

In [9]:
files = pd.read_csv("course_descriptions.csv", encoding = "latin1")

In [10]:
def create_course_description(row):
    return f'''The course name is {row["course_name"]}, the slug is {row["course_slug"]},
            the technology is {row["course_technology"]} and the course topic is {row["course_topic"]}'''

In [11]:
pd.set_option('display.max_rows', 106)
files['course_description_new'] = files.apply(create_course_description, axis = 1)
print(files["course_description_new"])

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
5      The course name is Data Cleaning and Preproces...
6      The course name is Introduction to Business An...
7      The course name is Data Analysis with Excel Pi...
8      The course name is SQL, the slug is sql,\n    ...
9      The course name is Credit Risk Modeling in Pyt...
10     The course name is Python Programmer Bootcamp,...
11     The course name is SQL + Tableau + Python, the...
12     The course name is Introduction to Jupyter, th...
13     The course name is Statistics, the slug is sta...
14     The course name is Mathematics, the slug is ma...
15     The course name is Introduction to Excel, the ...
16     The course name is Probability, the slug is pr...
17     The course name is Start

In [12]:
from google.colab import userdata

pc = Pinecone(api_key = userdata.get("PINECONE_API_KEY"))

In [13]:
index_name = "my-index"
dimension = 768
metric = "cosine"

In [14]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} succesfully deleted.")
else:
     print(f"{index_name} not in index list.")

my-index succesfully deleted.


In [15]:
pc.create_index(
    name = index_name,
    dimension = dimension,
    metric = metric,
    spec = ServerlessSpec(
        cloud = "aws",
        region = "us-east-1")
    )

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-4bhpcp2.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [16]:
index = pc.Index(index_name)

## Embedding the data

In [17]:
#model = SentenceTransformer("all-MiniLM-L6-v2")
model = SentenceTransformer('multi-qa-distilbert-cos-v1')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
def create_embeddings(row):
    combined_text = ' '.join([str(row[field]) for field in ['course_description', 'course_description_new', 'course_description_short']])
    embedding = model.encode(combined_text, show_progress_bar = False)
    return embedding

In [19]:
files["embedding"] = files.apply(create_embeddings, axis = 1)

In [20]:
vectors_to_upsert = [(str(row["course_name"]), row["embedding"].tolist()) for _, row in files.iterrows()]
index.upsert(vectors = vectors_to_upsert)

print("Data upserted to Pinecone index")

Data upserted to Pinecone index


In [21]:
query = "clustring"
query_embedding = model.encode(query,show_progress_bar=False).tolist()

In [22]:
query_results = index.query(
    vector = [query_embedding],
    top_k = 12,
    include_values = True
)

In [25]:
query_results

{'matches': [{'id': 'Intro to LLMs',
              'score': 0.0978803709,
              'values': [0.0083390763,
                         0.0380700268,
                         0.00817822199,
                         -0.0194164775,
                         0.076818049,
                         -0.0114381844,
                         -0.0672079846,
                         0.097621806,
                         -0.0136023825,
                         0.0136497431,
                         -0.0534522533,
                         0.0363873392,
                         0.0151731931,
                         -0.0572773702,
                         0.00575776491,
                         -0.0280215889,
                         -0.0159570221,
                         -0.0690936744,
                         0.0343313366,
                         0.0401275307,
                         0.0338963345,
                         0.0227106232,
                         0.0357957669,
                    

In [29]:
score_threshold = 0.3
matches_found = False
print("Scores of all matches:")
for match in query_results["matches"]:
    print(f"Match ID: {match['id']}, score: {match['score']}")
    if match['score'] >= score_threshold:
        print(f"Matched item ID: {match['id']}, score: {match['score']}")
        matches_found = True

if not matches_found:
    print(f"No matches found with a score greater than or equal to {score_threshold}")

Scores of all matches:
Match ID: Intro to LLMs, score: 0.0978803709
Match ID: Machine Learning with K-Nearest Neighbors, score: 0.0929136351
Match ID: Portfolio Management, score: 0.0918493345
Match ID: Introduction to Tableau, score: 0.0903825834
Match ID: Intro to PowerPoint, score: 0.0782938078
Match ID: Machine Learning Deep Dive: Business Applications and Coding Walkthroughs, score: 0.0730037764
Match ID: Fixed Income Investments, score: 0.0676469877
Match ID: Customer Engagement Analysis with SQL and Tableau, score: 0.0645218
Match ID: Machine Learning in Excel, score: 0.0635748
Match ID: Corporate Finance , score: 0.0601606406
Match ID: Technical Analysis, score: 0.0591278151
Match ID: Persuasion and Influence, score: 0.0558786429
No matches found with a score greater than or equal to 0.3
