<a href="https://colab.research.google.com/github/J-Gann/medfluencer/blob/main/medfluencer_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medfluencer Indexing

In [1]:
# Install llama-index together with its HuggingFace-embedding and Pinecone
# vector-store integrations, plus the Pinecone client itself.
# NOTE(review): none of these installs pins a version, so a later run may
# resolve to different releases than the ones listed in the recorded output.
%pip install llama-index
%pip install -U llama-index-embeddings-huggingface
%pip install pinecone
%pip install -U llama-index-vector-stores-pinecone

Collecting llama-index
  Downloading llama_index-0.10.55-py3-none-any.whl (6.8 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama-index)
  Downloading llama_index_agent_openai-0.2.8-py3-none-any.whl (13 kB)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_cli-0.1.12-py3-none-any.whl (26 kB)
Collecting llama-index-core==0.10.55 (from llama-index)
  Downloading llama_index_core-0.10.55-py3-none-any.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama-index)
  Downloading llama_index_embeddings_openai-0.1.10-py3-none-any.whl (6.2 kB)
Collecting llama-index-indices-managed-llama-cloud>=0.2.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.2.5-py3-none-any.whl (9.3 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_le

In [2]:
import json
from llama_index.core.schema import TextNode
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex
from pinecone import Pinecone, ServerlessSpec
import pinecone
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.vector_stores.pinecone import PineconeVectorStore
from google.colab import userdata
import numpy as np

# Credentials are looked up through Colab's `userdata` helper instead of being
# written into the notebook itself.
PINECONE_API_KEY = userdata.get("PINECONE_API_KEY")
GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")

In [24]:
!git clone https://$GITHUB_TOKEN@github.com/J-Gann/medfluencer.git

Cloning into 'medfluencer'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 64 (delta 16), reused 52 (delta 8), pack-reused 0[K
Receiving objects: 100% (64/64), 8.48 MiB | 5.00 MiB/s, done.
Resolving deltas: 100% (16/16), done.
Filtering content: 100% (23/23), 8.30 GiB | 55.10 MiB/s, done.


In [4]:
# Embedding model shared by the embedding cells below; the model is identified
# by its HuggingFace model name.
EMBEDDING_MODEL_NAME = "T-Systems-onsite/cross-en-de-roberta-sentence-transformer"
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME)



config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

## Videos

In [5]:
# Load the video records from the cloned repository. The file is opened
# explicitly as UTF-8 so that decoding does not depend on the locale default
# of the machine running the notebook.
with open("medfluencer/scraping/videos_scraping.json", "r", encoding="utf-8") as f:
    videos_data = json.load(f)

In [6]:
# A video is kept only when its transcription, description and title are all
# present (not None); the fields are checked in that order.
required_fields = ("transcription", "description", "title")
complete_videos = [
    (video_id, video)
    for video_id, video in videos_data.items()
    if all(video[field] is not None for field in required_fields)
]

# Parallel lists: processed_video_ids[i] is the key of processed_video_data[i].
processed_video_ids = [video_id for video_id, _ in complete_videos]
processed_video_data = [video for _, video in complete_videos]

In [7]:
# Pull each text field out of the kept videos into its own list; all three
# lists share the ordering of processed_video_data.
video_transcriptions, video_descriptions, video_titles = (
    [video[field] for video in processed_video_data]
    for field in ("transcription", "description", "title")
)

In [None]:
import concurrent.futures


def get_video_description_embeddings():
    """Embed every entry of ``video_descriptions`` in one batch via ``embed_model``."""
    return embed_model.get_text_embedding_batch(video_descriptions, show_progress=True)


def get_video_title_embeddings():
    """Embed every entry of ``video_titles`` in one batch via ``embed_model``."""
    return embed_model.get_text_embedding_batch(video_titles, show_progress=True)


def get_video_transcription_embeddings():
    """Embed every entry of ``video_transcriptions`` in one batch via ``embed_model``."""
    return embed_model.get_text_embedding_batch(video_transcriptions, show_progress=True)


# The three batches are submitted to a thread pool in the order description,
# title, transcription; their results are collected in that same order before
# the pool is shut down.
embedding_jobs = (
    get_video_description_embeddings,
    get_video_title_embeddings,
    get_video_transcription_embeddings,
)
with concurrent.futures.ThreadPoolExecutor() as pool:
    pending = [pool.submit(job) for job in embedding_jobs]
    description_vectors, title_vectors, transcription_vectors = [
        future.result() for future in pending
    ]

# Convert the embedding results and the id list to NumPy arrays.
video_description_embeddings = np.array(description_vectors)
video_transcription_embeddings = np.array(transcription_vectors)
video_title_embeddings = np.array(title_vectors)
processed_video_ids = np.array(processed_video_ids)

In [None]:
# Write each array to its own .npy file under medfluencer/embeddings.
# The mapping keeps insertion order, so files are written in the order listed.
arrays_by_path = {
    "medfluencer/embeddings/video_description_embeddings.npy": video_description_embeddings,
    "medfluencer/embeddings/video_transcription_embeddings.npy": video_transcription_embeddings,
    "medfluencer/embeddings/video_title_embeddings.npy": video_title_embeddings,
    "medfluencer/embeddings/video_ids.npy": processed_video_ids,
}

for npy_path, array in arrays_by_path.items():
    with open(npy_path, "wb") as npy_file:
        np.save(npy_file, array)

### Transcription Chunking

In [8]:
# Splitter used to cut the video transcriptions into overlapping chunks.
CHUNK_SIZE = 256
CHUNK_OVERLAP = 20
splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

In [9]:
# Wrap every transcription in a TextNode, then let the splitter break the nodes
# into chunks.
transcription_nodes = [
    TextNode(text=video["transcription"]) for video in processed_video_data
]
video_transcription_chunks = splitter.get_nodes_from_documents(
    transcription_nodes,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/84922 [00:00<?, ?it/s]

In [10]:
# Keep only the raw text of each chunk node, in chunk order.
video_transcription_chunks_text = [
    chunk_node.text for chunk_node in video_transcription_chunks
]

In [26]:
# Store the chunk texts as a single JSON array, preserving list order.
chunks_json_path = "medfluencer/embeddings/video_transcription_chunks.json"
with open(chunks_json_path, "w") as chunks_file:
    json.dump(video_transcription_chunks_text, chunks_file)

In [11]:
# Embed all transcription chunk texts in one batched call; the result has one
# entry per chunk text.
video_transcription_chunks_embeddings = embed_model.get_text_embedding_batch(
    video_transcription_chunks_text,
    show_progress=True,
)

Generating embeddings:   0%|          | 0/787115 [00:00<?, ?it/s]

In [25]:
# The chunk embeddings are written as several .npy files instead of one;
# file names are numbered from 1.
num_parts = 3
split_arrays = np.array_split(video_transcription_chunks_embeddings, num_parts)
filenames = []

for i, embedding_part in enumerate(split_arrays):
    filename = f"medfluencer/embeddings/video_transcription_chunks_embeddings_part_{i + 1}.npy"
    with open(filename, "wb") as part_file:
        np.save(part_file, embedding_part)
    # Record the path only after its part has been written.
    filenames.append(filename)

## Comments

In [None]:
# Load the comment records from the cloned repository. The file is opened
# explicitly as UTF-8 so that decoding does not depend on the locale default
# of the machine running the notebook.
with open("medfluencer/scraping/comments_scraping.json", "r", encoding="utf-8") as f:
    comments_data = json.load(f)

In [None]:
# Only the text of each comment is needed here; the dictionary keys are unused.
comment_texts = [comment["text"] for comment in comments_data.values()]

In [None]:
# Embed all comment texts in one batched call; the result has one entry per text.
comment_embeddings = embed_model.get_text_embedding_batch(comment_texts, show_progress=True)

In [None]:
# The comment embeddings are written as several .npy files instead of one;
# file names are numbered from 1.
num_parts = 4
split_arrays = np.array_split(comment_embeddings, num_parts)
filenames = []

for i, embedding_part in enumerate(split_arrays):
    filename = f"medfluencer/embeddings/comment_embeddings_part_{i + 1}.npy"
    with open(filename, "wb") as part_file:
        np.save(part_file, embedding_part)
    # Record the path only after its part has been written.
    filenames.append(filename)