# Example of using Pinecone as an embeddings DB for neural search of YouTube videos

## Load libraries and setup 3rd party applications

In [1]:
from googleapiclient.discovery import build # pip install --upgrade google-api-python-client
from pathlib import Path
import json
import requests
import urllib
from os.path import exists
import os

# !sudo apt update -y && sudo apt install ffmpeg -y
#!pip install --upgrade protobuf
#!pip install --upgrade tensorflow
#!pip uninstall keras
#!pip install --upgrade keras
from pytube import YouTube  # !pip install pytube
from pytube.exceptions import RegexMatchError
import scrapetube
import youtube_dl

from tqdm.auto import tqdm  # !pip install tqdm

import whisper # !pip install git+https://github.com/openai/whisper.git
import torch  # pytorch install steps: pytorch.org

import pinecone # pip install --upgrade pinecone-client
from sentence_transformers import SentenceTransformer

2023-01-22 10:36:59.638155: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-22 10:37:00.244643: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-22 10:37:00.244740: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


## Define Inputs
`channel_names` is a list of YouTube channel names to index.<br>
`pinecone_api_key` and `google_api_key` each require a free registration with the respective service.

In [5]:
# Directory the downloaded audio files are written to and later read from
audio_dir = './audio'
# YouTube channel names to index
channel_names = ['paulharveyarchives']

# Credentials are read from the environment so they never end up in the saved
# notebook. Export PINECONE_API_KEY and GOOGLE_API_KEY before starting the kernel.
# NOTE(review): keys that were previously committed in this cell should be
# treated as leaked and rotated.
pinecone_api_key = os.environ.get('PINECONE_API_KEY', '')  # https://app.pinecone.io/projects
google_api_key = os.environ.get('GOOGLE_API_KEY', '')  # https://console.cloud.google.com/apis/dashboard
if not pinecone_api_key or not google_api_key:
    print("WARNING: PINECONE_API_KEY and/or GOOGLE_API_KEY is not set in the environment")

# Options passed to youtube_dl.YoutubeDL: only list videos (no download) and
# keep going when individual entries fail
youtube_dl_options = {
    'skip_download': True,
    'ignoreerrors': True
}

# name of Pinecone index to use
index_id = "audio"
# we encode and insert in batches of 64
batch_size = 64

## Helper Functions

In [3]:
def get_all_video_in_channel(channel_id):
    """List the videos of a YouTube channel.

    Three strategies are tried in order:
      1. ``scrapetube.get_channel(channel_id)``
      2. if (1) raises: youtube_dl on ``/user/<channel_id>/videos``
      3. if nothing was found so far: youtube_dl on ``/channel/<channel_id>``

    Args:
        channel_id: channel ID (or user name) as used in the YouTube URL.

    Returns:
        ``(video_IDs, video_titles)`` where ``video_IDs`` is a list of video
        IDs without duplicates and ``video_titles`` maps each ID to its title.
        Both are empty when no strategy found anything.
    """
    video_IDs = []
    video_titles = {}

    def _add(video_id, title):
        # A partial scrape followed by a fallback may see the same video twice.
        if video_id in video_titles:
            return
        video_IDs.append(video_id)
        video_titles[video_id] = title

    def _add_from_youtube_dl(url):
        with youtube_dl.YoutubeDL(youtube_dl_options) as ydl:
            info = ydl.extract_info(url)
        # With 'ignoreerrors' set, a failed extraction yields None instead of
        # raising, and individual playlist entries may be None as well.
        for item in (info or {}).get('entries') or []:
            if item:
                _add(item['id'], item['title'])

    try:
        for video in scrapetube.get_channel(channel_id):
            _add(video['videoId'], video['title']['runs'][0]['text'])
    except Exception as err:
        print(f"scrapetube failed for '{channel_id}' ({err!r}), falling back to youtube_dl")
        _add_from_youtube_dl(f'https://www.youtube.com/user/{channel_id}/videos')

    if len(video_IDs) == 0:
        _add_from_youtube_dl(f'https://www.youtube.com/channel/{channel_id}')

    return video_IDs, video_titles
        
def get_youtube_channel_id(channel_name):
    """Resolve a YouTube user name to its channel ID via the YouTube Data API.

    This is best effort: when the lookup fails (bad key, network error) or no
    channel is returned for the user name, ``channel_name`` is returned
    unchanged so the caller can still try it as a channel ID.

    Args:
        channel_name: user name passed to the API as ``forUsername``.

    Returns:
        The ID of the first matching channel, or ``channel_name`` as a fallback.
    """
    try:
        youtube = build('youtube', 'v3', developerKey=google_api_key)
        channels_response = youtube.channels().list(
            forUsername=channel_name,
            # only the ID is read below, so no other resource parts are requested
            part="id"
        ).execute()
        items = channels_response.get('items') or []
        if items:
            return items[0]['id']
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still works
        print(f"channel ID lookup failed for '{channel_name}': {err!r}")
    return channel_name

def save_audio_from_videoIDs(save_path, video_IDs):
    """Download the audio track of each video to ``<save_path>/<videoID>.mp3``.

    Videos whose target file already exists are skipped. A failure on a single
    video (invalid ID, stream listing error, no ``audio/mp4`` stream, download
    error) is reported with ``print`` and the loop continues with the next one.

    Note: the stream that is saved has mime type ``audio/mp4``; only the file
    name uses the ``.mp3`` extension.

    Args:
        save_path: directory the audio files are written to.
        video_IDs: iterable of YouTube video IDs.
    """
    for videoID in tqdm(video_IDs):
        # only download the audio if it does not exist yet
        check_file = f"{save_path}/{videoID}.mp3"
        if exists(check_file):
            continue

        # url of video to be downloaded
        url = f"https://youtu.be/{videoID}"

        # try to create a YouTube vid object
        try:
            yt = YouTube(url)
        except RegexMatchError:
            print(f"RegexMatchError for '{url}'")
            continue

        # Listing the streams can fail for a single video; that must not
        # abort the remaining downloads.
        try:
            audio_streams = yt.streams.filter(only_audio=True)
            # of the audio-only streams, take the first one that is audio/mp4
            stream = next(
                (s for s in audio_streams if s.mime_type == 'audio/mp4'),
                None
            )
        except Exception as err:
            print(f"error listing streams for video ID {videoID}: {err!r}")
            continue

        if stream is None:
            # just in case no matching audio is found (shouldn't happen)
            print("NO MP3 AUDIO FOUND")
            continue

        # downloading the audio
        try:
            stream.download(output_path=save_path, filename=f"{videoID}.mp3")
        except Exception as err:
            print(f"error downloading audio for video ID {videoID}: {err!r}")
            
def get_text_from_data(start, end, data):
    """Concatenate the 'text' of ``data[start:end]`` rows, each followed by a space.

    Rows are accessed by index for every position in ``range(start, end)``,
    so the result ends with a trailing space unless the range is empty.
    """
    pieces = (data[pos]['text'] + ' ' for pos in range(start, end))
    return "".join(pieces)

## Initialize Whisper Model for transcription and Pinecone index
Use GPU if available

In [6]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Speech-to-text model (Whisper "small" checkpoint), moved to the chosen device.
model = whisper.load_model("small")
model = model.to(device)

# Sentence embedding model; its output size fixes the dimension of the index.
model_id = "multi-qa-mpnet-base-dot-v1"
model_embed = SentenceTransformer(model_id)
dim = model_embed.get_sentence_embedding_dimension()

# Connect to Pinecone (API key from app.pinecone.io).
pinecone.init(api_key=pinecone_api_key, environment="us-west1-gcp")

# Create the index on first use only; an existing index is reused as is.
if index_id not in pinecone.list_indexes():
    pinecone.create_index(index_id, dim, metric="dotproduct")

# Handle to the index, followed by its current statistics as the cell output.
index = pinecone.Index(index_id)
index.describe_index_stats()

cuda


{'dimension': 768,
 'index_fullness': 0.4,
 'namespaces': {'': {'vector_count': 805171}},
 'total_vector_count': 805171}

## Create embeddings from youtube channel
This takes a while to run, as it downloads and transcribes every video in the YouTube channel.<br>
If videos do not download, try alternative methods, as YouTube channels are not set up consistently.

In [7]:
def _segments_to_rows(video_id, title, segments, window=1, stride=1):
    """Build one metadata row per window of transcription segments.

    Each segment is expected to provide 'text', 'start' and 'end'. Slicing
    clamps the window at the end of the list, so the last segment keeps its
    own text and end time.
    """
    rows = []
    for pos in range(0, len(segments), stride):
        chunk = segments[pos:pos + window]
        rows.append({
            "id": f"{video_id}-t{chunk[0]['start']}",
            "text": ''.join(x["text"] for x in chunk).strip(),
            "start": chunk[0]['start'],
            "end": chunk[-1]['end'],
            "url": f"https://youtu.be/{video_id}",
            "name": video_id,
            "title": title
        })
    return rows


def _merge_rows(rows, window=6, stride=3):
    """Combine `window` consecutive rows into overlapping chunks, `stride` rows apart."""
    merged = []
    for pos in range(0, len(rows), stride):
        pos_end = min(len(rows), pos + window)  # exclusive end of this chunk
        first = rows[pos]
        merged.append({
            'start': first['start'],
            'end': rows[pos_end - 1]['end'],
            'text': get_text_from_data(pos, pos_end, rows),
            'id': first['id'],
            # link that opens the video at the start of the chunk
            'url': first['url'] + '?t=' + str(int(first['start'])),
            "name": first['name'],
            "title": first['title'],
        })
    return merged


def _upsert_rows(target_index, rows):
    """Embed the rows' text and upsert (id, vector, metadata) in batches of `batch_size`."""
    meta_keys = ("text", "start", "end", "url", "name", "title")
    for pos in tqdm(range(0, len(rows), batch_size)):
        # slicing clamps at the end of the list, so the final rows are included
        batch = rows[pos:pos + batch_size]
        batch_ids = [row['id'] for row in batch]
        batch_meta = [{key: row[key] for key in meta_keys} for row in batch]
        # create the embedding vectors from the text only
        batch_embeds = model_embed.encode([row['text'] for row in batch]).tolist()
        try:
            target_index.upsert(list(zip(batch_ids, batch_embeds, batch_meta)))
        except Exception as err:
            # keep going with the next batch, but do not hide the failure
            print(f"error upserting batch starting at row {pos}: {err!r}")


# one index handle is enough for the whole run
index = pinecone.Index(index_id)

for channel_name in channel_names:
    print(channel_name)
    try:
        channel_id = get_youtube_channel_id(channel_name)
        video_IDs, video_titles = get_all_video_in_channel(channel_id)
        print(video_IDs)
        save_audio_from_videoIDs(audio_dir, video_IDs)
    except Exception as err:
        print(f"skipping channel '{channel_name}': {err!r}")
        continue

    # get list of MP3 audio files
    paths = [str(x) for x in Path(audio_dir).glob('*.mp3')]

    # raw segments of every video of this channel (kept for inspection)
    transcriptions = []
    for path in tqdm(paths):
        _id = Path(path).stem
        print(path)
        # transcribe once to get speech-to-text data
        try:
            result = model.transcribe(path)
        except Exception as err:
            print('error, removing file ', path)
            os.remove(path)
            continue
        segments = result['segments']
        transcriptions.extend(segments)

        # A file left over from another run/channel has no known title;
        # fall back to an empty string instead of raising KeyError.
        title = video_titles.get(_id, "")
        data = _segments_to_rows(_id, title, segments)

        # append, so rows of earlier videos are not overwritten
        with open("transcription.jsonl", "a", encoding="utf-8") as fp:
            for row in data:
                json.dump(row, fp)
                fp.write('\n')

        # 6 segments per chunk, advancing by 3, gives overlapping context
        new_data = _merge_rows(data, window=6, stride=3)
        _upsert_rows(index, new_data)

        print('removing file ', path)
        os.remove(path)

paulharveyarchives
[youtube:tab] UCiUcRrTAi6BBy3HqrA5543Q: Downloading webpage


ERROR: Unable to download webpage: HTTP Error 404: Not Found (caused by <HTTPError 404: 'Not Found'>); please report this issue on https://yt-dl.org/bug . Make sure you are using the latest version; see  https://yt-dl.org/update  on how to update. Be sure to call youtube-dl with the --verbose flag and include its complete output.


## Query pinecone index for answer to question with video link in URL

In [10]:
# Natural-language question to look up in the index
query = "what is OpenAI's CLIP?"
# Embed the question with the same model that embedded the transcript chunks
xq = model_embed.encode(query).tolist()
# Fetch the 5 best-matching chunks together with their stored metadata
results = index.query(xq, include_metadata=True, top_k=5)
print(results)

{'matches': [{'id': 'My-I-6P_VUs-t676.68',
              'metadata': {'end': 717.04,
                           'name': 'My-I-6P_VUs',
                           'start': 676.68,
                           'text': "using clip, what they're doing is they're "
                                   'using clip, which is a, you know, a model '
                                   'that open AI released that they open AI '
                                   'did not release sort of the generator '
                                   'architecture on top of it. So the '
                                   'community has, has kind of taken that and '
                                   'applied another generator called DQ GAN. '
                                   "And so it's not even just taking one model "
                                   "and changing the data set you're working "
                                   "on, it's taking two models and then "
                                   'recomp