In [1]:
!pip install fastapi nest-asyncio pyngrok uvicorn
!pip install youtube_transcript_api
!pip install sentence-transformers

Collecting pyngrok
  Downloading pyngrok-7.2.7-py3-none-any.whl.metadata (9.4 kB)
Downloading pyngrok-7.2.7-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.7
Collecting youtube_transcript_api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube_transcript_api
Successfully installed youtube_transcript_api-1.0.3
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed senten

In [7]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
from googleapiclient.discovery import build
from sentence_transformers import SentenceTransformer, util
from textblob import TextBlob
from transformers import AutoTokenizer
import tensorflow as tf 
import pandas as pd
import torch
import re
from tqdm import tqdm, trange


# Tokenizer shared by both comment classifiers below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# "question" vs "other" classifier: load the SavedModel and keep its serving signature.
loaded_question_model = tf.saved_model.load("/kaggle/input/distilbert_question_model/other/distilbert_question_model_v1/1/kaggle/working/distilbert_question_model")
inferQ = loaded_question_model.signatures["serving_default"]

# "suggestion" vs "other" classifier: same loading steps, different SavedModel.
loaded_suggestion_model = tf.saved_model.load("/kaggle/input/distilbert_suggestion_model/other/distilbert_suggestion_model_v1/1/kaggle/working/distilbert_suggestion_model")
inferS = loaded_suggestion_model.signatures["serving_default"]


def _iter_batches(items, batch_size):
    """Yield consecutive slices of ``items`` holding at most ``batch_size`` elements."""
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]


def _predict_batch_labels(infer, batch, labels):
    """Tokenize one batch of comments, run ``infer`` on it and return one label per comment.

    ``labels`` lists the class names in the same order as the columns of the
    model's ``logits`` output.
    """
    inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors="tf")
    logits = infer(**inputs)['logits'].numpy()
    # Pick the label paired with the largest logit. Comparing (logit, label) pairs
    # means an exact tie goes to the lexicographically larger label, which is what
    # a descending sort of the same pairs would put first.
    return [max(zip(row, labels))[1] for row in logits]


def classify_comments_into_question(comments_list, batch_size=20):
    """Label every comment as ``"question"`` or ``"other"``.

    Args:
        comments_list: list of comment strings.
        batch_size: number of comments tokenized and scored per model call.

    Returns:
        A list of labels, one per comment, in the same order as ``comments_list``.
        An empty input yields an empty list without calling the model.
    """
    all_results = []
    for batch in _iter_batches(comments_list, batch_size):
        all_results.extend(_predict_batch_labels(inferQ, batch, ["other", "question"]))
    return all_results


def classify_comments_into_suggestion(comments_list, batch_size=20):
    """Label every comment as ``"suggestion"`` or ``"other"``.

    Args:
        comments_list: list of comment strings.
        batch_size: number of comments tokenized and scored per model call.

    Returns:
        A list of labels, one per comment, in the same order as ``comments_list``.
        An empty input yields an empty list without calling the model.
    """
    all_results = []
    for batch in _iter_batches(comments_list, batch_size):
        all_results.extend(_predict_batch_labels(inferS, batch, ["other", "suggestion"]))
    return all_results


    

    
# Use the GPU when torch reports one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pre-trained Sentence-BERT model used to embed comments and the video context.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
# Alternative encoder, kept for reference:
# model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4', device=device)

def get_related_comment_in_context(comments:list, context:str):
    """Score how similar each comment is to ``context``.

    Args:
        comments: list of comment strings.
        context: reference text (a single string) the comments are compared against.

    Returns:
        A list of cosine-similarity scores as plain Python floats, in the same
        order as ``comments``. An empty ``comments`` list yields ``[]``.
    """
    # Nothing to score; returning early also avoids handing an empty batch to the
    # encoder and to the similarity computation.
    if not comments:
        return []

    # Encode the context (single string) and the comments (list of strings).
    context_embedding = model.encode(context, convert_to_tensor=True)
    comment_embeddings = model.encode(comments, convert_to_tensor=True)

    # One row of similarities: the context against every comment.
    similarity_scores = util.pytorch_cos_sim(context_embedding, comment_embeddings)[0]

    # tolist() converts the 1-D tensor into plain Python floats in one step.
    return similarity_scores.tolist()


def get_comments_sentiment(comments):
    """Compute a TextBlob polarity and a sentiment label for every comment.

    Returns:
        A dict with two entries, each mapping the comment's position (0-based)
        to a value:
          - ``'polarity'``: TextBlob polarity rounded to 3 decimals;
          - ``'sentiment'``: ``'Positive'`` for polarity > 0, ``'Negative'`` for
            polarity < 0, ``'Neutral'`` otherwise.
    """
    polarity_by_position = {}
    sentiment_by_position = {}

    for position, comment in enumerate(comments):
        score = round(TextBlob(comment).sentiment.polarity, 3)

        if score > 0:
            label = 'Positive'
        elif score < 0:
            label = 'Negative'
        else:
            label = 'Neutral'

        polarity_by_position[position] = score
        sentiment_by_position[position] = label

    return {'polarity': polarity_by_position, 'sentiment': sentiment_by_position}

def get_video_transcript(video_id):
    """Return the video's transcript as a single line of text, or ``False`` on failure.

    The transcript is rendered with ``TextFormatter`` and its newlines are replaced
    by spaces. Any failure (no transcript, network error, ...) is logged and
    reported as ``False`` so callers can fall back with ``transcript or other``.
    """
    import logging  # local import: the notebook's import cell does not provide it

    try:
        if hasattr(YouTubeTranscriptApi, "fetch"):
            # Releases that expose the instance API: fetch() returns a transcript
            # object that the same release's formatters accept.
            transcript = YouTubeTranscriptApi().fetch(video_id)
        else:
            # Older releases only offer the static helper.
            # NOTE(review): on 1.x the static helper is deprecated and returns raw
            # dicts, which the 1.x formatters may not accept - hence the preference
            # for fetch() above; confirm against the installed release.
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = TextFormatter().format_transcript(transcript)
        return text.replace("\n", " ")
    except Exception as error:
        # Best-effort by design, but no longer silent and no longer swallowing
        # KeyboardInterrupt/SystemExit as the bare ``except:`` did.
        logging.getLogger(__name__).warning(
            "Could not fetch transcript for video %r: %s", video_id, error
        )
        return False


def get_video_title_descript_comments(video_id):
    """Fetch a video's title/description and all of its top-level comments.

    The YouTube Data API key is read from the ``YOUTUBE_API_KEY`` environment
    variable so that no credential is stored in the notebook.

    Args:
        video_id: YouTube video id.

    Returns:
        ``{'Title_Description': "<title> - <description>", 'Comment_Info': [...]}``
        where every ``Comment_Info`` entry is a dict with the keys ``"Comment"``,
        ``"Comment_ID"``, ``"ReplyCount"`` and ``"likeCount"``.

    Raises:
        RuntimeError: if ``YOUTUBE_API_KEY`` is not set.
        ValueError: if the API returns no video for ``video_id``.
    """
    import os  # local import: the notebook's import cell does not provide it

    developer_key = os.environ.get("YOUTUBE_API_KEY")
    if not developer_key:
        raise RuntimeError(
            "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key."
        )

    youtube = build("youtube", "v3", developerKey=developer_key)

    # Title and description of the video.
    video_response = youtube.videos().list(part="snippet", id=video_id).execute()
    video_items = video_response.get('items') or []
    if not video_items:
        # Previously surfaced as an opaque IndexError on ``['items'][0]``.
        raise ValueError(f"No YouTube video found for id {video_id!r}")
    video_details = video_items[0]['snippet']

    return_json = {
        'Title_Description': video_details['title'] + " - " + video_details['description'],
        'Comment_Info': [],
    }

    # Walk every page of top-level comment threads (100 per request).
    next_page_token = None
    while True:
        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            textFormat='plainText',
            maxResults=100,
            pageToken=next_page_token,
        ).execute()

        for item in response['items']:
            top_level = item['snippet']['topLevelComment']
            comment = top_level['snippet']
            return_json['Comment_Info'].append({
                "Comment": comment['textDisplay'],
                "Comment_ID": top_level['id'],
                "ReplyCount": item['snippet']["totalReplyCount"],
                "likeCount": comment['likeCount'],
            })

        # Continue only while the API hands back a token for a further page.
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return return_json


def pre_processing_comments(text):
    """Normalize a comment to single-spaced ASCII text.

    Non-ASCII characters, newlines, carriage returns and tabs are turned into
    spaces; runs of spaces are then collapsed to one and the ends are trimmed.

    The collapse runs *after* the replacements: they introduce new spaces, so
    collapsing first (as before) could leave runs of spaces in the result.
    """
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # non-ASCII characters -> space
    text = re.sub(r'[ \r\n\t]+', ' ', text)  # spaces/newlines/tabs -> one space
    return text.strip()





In [57]:
# In-memory caches keyed by video id; they live as long as the kernel does.
transcript_cache = {}
details_cache = {}


def get_cached_video_transcript(video_id):
    """Return the transcript for ``video_id``, fetching it only on first use.

    Whatever ``get_video_transcript`` returns is stored and handed back on
    subsequent calls for the same id.
    """
    if video_id in transcript_cache:
        return transcript_cache[video_id]

    transcript = get_video_transcript(video_id)
    transcript_cache[video_id] = transcript
    return transcript


def get_cached_video_details(video_id):
    """Return title/description/comments for ``video_id``, fetching only on first use."""
    if video_id in details_cache:
        return details_cache[video_id]

    details = get_video_title_descript_comments(video_id)
    details_cache[video_id] = details
    return details

########################################################################################################


def _load_clean_comments(video_id):
    """Build the comment DataFrame for a video plus the list of texts fed to the models.

    Returns:
        ``(df, comments_list)``. ``df`` holds the cached comment info with an added
        ``Clean_Comments`` column; ``comments_list`` has one string per row, with
        blank comments replaced by ``"Empty Comment"``. For a video without
        comments the DataFrame is empty and the list is ``[]``.
    """
    custom_video_details = get_cached_video_details(video_id)
    df = pd.DataFrame(custom_video_details["Comment_Info"])
    if df.empty:
        # No rows means no 'Comment' column either; callers return early on [].
        return df, []

    df['Clean_Comments'] = df['Comment'].apply(pre_processing_comments)
    comments_list = [
        str(comment) if pd.notna(comment) and comment.strip() else "Empty Comment"
        for comment in df['Clean_Comments'].to_list()
    ]
    return df, comments_list


def _to_records(df):
    """Convert a DataFrame into a list of JSON-compatible row dicts."""
    import json  # local import: the function no longer relies on a later cell

    return json.loads(df.to_json(orient="records"))


def get_top_related_comments(video_id):
    """Rank a video's comments by similarity to its transcript.

    When no transcript is available the title/description text is used as the
    context instead.

    Returns:
        A list of ``{"Clean_Comments", "top_related_score"}`` dicts sorted by
        descending score, or ``[]`` when the video has no comments.
    """
    video_transcription = get_cached_video_transcript(video_id)
    custom_video_details = get_cached_video_details(video_id)
    context = video_transcription or custom_video_details["Title_Description"]

    df, comments_list = _load_clean_comments(video_id)
    if not comments_list:
        return []

    df["top_related_score"] = get_related_comment_in_context(comments_list, context)
    sorted_df = df.sort_values(by="top_related_score", ascending=False)
    return _to_records(sorted_df[["Clean_Comments", "top_related_score"]])


def get_question_comments(video_id):
    """Return the comments classified as questions.

    Returns:
        A list of ``{"Clean_Comments", "Classification"}`` dicts containing only
        rows labelled ``"question"``, or ``[]`` when the video has no comments.
    """
    df, comments_list = _load_clean_comments(video_id)
    if not comments_list:
        return []

    df["Classification"] = classify_comments_into_question(comments_list)
    questions = df[df["Classification"] == "question"]
    return _to_records(questions[["Clean_Comments", "Classification"]])


def get_suggestion_comments(video_id):
    """Return the comments classified as suggestions.

    Returns:
        A list of ``{"Clean_Comments", "Classification"}`` dicts containing only
        rows labelled ``"suggestion"``, or ``[]`` when the video has no comments.
    """
    df, comments_list = _load_clean_comments(video_id)
    if not comments_list:
        return []

    df["Classification"] = classify_comments_into_suggestion(comments_list)
    suggestions = df[df["Classification"] == "suggestion"]
    return _to_records(suggestions[["Clean_Comments", "Classification"]])


def get_comments_sentiments(video_id):
    """Return the sentiment label of every comment of a video.

    Returns:
        A list of ``{"Clean_Comments", "sentiment"}`` dicts in comment order, or
        ``[]`` when the video has no comments.
    """
    df, comments_list = _load_clean_comments(video_id)
    if not comments_list:
        return []

    score = get_comments_sentiment(comments_list)
    # The helper returns {position: value} mappings; wrapping them in a Series
    # makes the alignment with the DataFrame's 0..n-1 index explicit.
    df["polarity"] = pd.Series(score["polarity"])
    df["sentiment"] = pd.Series(score["sentiment"])

    return _to_records(df[["Clean_Comments", "sentiment"]])
    

# top_comments = get_top_related_comments("K5KVEU3aaeQ")
# print(top_comments)

# question = get_question_comments("K5KVEU3aaeQ")
# print(question)

# suggestion = get_suggestion_comments("K5KVEU3aaeQ")
# print(suggestion)

# sentiments = get_comments_sentiments("K5KVEU3aaeQ")
# print(sentiments)


In [58]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import json, random

app = FastAPI()

# Open CORS policy so any origin can call the API.
# NOTE(review): wildcard origins combined with allow_credentials=True is a
# combination the FastAPI CORS documentation advises against; left unchanged here
# because the front end's needs are not visible from this notebook - confirm.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)


@app.get('/')
async def root():
    """Trivial endpoint to check that the server is up."""
    return {'hello': 'world'}


# The handlers below are plain ``def`` on purpose: they perform blocking network
# calls and model inference, and FastAPI runs non-async path functions in a
# threadpool instead of on the event loop. Each handler also has its own name
# (they previously all shared ``read_item``).

@app.get("/top_comments/{video_id}")
def read_top_comments(video_id: str):
    """Comments of the video ranked by similarity to its transcript."""
    return get_top_related_comments(video_id)


@app.get("/questions/{video_id}")
def read_questions(video_id: str):
    """Comments of the video classified as questions."""
    return get_question_comments(video_id)


@app.get("/suggestions/{video_id}")
def read_suggestions(video_id: str):
    """Comments of the video classified as suggestions."""
    return get_suggestion_comments(video_id)


@app.get("/sentiments/{video_id}")
def read_sentiments(video_id: str):
    """Sentiment label of every comment of the video."""
    return get_comments_sentiments(video_id)


In [None]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn

import os  # needed for the environment lookups below

# The ngrok credential comes from the environment so it is never stored in the notebook.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN")
if not ngrok_auth_token:
    raise RuntimeError("Set the NGROK_AUTHTOKEN environment variable before starting the tunnel.")
ngrok.set_auth_token(ngrok_auth_token)

# The reserved domain can be overridden through NGROK_DOMAIN; the default is unchanged.
ngrok_domain = os.environ.get("NGROK_DOMAIN", "flexible-subtly-tomcat.ngrok-free.app")
ngrok_tunnel = ngrok.connect(8000, domain=ngrok_domain)

print('Public URL:', ngrok_tunnel.public_url)
# Allow uvicorn to start its event loop inside the notebook's already-running loop.
nest_asyncio.apply()
uvicorn.run(app, port=8000)

# !ngrok http 8000 --domain "kind-shortly-gibbon.ngrok-free.app"

Public URL: https://flexible-subtly-tomcat.ngrok-free.app


INFO:     Started server process [30]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     64.227.21.251:0 - "GET /questions/K5KVEU3aaeQ HTTP/1.1" 200 OK
