In [7]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd


def get_youtube_video_transcript_dataframe(youtube_url):
    try:
        # Extract video ID from the URL
        video_id = youtube_url.split("v=")[-1]

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Create an empty list to store the transcript data
        transcript_data = []

        # Loop through each transcript item and extract timestamp and text
        for item in transcript:
            start_time = item["start"]
            end_time = item["start"] + item["duration"]
            text = item["text"]
            transcript_data.append({"Start": start_time, "End": end_time, "Text": text})

        # Convert the list to a pandas DataFrame
        df = pd.DataFrame(transcript_data)

        return df

    except Exception as e:
        print("Error:", e)
        return None

# Replace the following with the YouTube video URL of your choice
youtube_url = "https://www.youtube.com/watch?v=VyFk2sdw230&ab_channel=BiggerPockets"
transcript_df = get_youtube_video_transcript_dataframe(youtube_url)

if transcript_df is not None:
    print(transcript_df.head())
else:
    print("Failed to get the transcript.")

   Start     End                                      Text
0  0.000   5.339  welcome to mortgage Mondays Today's Show
1  2.280   6.720    we are going to break down an FHA loan
2  5.339   8.280     pretty much everything that you could
3  6.720   9.720      possibly need to know but if you now
4  8.280  10.980   show up to an appointment with the loan


# Preprocessed

In [8]:
transcript_df.head()

Unnamed: 0,Start,End,Text
0,0.0,5.339,welcome to mortgage Mondays Today's Show
1,2.28,6.72,we are going to break down an FHA loan
2,5.339,8.28,pretty much everything that you could
3,6.72,9.72,possibly need to know but if you now
4,8.28,10.98,show up to an appointment with the loan


In [9]:
transcript_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Start   517 non-null    float64
 1   End     517 non-null    float64
 2   Text    517 non-null    object 
dtypes: float64(2), object(1)
memory usage: 12.2+ KB


# Sentiment Analysis

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.functional import softmax

data = transcript_df

model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(model_name)


# Preprocess text and tokenize
def preprocess_text(text):
    return " ".join(text.split())  # Remove extra spaces

data['Text'] = data['Text'].apply(preprocess_text)
inputs = tokenizer(data['Text'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Perform sentiment analysis
with torch.no_grad():
    outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=1).numpy()  # Convert logits to probabilities

# Add sentiment labels and probabilities to the dataframe
sentiment_labels = ['negative', 'neutral', 'positive']
data['Sentiment'] = [sentiment_labels[p.argmax()] for p in probs]
data[['NegativeProb', 'NeutralProb', 'PositiveProb']] = pd.DataFrame(probs, columns=sentiment_labels)

# Save the modified dataset
data.to_csv('sentiment_analysis_results.csv', index=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
data.head()

Unnamed: 0,Start,End,Text,Sentiment,NegativeProb,NeutralProb,PositiveProb
0,0.0,5.339,welcome to mortgage Mondays Today's Show,negative,0.501745,0.219033,0.279222
1,2.28,6.72,we are going to break down an FHA loan,negative,0.468423,0.231924,0.299653
2,5.339,8.28,pretty much everything that you could,negative,0.428133,0.217805,0.354062
3,6.72,9.72,possibly need to know but if you now,negative,0.420645,0.250991,0.328365
4,8.28,10.98,show up to an appointment with the loan,negative,0.474714,0.224691,0.300596


In [None]:
# ---------------------------------------------example

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import BartTokenizer, BartForConditionalGeneration
from moviepy.editor import VideoFileClip, concatenate_videoclips

# Initialize the sentiment analyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Load a pre-trained model for summarization
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

def analyze_sentiment(text):
    """
    Analyze the sentiment of a text and return the sentiment score.
    """
    sentiment_score = sia.polarity_scores(text)['compound']
    return sentiment_score

def filter_sentences_by_sentiment(transcript, threshold=0.2):
    """
    Filter sentences from the transcript based on a sentiment threshold.
    It filters both positive and negative sentences and leaves out neutral ones.
    """
    significant_sentences = [
        (start, end, sentence) for start, end, sentence in transcript
        if abs(analyze_sentiment(sentence)) >= threshold
    ]
    return significant_sentences

# ... (Other functions remain the same)

# Sample YouTube transcript with timestamps (start_time, end_time, sentence)
transcript = [
    (0, 5, "This is an amazing discovery."),
    (6, 10, "It changes everything we knew about space."),
    (11, 20, "Regular maintenance is scheduled for next week."),
    (21, 30, "Breaking news about the situation unfolding downtown.")
]

# Filter sentences by sentiment (excluding neutral in this case)
filtered_transcript = filter_sentences_by_sentiment(transcript, threshold=0.2)

# Generate a summary of the filtered transcript
transcript_text = ' '.join(sentence for _, _, sentence in filtered_transcript)
summary = summarize_text(transcript_text)

# Extract timestamps corresponding to the summary
timestamps = extract_timestamps(summary, filtered_transcript)

# Text Summarization

# Result