# Step 1: Download YouTube Video
Use yt-dlp to download the video or extract audio directly.

In [24]:
import yt_dlp

def download_youtube_video(video_url, output_path='video.mp4'):
    ydl_opts = {'format': 'bestaudio/best', 'outtmpl': output_path}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

# Step 2: Extract Audio and Transcribe
You can extract audio from the video and use whisper to transcribe it.

In [25]:
import speech_recognition as sr
from pydub import AudioSegment

def mp4_to_wav(mp4_path, output_path):
    # Convert MP4 to WAV
    sound = AudioSegment.from_file(mp4_path)
    sound.export(output_path, format="wav")
    return output_path

def transcribe_audio(audio_path):

    # Transcribe audio
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)
    text = recognizer.recognize_google(audio)
    
    return text

# Step 3: Label the Transcriptions
After transcribing the video, you can manually label them as positive or negative.

In [26]:
import pandas as pd

def label_transcription(video_name, transcription, label):
    data = {'video_name': video_name, 'transcription': transcription, 'label': label}
    df = pd.DataFrame([data])
    
    return df

# Step 4: Automate the Process
You can automate the entire process for multiple videos.

In [27]:
def process_youtube_videos(video_urls, labels):
    all_transcriptions = []

    for idx, video_url in enumerate(video_urls):
        video_name = f'Youtube Videos/video_{idx}.mp4'
        download_youtube_video(video_url, output_path=video_name)

        audio_name = f'Audios/video_{idx}.wav'
        mp4_to_wav(video_name, output_path=audio_name)

        transcription = transcribe_audio(audio_name)
        df = label_transcription(video_name, transcription, labels[idx])
        all_transcriptions.append(df)

    result_df = pd.concat(all_transcriptions, ignore_index=True)
    return result_df


# Step 5: Youtube Video URLs
Putting youtube video URLs and manually labelling them positive and negative.

In [6]:
video_urls = [
    "https://www.youtube.com/watch?v=1aA1WGON49E",
    "https://www.youtube.com/watch?v=m2bn3SEfPmI",
    "https://www.youtube.com/watch?v=lzILoMjEpaE",
    "https://www.youtube.com/watch?v=5c3wWNsmLA0",
    "https://www.youtube.com/watch?v=uWRmBjFxttc",
    "https://www.youtube.com/watch?v=9zBCeS7wf7w",
    # Add more URLs here upto more than 50
]

labels = ['positive', 'negative', 'positive', 'negative', 'positive', 'negative']  # Manually assigned labels

result_df = process_youtube_videos(video_urls, labels)
print(result_df)


[youtube] Extracting URL: https://www.youtube.com/watch?v=1aA1WGON49E
[youtube] 1aA1WGON49E: Downloading webpage
[youtube] 1aA1WGON49E: Downloading ios player API JSON
[youtube] 1aA1WGON49E: Downloading web creator player API JSON
[youtube] 1aA1WGON49E: Downloading m3u8 information
[info] 1aA1WGON49E: Downloading 1 format(s): 251
[download] Destination: video_0.mp4
[download] 100% of    1.02MiB in 00:00:00 at 2.49MiB/s   
[youtube] Extracting URL: https://www.youtube.com/watch?v=m2bn3SEfPmI
[youtube] m2bn3SEfPmI: Downloading webpage
[youtube] m2bn3SEfPmI: Downloading ios player API JSON
[youtube] m2bn3SEfPmI: Downloading web creator player API JSON
[youtube] m2bn3SEfPmI: Downloading m3u8 information
[info] m2bn3SEfPmI: Downloading 1 format(s): 251
[download] Destination: video_1.mp4
[download] 100% of    1.30MiB in 00:00:02 at 667.50KiB/s 
[youtube] Extracting URL: https://www.youtube.com/watch?v=lzILoMjEpaE
[youtube] lzILoMjEpaE: Downloading webpage
[youtube] lzILoMjEpaE: Downloading 

# Step 6: Saving the Data
You can save the labeled transcriptions to a CSV file for further analysis.

In [9]:
result_df.to_csv('labeled_transcriptions.csv', index=False)
result_df.to_excel('labeled_transcriptions.xlsx', index=False)

# Choice of Language Model (Model Options):

1. BERT (Bidirectional Encoder Representations from Transformers):
    - Pros: Great for understanding the context of text due to its bidirectional nature. Strong performance on classification tasks.
    - Cons: May require substantial computational resources for fine-tuning.

2. T5 (Text-To-Text Transfer Transformer):
    - Pros: Versatile as it treats every NLP problem as a text generation task. Can be used for various tasks, including classification.
    - Cons: Potentially more complex as it generates text, which may be unnecessary for simple classification.

3. GPT (Generative Pre-trained Transformer):
    - Pros: Powerful at generating coherent text and understanding context.
    - Cons: Not inherently designed for classification tasks, which may require additional fine-tuning.

4. BART (Bidirectional and Auto-Regressive Transformers):
    - Pros: Combines the best of BERT and GPT. Good for understanding and generating text, effective for summarization and classification.
    - Cons: May be overkill for simple binary classification.

5. Flan-T5:
    - Pros: An instruction-tuned variant of T5, which has been fine-tuned on various tasks, potentially reducing the need for extensive fine-tuning.
    - Cons: Like T5, it is versatile but might be more complex than needed for binary classification.

# Recommended Model:
BERT is a solid choice for this binary classification task. It's specifically designed for tasks like sentiment analysis, where understanding the context and nuances in text is crucial.

# Pre-trained Model vs. Training from Scratch

1. Pre-trained Model:
    - Advantages:
        - Time-Efficient: Pre-trained models like BERT have been trained on massive datasets, so they already understand language nuances.
        - Performance: Fine-tuning a pre-trained model often leads to better performance because it builds on a strong foundation.
        - Resource-Efficient: Training from scratch requires significant computational power and time, which may not be feasible.

2. Training from Scratch:
    - Disadvantages:
        - Data Requirement: Requires a large and diverse dataset to achieve the performance of a pre-trained model.
        - Time and Resources: It’s time-consuming and resource-intensive to train a model from scratch.

3. Recommendation:
    - Fine-Tuning a Pre-trained Model: Fine-tuning BERT or another pre-trained model on your labeled transcriptions is the best approach. This method leverages the existing language understanding of BERT and adapts it to your specific task, reducing the time and computational power required.

# Training Process (Model Training and Monitoring):

1. Load Pre-trained BERT Model:
    - Use a library like Hugging Face's transformers to load the pre-trained BERT model and tokenizer.

2. Prepare Dataset:
    - Convert transcriptions and labels into a format suitable for BERT (e.g., tokenized input sequences and attention masks).

3. Fine-Tuning:
    - Fine-tune the BERT model on your labeled data using a classification head on top of BERT.
    - Use a suitable optimizer like AdamW and a learning rate scheduler.

4. Monitoring Metrics:
    - Accuracy: The percentage of correctly predicted labels.
    - Precision, Recall, F1-Score: Important for understanding the balance between false positives and false negatives.
    - Loss: Monitor training and validation loss to ensure the model is learning effectively without overfitting.

5. Evaluation:
    - Validation Set: Keep a portion of your data aside for validation to evaluate the model’s performance during training.
    - Confusion Matrix: Visualize to understand where the model is making mistakes.
    - Training Logs: Use tools like TensorBoard or Weights & Biases to monitor metrics in real-time.


In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer)


model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
print(model)

  from .autonotebook import tqdm as notebook_tqdm


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [2]:
import pandas as pd

result_df = pd.read_excel("labeled_transcriptions.xlsx")
result_df

Unnamed: 0,video_name,transcription,label
0,video_0.mp4,"Transcription sponsored by RenaissanceRe Wow, ...",positive
1,video_1.mp4,"Now to the US, and President Joe Biden today d...",negative
2,video_2.mp4,Some of the most successful people in the worl...,positive
3,video_3.mp4,Many Americans are saying the exact same thing...,negative
4,video_4.mp4,"My fellow Americans, it has been the honor of ...",positive
5,video_5.mp4,"Yeah, go ahead, please. Many times that the U....",negative
6,video_6.mp4,I absolutely loved the movie. The acting was s...,positive
7,video_7.mp4,The movie was a huge disappointment. The plot ...,negative
8,video_8.mp4,The product exceeded my expectations. It works...,positive
9,video_9.mp4,The product broke after just one use. It's a c...,negative


In [3]:
# Assuming you have a DataFrame named df
df_shuffled = result_df.sample(frac=1).reset_index(drop=True)
df_shuffled

Unnamed: 0,video_name,transcription,label
0,video_3.mp4,Many Americans are saying the exact same thing...,negative
1,video_9.mp4,The product broke after just one use. It's a c...,negative
2,video_4.mp4,"My fellow Americans, it has been the honor of ...",positive
3,video_8.mp4,The product exceeded my expectations. It works...,positive
4,video_5.mp4,"Yeah, go ahead, please. Many times that the U....",negative
5,video_52.mp4,The software is reliable and has significantly...,positive
6,video_34.mp4,I had a wonderful time at the spa. The service...,positive
7,video_17.mp4,"The event was poorly organized, and it was a w...",negative
8,video_0.mp4,"Transcription sponsored by RenaissanceRe Wow, ...",positive
9,video_36.mp4,The online course was informative and engaging...,positive


In [4]:
# Prepare the dataset
texts = df_shuffled['transcription'].tolist()
print(texts)

labels = [1 if label == 'positive' else 0 for label in df_shuffled['label']]
print(labels)

["Many Americans are saying the exact same thing about you that you should have warned them the virus was spreading like wildfire Through the month of February instead of holding rallies with thousands of people Why did you wait so long who you were and why did you not have social distancing until March 16th? Who you with? I'm Lijia Jiang with CBS News so If you look at what I did in terms of cutting off Or banning China from coming in But by the way, not Americans who are also nice and easy nice and easy just relax We cut it off people were amazed these gentlemen. Everybody was amazed that I did it. We were very early Oh, I'm the president and you know what? I just did and you know what? I just The virus was already here, okay How many people when I issued the ban how many cases of virus were in the United States when I issued the ban Do you know the number? No, no, how many pieces remember I said one person how many cases were here when I issued the ban No, no. No, do you have to do 

In [5]:
# Tokenize and encode inputs
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
print(encodings)

{'input_ids': [[101, 2116, 4841, 2024, 3038, 1996, 6635, 2168, 2518, 2055, 2017, 2008, 2017, 2323, 2031, 7420, 2068, 1996, 7865, 2001, 9359, 2066, 3748, 10273, 2083, 1996, 3204, 1997, 2337, 2612, 1997, 3173, 22867, 2007, 5190, 1997, 2111, 2339, 2106, 2017, 3524, 2061, 2146, 2040, 2017, 2020, 1998, 2339, 2106, 2017, 2025, 2031, 2591, 4487, 12693, 6129, 2127, 2233, 5767, 1029, 2040, 2017, 2007, 1029, 1045, 1005, 1049, 5622, 26541, 20613, 2007, 6568, 2739, 2061, 2065, 2017, 2298, 2012, 2054, 1045, 2106, 1999, 3408, 1997, 6276, 2125, 2030, 21029, 2859, 2013, 2746, 1999, 2021, 2011, 1996, 2126, 1010, 2025, 4841, 2040, 2024, 2036, 3835, 1998, 3733, 3835, 1998, 3733, 2074, 9483, 2057, 3013, 2009, 2125, 2111, 2020, 15261, 2122, 11218, 1012, 7955, 2001, 15261, 2008, 1045, 2106, 2009, 1012, 2057, 2020, 2200, 2220, 2821, 1010, 1045, 1005, 1049, 1996, 2343, 1998, 2017, 2113, 2054, 1029, 1045, 2074, 2106, 1998, 2017, 2113, 2054, 1029, 1045, 2074, 1996, 7865, 2001, 2525, 2182, 1010, 3100, 2129, 2116

In [6]:
# Convert to torch tensors
input_ids = torch.tensor(encodings['input_ids'])
print(input_ids)

attention_masks = torch.tensor(encodings['attention_mask'])
print(attention_masks)

labels = torch.tensor(labels)
print(labels)

tensor([[  101,  2116,  4841,  ...,  1010,  3531,   102],
        [  101,  1996,  4031,  ...,     0,     0,     0],
        [  101,  2026,  3507,  ...,     0,     0,     0],
        ...,
        [  101,  1996,  3784,  ...,     0,     0,     0],
        [  101,  1996, 10439,  ...,     0,     0,     0],
        [  101,  1996,  2833,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 1, 0, 0, 0])


In [7]:
# Create a custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

In [8]:
# Create dataset and split into training and validation sets
dataset = CustomDataset(input_ids, attention_masks, labels)
print(dataset)

train_size = int(0.8 * len(dataset))
print(train_size)

val_size = len(dataset) - train_size
print(val_size)

<__main__.CustomDataset object at 0x13478f130>
44
12


In [9]:
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
print(train_dataset, val_dataset)

<torch.utils.data.dataset.Subset object at 0x13478d090> <torch.utils.data.dataset.Subset object at 0x13478e0b0>


In [10]:
import os
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

os.makedirs("results", exist_ok=True)
os.makedirs("logs", exist_ok=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=50,
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',
    logging_steps=10,
)
print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=False,
evaluation_strategy=None,
fp16=

In [11]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
print(trainer)

<transformers.trainer.Trainer object at 0x13478d240>


In [12]:
vars(trainer)

{'args': TrainingArguments(
 _n_gpu=1,
 accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
 adafactor=False,
 adam_beta1=0.9,
 adam_beta2=0.999,
 adam_epsilon=1e-08,
 auto_find_batch_size=False,
 batch_eval_metrics=False,
 bf16=False,
 bf16_full_eval=False,
 data_seed=None,
 dataloader_drop_last=False,
 dataloader_num_workers=0,
 dataloader_persistent_workers=False,
 dataloader_pin_memory=True,
 dataloader_prefetch_factor=None,
 ddp_backend=None,
 ddp_broadcast_buffers=None,
 ddp_bucket_cap_mb=None,
 ddp_find_unused_parameters=None,
 ddp_timeout=1800,
 debug=[],
 deepspeed=None,
 disable_tqdm=False,
 dispatch_batches=None,
 do_eval=False,
 do_predict=False,
 do_train=False,
 eval_accumulation_steps=None,
 eval_delay=0,
 eval_do_concat_batches=True,
 eval_on_start=False,
 eval_steps=None,
 eval_strategy=no,
 eval_use_gather_o

In [13]:
# If you get error while running this cell then run the below command and restart the kernel.
# pip install -U transformers accelerate

# Train model
trainer.train()

  7%|▋         | 10/150 [04:11<1:05:06, 27.90s/it]

{'loss': 0.7695, 'grad_norm': 4.280591011047363, 'learning_rate': 1.0000000000000002e-06, 'epoch': 3.33}


 13%|█▎        | 20/150 [07:25<43:06, 19.90s/it]  

{'loss': 0.7676, 'grad_norm': 4.8491644859313965, 'learning_rate': 2.0000000000000003e-06, 'epoch': 6.67}


 20%|██        | 30/150 [10:38<38:19, 19.16s/it]

{'loss': 0.6944, 'grad_norm': 4.166291236877441, 'learning_rate': 3e-06, 'epoch': 10.0}


 27%|██▋       | 40/150 [13:55<35:59, 19.63s/it]

{'loss': 0.6139, 'grad_norm': 3.897453546524048, 'learning_rate': 4.000000000000001e-06, 'epoch': 13.33}


 33%|███▎      | 50/150 [17:12<33:23, 20.03s/it]

{'loss': 0.533, 'grad_norm': 6.091464519500732, 'learning_rate': 5e-06, 'epoch': 16.67}


 40%|████      | 60/150 [20:27<28:48, 19.21s/it]

{'loss': 0.4278, 'grad_norm': 7.434052467346191, 'learning_rate': 6e-06, 'epoch': 20.0}


 47%|████▋     | 70/150 [23:44<26:16, 19.71s/it]

{'loss': 0.2874, 'grad_norm': 5.71469783782959, 'learning_rate': 7.000000000000001e-06, 'epoch': 23.33}


 53%|█████▎    | 80/150 [27:03<23:29, 20.13s/it]

{'loss': 0.1731, 'grad_norm': 3.5043447017669678, 'learning_rate': 8.000000000000001e-06, 'epoch': 26.67}


 60%|██████    | 90/150 [30:18<19:17, 19.29s/it]

{'loss': 0.1165, 'grad_norm': 2.9528470039367676, 'learning_rate': 9e-06, 'epoch': 30.0}


 67%|██████▋   | 100/150 [33:36<16:32, 19.84s/it]

{'loss': 0.0845, 'grad_norm': 2.3191349506378174, 'learning_rate': 1e-05, 'epoch': 33.33}


 73%|███████▎  | 110/150 [36:55<13:25, 20.15s/it]

{'loss': 0.0552, 'grad_norm': 1.6318941116333008, 'learning_rate': 1.1000000000000001e-05, 'epoch': 36.67}


 80%|████████  | 120/150 [40:10<09:38, 19.27s/it]

{'loss': 0.0349, 'grad_norm': 0.762492835521698, 'learning_rate': 1.2e-05, 'epoch': 40.0}


 87%|████████▋ | 130/150 [43:29<06:36, 19.84s/it]

{'loss': 0.0197, 'grad_norm': 0.47781139612197876, 'learning_rate': 1.3000000000000001e-05, 'epoch': 43.33}


 93%|█████████▎| 140/150 [46:47<03:21, 20.10s/it]

{'loss': 0.0117, 'grad_norm': 0.24635422229766846, 'learning_rate': 1.4000000000000001e-05, 'epoch': 46.67}


100%|██████████| 150/150 [50:01<00:00, 19.18s/it]

{'loss': 0.0073, 'grad_norm': 0.16353172063827515, 'learning_rate': 1.5e-05, 'epoch': 50.0}


100%|██████████| 150/150 [50:05<00:00, 20.03s/it]

{'train_runtime': 3005.0748, 'train_samples_per_second': 0.732, 'train_steps_per_second': 0.05, 'train_loss': 0.3064325519402822, 'epoch': 50.0}





TrainOutput(global_step=150, training_loss=0.3064325519402822, metrics={'train_runtime': 3005.0748, 'train_samples_per_second': 0.732, 'train_steps_per_second': 0.05, 'total_flos': 452222126400000.0, 'train_loss': 0.3064325519402822, 'epoch': 50.0})

In [14]:
# Evaluate model
trainer.evaluate()

100%|██████████| 1/1 [00:01<00:00,  1.19s/it]


{'eval_loss': 0.7322338223457336,
 'eval_runtime': 7.0529,
 'eval_samples_per_second': 1.701,
 'eval_steps_per_second': 0.142,
 'epoch': 50.0}

# Results:
1. Choice of Model: BERT is recommended for its strong performance on text classification tasks.
2. Training Approach: Fine-tuning a pre-trained BERT model is efficient and effective for your binary classification task.
3. Monitoring: Track accuracy, precision, recall, F1-score, and loss during training to ensure your model is learning effectively.

This approach ensures a robust and efficient process for training a model to classify YouTube video transcriptions as positive or negative.

# --------------------------------------------------------

Once you have fine-tuned your BERT model and stored the output in the results folder, you can use this model to classify new text inputs as positive or negative. Here's how you can load the fine-tuned model and use it for classification:

# 1. Load the Fine-Tuned Model
You can load the fine-tuned model from the results directory where it was saved after training.

In [15]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model = BertForSequenceClassification.from_pretrained('./results/checkpoint-150')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Set the model to evaluation mode
model.eval()



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# 2. Prepare the Input Text
To classify a new text, you need to tokenize the input text and convert it into a format that BERT can process.

In [16]:
def preprocess_text(text):
    # Tokenize the input text
    encodings = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors="pt")
    return encodings

# 3. Perform the Classification
With the input text preprocessed, you can pass it through the model to get predictions.

In [17]:
def classify_text(model, encodings):
    # Get model predictions
    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
    
    # Map prediction to label
    label = 'positive' if predicted_class == 1 else 'negative'
    return label

# 4. Putting It All Together
You can create a function that takes a raw text input, preprocesses it, and returns the classification result.

In [20]:
def classify_new_text(text):
    encodings = preprocess_text(text)
    label = classify_text(model, encodings)
    return label

# Example usage
new_text = "This product is really amazing and works well!"
prediction = classify_new_text(new_text)
print(f'The sentiment of the text is: {prediction}')

The sentiment of the text is: positive


In [21]:
# Example usage
new_text = "This product is really cheap and works not well!"
prediction = classify_new_text(new_text)
print(f'The sentiment of the text is: {prediction}')

The sentiment of the text is: negative


# 5. Use in a Larger Application
You can integrate this classification function into a larger application, such as a Flask API, Django web app, to classify text inputs in real-time.

# Summary of Steps:
- Load the Model: Load the fine-tuned BERT model from the results directory.
- Preprocess the Text: Tokenize and encode the input text using the BERT tokenizer.
- Classify the Text: Pass the tokenized input through the model to get the prediction.
- Interpret the Output: Convert the model's output logits into a label (positive or negative).

This approach allows you to take any new text input and classify it using your fine-tuned model. The model is now ready for deployment in various applications where sentiment analysis or binary classification is required.