# Import libraries

In [None]:
# Built In libraries
import json
from pathlib import Path # used to get file names

# Misc Libraries
import ffmpeg # Audio processor
import numpy as np

# Hugging Face and Pytorch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, QuantoConfig
import torch

In [None]:
# Hugging Face / PyTorch runtime settings

# Target device
# - Hugging Face code is device-agnostic: it can run on CUDA GPUs, MPS GPUs, or the CPU.
device_map = "cuda:0"  # String device spec, the newer Hugging Face way of choosing the device.
device = torch.device("cuda")  # PyTorch device object. Change to suit your hardware.


# Quantization simplifies the model by storing its weights as int8
# - Trades precision for a smaller memory footprint & faster running speed
quantization_config = QuantoConfig(weights="int8")

# Loading Of data
This notebook loads the data from the "data/video" directory. Below you set the names of the video files you want transcribed by the program. The notebook also checks if the video has already been transcribed and will skip if that is the case. After the videos are transcribed, the transcripts will be saved to the transcripts.json file.

In [None]:
# list of files to be converted
# - Give file names only; fileConverter prepends its input directory
#   ("data/video/" by default) when it reads them.
# - The name without its extension is the key used in the transcripts
#   dictionary, so a video whose key is already present is not transcribed again.
video_filenames = [
    "WSJ_VisionProReview.mp4",
    "MarquesBrownlee.mkv",
    "BrianTong-Review.mkv",
    "iJustine-Unboxing&Review.mkv"
]

In [None]:
# The transcripts file holds all the transcripts that the ASR model has transcribed,
# keyed by video name (file name without its extension).
data_file = "data/transcripts.json"

try:
    with open(data_file, 'r', encoding="utf-8") as f:
        t_data = json.load(f)
except FileNotFoundError:
    # First run: nothing has been transcribed yet, so start with an empty
    # dictionary instead of crashing. The file is created when the
    # transcripts are saved later in the notebook.
    t_data = {}

In [None]:
# Work out which of the requested videos do not have a transcript yet.

# Names that already have an entry in the transcripts dictionary
existing_files = list(t_data)

# Keep the videos whose bare name (no directories, no extension) is not among them
to_transcribe = [
    video
    for video in video_filenames
    if Path(video).stem not in existing_files
]

# Display the result in the notebook
to_transcribe

## Convert videos to audio
In this step, the files that need transcribing are converted from their video format into an appropriate audio format. It is possible to transcribe a video directly. However, ASR models can be picky about the video format, so I recommend adding this step to convert all videos to the same audio format.

In [None]:
# Does the actual converting
def fileConverter(filename: str, in_dir="data/video/", out_dir="data/audio/"):
    """Convert one video file to a ``.wav`` audio file using ffmpeg.

    Args:
        filename: Name of the video file, relative to ``in_dir``.
        in_dir: Directory that holds the source video.
        out_dir: Directory the audio file is written to. It is created if it
            does not exist yet.

    Returns:
        The path of the written audio file as a string: ``out_dir`` joined
        with the stem of ``filename`` plus ``.wav``. An existing file at that
        path is overwritten.
    """
    # Join with pathlib so the directories work with or without a trailing slash
    in_filename = Path(in_dir) / filename

    # Same base name (directories and extension dropped), but as .wav
    output_dir = Path(out_dir)
    output_filename = output_dir / (Path(filename).stem + ".wav")

    # ffmpeg will not create a missing output directory on its own
    output_dir.mkdir(parents=True, exist_ok=True)

    # Fix file formats using ffmpeg
    # NOTE The calls below chain together into a single ffmpeg command
    (
        ffmpeg  # Launch ffmpeg
        .input(str(in_filename))  # Input the file
        .output(str(output_filename))  # Output the file
        .overwrite_output()  # Overwrite the file if it already exists
        .run()  # Runs the commands above
    )

    return str(output_filename)

# Convert every pending video and collect the paths of the resulting audio files
output_files = list(map(fileConverter, to_transcribe))

## Transcribe audio to text
An automatic speech recognition (ASR) model is used to transcribe audio to text. In this case I used the OpenAI Whisper model; however, there are many other models available. The Hugging Face pipelines API makes switching to a different model as easy as changing the model name. Because of the pipeline, there should be no change in how the program is used.

In [None]:
# Load the ASR pipeline from hugging face
# Whisper processes audio in 30-second windows. Setting chunk_length_s makes the
# pipeline split longer recordings into chunks and merge the text afterwards, so
# a full-length review video is transcribed from start to finish.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    chunk_length_s=30,
    device=device,
)

In [None]:

# Transcribe every converted audio file
for audio_path in output_files:

    # Store the ASR result under the file's bare name (no directories, no extension)
    t_data[Path(audio_path).stem] = asr_pipe(audio_path)

In [None]:
# Write the transcripts dictionary back to the transcripts file as JSON
with open("data/transcripts.json", mode='w') as transcripts_out:
    json.dump(t_data, transcripts_out)

## Sentiment Analysis
For sentiment analysis I used the BART model. The MNLI version of the model lets you ask how related the text is to the keywords you input. In this case I asked how positive and negative the text was, and whether the reviewer found the Apple Vision Pro practical or not. After the model makes its predictions for the text, the probabilities are saved along with the transcripts in the transcripts file. The last notebook cell in this section provides the average sentiment for all of the transcripts in the transcripts file.

In [None]:
# Zero-shot classification pipeline (BART, MNLI version) used for the sentiment scoring
classify_pipe = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

In [None]:
# The function below allows you to input text & labels/keywords you are looking for in the text.
def sentiment_analysis(data, labels=('positive', 'negative'), output_key="sentiment", classifier=None):
    """Score the text of every entry in ``data`` against ``labels``.

    Args:
        data: Dictionary of entries. Each entry must be a dict with a
            ``'text'`` key. The entries are updated in place.
        labels: Candidate labels handed to the classifier.
        output_key: Key under which each entry's ``{label: score}`` dict is stored.
        classifier: Callable invoked as ``classifier(text, labels)`` that
            returns a dict with parallel ``'labels'`` and ``'scores'`` lists.
            Defaults to the notebook's ``classify_pipe``.

    Returns:
        The same ``data`` object, with the scores (rounded to 3 decimals)
        added to every entry in the order the labels were given.
    """
    if classifier is None:
        classifier = classify_pipe

    # Work on a list copy so any iterable of labels is accepted
    labels = list(labels)

    # NOTE(review): long transcripts may exceed the classifier's input limit -
    # confirm how the pipeline truncates them.
    for entry in data.values():

        # Classify the text
        result = classifier(entry['text'], labels)

        # The zero-shot pipeline returns its labels sorted by score, NOT in the
        # order they were passed in. Pair each score with the label the
        # classifier reports, rather than with the input order.
        score_by_label = dict(zip(result['labels'], result['scores']))

        # Round and save the scores, keeping the caller's label order
        entry[output_key] = {label: round(score_by_label[label], 3) for label in labels}

    return data

In [None]:
# Sentiment: do the reviewers look at the Apple Vision Pro positively or negatively?
t_data = sentiment_analysis(t_data)

# Practicality: do the reviewers find the Apple Vision Pro practical?
t_data = sentiment_analysis(
    t_data,
    output_key="practicality",
    labels=['practical', 'unpractical'],
)

# Persist the scored transcripts as JSON
with open("data/transcripts.json", mode='w') as scored_out:
    json.dump(t_data, scored_out)

In [None]:
# Collect the "practical" and "positive" scores of every transcript
practicality_scores = [entry['practicality']['practical'] for entry in t_data.values()]
sentiment_scores = [entry['sentiment']['positive'] for entry in t_data.values()]

# Report the mean of each, rounded to 4 decimals
mean_sentiment = round(np.mean(sentiment_scores), 4)
mean_practicality = round(np.mean(practicality_scores), 4)
print("Sentiment", mean_sentiment)
print("Practicality", mean_practicality)

## Summarization
To summarize the videos, I used a large language model (LLM) to summarize the top 3 good and bad things the reviewer said about the Apple Vision Pro. There are many LLMs available to download from Hugging Face, and for this notebook I chose the Mistral model. This process does not use the pipelines API but the LLM-specific Hugging Face process. Just like with pipelines, using a different model is as simple as changing the model name.

In [None]:
# Reload the transcripts dictionary from the transcripts file
data_file = "data/transcripts.json"
with open(data_file, mode='r') as transcripts_in:
    t_data = json.load(transcripts_in)

In [None]:
# Load the LLM model
llm_model = "mistralai/Mistral-7B-Instruct-v0.2"

# The tokenizer only processes text and has no weights to place on a device,
# so it is loaded without a device_map.
tokenizer = AutoTokenizer.from_pretrained(llm_model)

# Place the model with the notebook-wide `device_map` setting so the device is
# configured in one place only.
# To load a quantized model, additionally pass quantization_config=quantization_config
model = AutoModelForCausalLM.from_pretrained(llm_model, device_map=device_map)

In [None]:
# Run the model on the data
# WARNING - This will take a long time to run
for entry in t_data.values():
    # Each entry is one video's record; its transcript is stored under 'text'
    current_transcript = entry['text']

    # Chat-style prompt: the instruction, an assistant acknowledgement, and
    # finally the transcript itself as the last user turn.
    # Both turns ask for the same number of points (3).
    message = [
        {
            "role": "user",
            "content": "I need you to summarize the top 3 good and bad things the reviewer said about the Apple Vision Pro"
        },
        {
            "role": "assistant",
            "content": "I will summarize the top 3 good and bad things the reviewer said about the Apple Vision Pro."
        },
        {
            "role": "user",
            "content": current_transcript
        }
    ]

    # Text must be tokenized for the model to interpret.
    # The token ids are moved to whichever device the model was loaded on.
    tokenized_chat = tokenizer.apply_chat_template(
        message, add_generation_prompt=True, tokenize=True, return_tensors="pt"
    ).to(model.device)
    input_length = tokenized_chat.shape[1]

    # This generates the response from the model.
    # generate() takes no device_map; the model already lives on its device.
    generated_ids = model.generate(tokenized_chat, max_new_tokens=128)

    # Save the response to the transcripts dictionary.
    # The slice drops the prompt tokens so only newly generated text is decoded.
    entry['LLM_response'] = tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0]

In [None]:
# Store the transcripts, now including the LLM responses, in the transcripts file
with open("data/transcripts.json", mode='w') as summaries_out:
    json.dump(t_data, summaries_out)