# PolitoSbobinatore

### Installation of Dependencies and Environment Setup
In the next block, we will install the necessary dependencies and set up the environment to run the subsequent code. This includes installing libraries such as Whisper, MoviePy, Transformers, Selenium, and others, as well as configuring the Chromium browser.

In [None]:
import time
total_time = 0
start_time = time.time()

!pip install git+https://github.com/openai/whisper.git 
!sudo apt update && sudo apt install ffmpeg
!pip install moviepy 
!pip install --upgrade transformers torch
!pip install torch 
!pip install transformers  
!pip install selenium webdriver_manager requests
!apt-get update -y
!apt-get install -y chromium-browser
!apt-get install -y chromedriver
!apt install chromium-chromedriver

end_time = time.time()
total_time = total_time + end_time - start_time

# Insert Your Polito Credentials and Course Information

Before running the script, fill in your PoliTo credentials and the course details:


In [38]:
# --- User configuration: replace every placeholder before running ---
# Do not share or commit this notebook with real credentials filled in.
USERNAME = "****"  # PoliTo account username
PASSWORD = "****"  # PoliTo account password
COURSE_TITLE = "*****"  # course name, looked up as link text on the portal page
LECTURE_TITLE = "******"  # lecture name, looked up as link text in the lecture list


# Automated Login and Video Download from Virtual Classroom

This script uses Selenium to automate the login to the Politecnico di Torino portal, navigate through the platform, and download the lecture video from the Virtual Classroom. The process includes:

1. Automatic login to the PoliTo portal with credentials.
2. Navigation to the specified course and desired lecture.
3. Extraction of the video URL from the lecture page.
4. Downloading the video as an MP4 file to the local system.

The browser is run in headless mode (without a graphical interface) for faster processing and without the need for manual interaction.


In [None]:
import time

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

start_time = time.time()

# Login page and the selectors of its form controls
URL = "https://idp.polito.it/home"
USERNAME_FIELD_ID = "username"
PASSWORD_FIELD_ID = "password"
LOGIN_BUTTON_CSS_SELECTOR = "button.login.button"

# Settings to use Chromium in headless mode
chrome_options = Options()
chrome_options.add_argument('--headless')  # Headless mode
chrome_options.add_argument('--no-sandbox')  # Necessary for running in a container environment
chrome_options.add_argument('--disable-dev-shm-usage')  # Necessary to avoid memory issues

# Configure the Chromium driver
driver = webdriver.Chrome(options=chrome_options)

try:
    # Everything that uses the driver runs inside the try block so the browser
    # is always closed by the finally clause, even on failure.
    driver.maximize_window()

    # Open the login page and submit the credentials
    driver.get(URL)
    driver.find_element(By.ID, USERNAME_FIELD_ID).send_keys(USERNAME)
    driver.find_element(By.ID, PASSWORD_FIELD_ID).send_keys(PASSWORD)
    driver.find_element(By.CSS_SELECTOR, LOGIN_BUTTON_CSS_SELECTOR).click()

    # Wait 10 seconds for the post-login page to load
    time.sleep(10)

    # A successful login lands either directly on the student teaching portal
    # or on a "MyPoli" page, from which the teaching portal is opened explicitly.
    page_title = driver.title
    if page_title == "Portale della Didattica - Studente":
        print("Login Completed ")
    elif "MyPoli" in page_title:
        print("Login Completed ")
        driver.get("https://didattica.polito.it/pls/portal30/sviluppo.pagina_studente_2016.main")
    else:
        print("Login Failed")
        # Stop here: continuing would only fail later with an unrelated
        # "element not found" error. The finally clause closes the browser.
        raise RuntimeError(f"Login failed: unexpected page title {page_title!r}")

    # Print the page title for verification
    print("Page title:", driver.title)

    # Navigate to the specified course using the course title
    driver.find_element(By.LINK_TEXT, COURSE_TITLE).click()

    # Click the link for the "Virtual classroom"
    driver.find_element(By.LINK_TEXT, "Virtual classroom").click()

    # Find and click the link whose text is exactly LECTURE_TITLE.
    # NOTE: a LECTURE_TITLE containing a single quote would break this XPath.
    driver.find_element(By.XPATH, f"//a[text()='{LECTURE_TITLE}']").click()

    # Wait 10 seconds for the video player to load
    time.sleep(10)

    # Find the video element using the class 'video-js' and get the video URL
    video_element = driver.find_element(By.CLASS_NAME, "video-js")
    video_url = video_element.find_element(By.TAG_NAME, "source").get_attribute("src")
    print(f"Video URL found: {video_url}")

    # Stream the download to disk in 1 MiB pieces instead of holding the whole
    # lecture in memory, and fail loudly on an HTTP error rather than saving an
    # error page as "video.mp4".
    with requests.get(video_url, stream=True, timeout=60) as video_response:
        video_response.raise_for_status()
        with open("video.mp4", "wb") as video_file:
            for video_chunk in video_response.iter_content(chunk_size=1024 * 1024):
                video_file.write(video_chunk)
    print("Video downloaded successfully!")

    # Print the page title for verification
    print("Page title:", driver.title)

finally:
    # Close the browser (no pause needed: headless mode has nothing to look at)
    driver.quit()

end_time = time.time()
total_time = total_time + end_time - start_time

# Extracting Audio from Video and Transcription with Whisper

This script uses `moviepy` to extract the audio from a video file and save it as an MP3 file, and loads OpenAI's Whisper model so that it is ready for the transcription step in the next section. The process includes:

1. Loading the video file in MP4 format.
2. Extracting the audio and saving it as an MP3 file.
3. Loading the Whisper model (`small`); the transcription itself is performed in the next section.
   
The result is an MP3 file that contains the audio from the video, ready for further processing or transcription.


In [None]:
import time

import whisper
from moviepy import VideoFileClip

start_time = time.time()

# Load the Whisper model ("small" checkpoint); it is used by the
# transcription step that follows.
model = whisper.load_model("small")

# Path to the MP4 file
input_file = "video.mp4"
# Name of the output MP3 file
output_file = "audio.mp3"

# Open the video as a context manager so the underlying reader is released
# as soon as the audio has been written.
with VideoFileClip(input_file) as video:
    if video.audio is None:
        # Without this check the failure would be an opaque AttributeError.
        raise ValueError(f"No audio track found in: {input_file}")
    # Extract the audio and save it as MP3
    video.audio.write_audiofile(output_file)

print(f"Conversion completed! File saved as: {output_file}")

# End the timer and add this cell's duration to the running total
end_time = time.time()
total_time = total_time + end_time - start_time
print(f"Total operation time: {total_time:.4f} seconds")


# Audio Transcription with Whisper

This script uses OpenAI's Whisper model to transcribe audio from an MP3 file into text. The process includes:

1. Loading the audio file in MP3 format.
2. Transcribing the audio using the Whisper model.
3. Saving the transcription as a text file.

The result is a text file containing the full transcription of the audio, ready for review or further processing.


In [41]:
start_time = time.time()

# Input recording and destination text file for this step.
# The output name is kept as spelled: the summarization step reads this file.
input_audio = "audio.mp3"
output_text = "trascription.txt"

# Run the loaded model on the recording; the transcript is under the "text" key
result = model.transcribe(input_audio)
transcript = result["text"]

# Persist the transcript as UTF-8 text
with open(output_text, mode="w", encoding="utf-8") as out_handle:
    out_handle.write(transcript)

# Add this step's duration to the running total
end_time = time.time()
total_time = total_time + end_time - start_time

# Text Summarization with BART

This script uses the BART model from Hugging Face's Transformers library to generate a summary of a given transcription. The process includes:

1. Selecting the compute device (GPU if available, otherwise CPU).
2. Loading the BART model for summarization (`facebook/bart-large-cnn`) and moving it to that device.
3. Loading the matching tokenizer; the transcription text itself is read in the next section.

This script sets up the model and tokenizer, ready to summarize the transcription text into a concise version.


In [None]:
import time

import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Checkpoint shared by the tokenizer and the model
BART_CHECKPOINT = 'facebook/bart-large-cnn'

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Time only the loading of the tokenizer and the model
start_time = time.time()

print("Loading the BART model for summarization...")
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
# NOTE: this rebinds `model`, which until now referred to the Whisper model.
model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
model = model.to(device)

end_time = time.time()

# The figure printed below is the cumulative notebook time, not just this cell
total_time = total_time + end_time - start_time
print(f"Time taken to load the model and process the file: {total_time:.4f} seconds")


# Chunk-Based Summarization of Transcription with BART

This script generates a summary for a large transcription by breaking it into smaller chunks, each of which is summarized separately. The process includes:

1. Splitting the transcription text into roughly equal chunks, sized by token count so that each chunk targets BART's 1024-token input limit.
2. Using the BART model to summarize each chunk independently.
3. Concatenating the individual summaries to create the final summarized text.
4. Saving the final summary as a text file.

This approach ensures that even long transcriptions can be summarized efficiently while maintaining coherence across chunks.


In [None]:
# Transcript file produced by the transcription step
file_path = "trascription.txt"

# Open and read the content of the file
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

transcription_text = content
start_time = time.time()  # Start the timer to measure the total operation time.

max_length = 1024  # Token limit used for the model input and the generated summary.
max_length_chunk = 900  # Target number of tokens per chunk (leaves headroom below max_length).


def split_into_chunks(text, n_chunks):
    """Split *text* into at most *n_chunks* consecutive pieces of near-equal length.

    The split is done by character position, so a word may be cut at a chunk
    boundary. Returns an empty list for empty text.
    """
    if not text:
        return []
    # Never ask for more chunks than there are characters (avoids a zero step).
    n_chunks = max(1, min(n_chunks, len(text)))
    # Ceiling division: the pieces cover the text without a leftover fragment.
    size = -(-len(text) // n_chunks)
    return [text[start:start + size] for start in range(0, len(text), size)]


# Count the tokens of the full transcription (no truncation) to size the chunks.
len_p = len(tokenizer(transcription_text, truncation=False)["input_ids"])

# Round the chunk count up so the average chunk does not exceed the target,
# and always use at least one chunk so short transcripts are handled too.
N = max(1, -(-len_p // max_length_chunk))
chunks = split_into_chunks(transcription_text, N)
if not chunks:
    print("The transcription is empty: nothing to summarize.")

# List to store the summaries of each chunk.
summaries = []

# Summarize each chunk independently.
for i, chunk in enumerate(chunks):
    # Tokenize the chunk and move it to the selected device. Truncation is a
    # safety net for chunks that still exceed the limit; no padding is needed
    # for a single sequence.
    prompt_tokenized = tokenizer(
        chunk, return_tensors="pt", truncation=True, max_length=max_length
    ).to(device)
    input_token_count = prompt_tokenized["input_ids"].shape[1]

    outputs = model.generate(
        input_ids=prompt_tokenized["input_ids"],
        attention_mask=prompt_tokenized["attention_mask"],
        max_length=max_length,  # Maximum length for the summary
        # Ask for a detailed summary, but never one longer than the input itself.
        min_length=min(300, input_token_count),
        do_sample=True,  # Sample instead of decoding greedily
        temperature=0.2,  # Low temperature keeps the sampling close to the most likely tokens
        top_p=0.9,  # Nucleus sampling: restrict to the top 90% probability mass
        length_penalty=1.0,  # Neutral length penalty
    )

    # Decode the generated summary and append it to the summaries list.
    new_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    summaries.append(new_summary)

    print(f"Processed chunk [{i + 1}/{len(chunks)}]")

# End the timer and add this step's duration to the running total.
end_time = time.time()
total_time = total_time + end_time - start_time

# Save all chunk summaries, one per line, followed by the total elapsed time.
joined_output_path = "final_summary.txt"
with open(joined_output_path, "w", encoding="utf-8") as file:
    for summary in summaries:
        file.write(summary)
        file.write("\n")
    file.write(f"The total operation is completed in: {int(total_time // 60)} minutes and {total_time % 60:.2f} seconds")


print(f"Summary completed! Final summaries saved in: {joined_output_path}")
