In [1]:
"""
Contact: jean-baptiste.daudet@proximity.fr
Owner: JB Daudet
Creation date: 22/05/2024
Last modification date: 22/05/2024

Description:
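  Capture data on how people do their chores: download the audio of YouTube
  videos, transcribe it in 30-second chunks, and summarise each chunk in one sentence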

Outputs:
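  transcriptions_<video_id>.csv: transcription chunks of a single video
  transcriptions.csv: transcription chunks of all processed videos
  transcriptions_with_summary.csv: transcriptions.csv plus a 'summary' column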
"""

'\nContact: jean-baptiste.daudet@proximity.fr\nOwner: JB Daudet\nCreation date: 22/05/2024\nLast modification date: 22/05/2024\n\nDescription:\n  Capture data around how people are doing their chores\n\nOutputs:\n  TBD\n'

### IMPORTS

In [2]:
import os
import tempfile
from urllib.parse import parse_qs, urlparse

import pandas as pd
import pytube
import speech_recognition as sr
from pydub import AudioSegment



In [7]:


def download_youtube_video(video_url, output_path='output'):
    """Download the first audio-only stream of a YouTube video.

    Args:
        video_url: URL of the video, handed to ``pytube.YouTube``.
        output_path: Directory the stream is downloaded into. It is created
            (including missing parents) when it does not exist yet.

    Returns:
        The value returned by the stream's ``download`` call; the rest of this
        notebook uses it as the path of the downloaded audio file.

    Raises:
        ValueError: If the audio-only query yields no stream for the video.
    """
    # exist_ok=True replaces the separate exists() check, which was racy and
    # redundant.
    os.makedirs(output_path, exist_ok=True)

    yt = pytube.YouTube(video_url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError(f"No audio-only stream available for {video_url}")

    return audio_stream.download(output_path)

def convert_audio_to_wav(audio_file, output_format='wav'):
    """Re-encode an audio file and save it next to the original.

    The output path is the input path with its extension replaced by
    ``output_format``.

    Args:
        audio_file: Path of the audio file to load with ``AudioSegment.from_file``.
        output_format: Target format, used both as the export format and as
            the new file extension.

    Returns:
        The path of the exported file.
    """
    segment = AudioSegment.from_file(audio_file)

    # Keep directory and base name; only the extension changes.
    stem, _old_extension = os.path.splitext(audio_file)
    target_path = f"{stem}.{output_format}"

    segment.export(target_path, format=output_format)
    return target_path

def transcribe_audio_chunk(recognizer, audio_chunk):
    """Transcribe one audio chunk, mapping recognition failures to placeholders.

    Args:
        recognizer: Object whose ``recognize_google`` method performs the
            transcription.
        audio_chunk: Audio data passed unchanged to ``recognize_google``.

    Returns:
        The recognised text, ``"(Unintelligible)"`` when the recogniser raises
        ``sr.UnknownValueError``, or ``"(RequestError: ...)"`` carrying the
        error message when it raises ``sr.RequestError``. Any other exception
        propagates to the caller.
    """
    try:
        return recognizer.recognize_google(audio_chunk)
    except sr.UnknownValueError:
        # Placeholder keeps the chunk's row in the output.
        return "(Unintelligible)"
    except sr.RequestError as request_error:
        return f"(RequestError: {request_error})"

def transcribe_audio(audio_file, video_url, chunk_length=30000):
    """Split a WAV file into fixed-length chunks and transcribe each chunk.

    Every chunk is exported to a temporary WAV file, read back through
    ``sr.AudioFile`` and transcribed with ``transcribe_audio_chunk``.

    Args:
        audio_file: Path of the WAV file, loaded with ``AudioSegment.from_wav``.
        video_url: Stored unchanged in every returned row.
        chunk_length: Chunk size in the units used to slice the audio segment.
            Offsets are divided by 1000 to obtain seconds, i.e. milliseconds.
            Must be positive.

    Returns:
        A list with one dict per chunk, with keys ``'start_time'`` (chunk
        offset in seconds), ``'text'`` and ``'video_url'``.

    Raises:
        ValueError: If ``chunk_length`` is not positive.
    """
    if chunk_length <= 0:
        # A negative step would make range() empty and silently return nothing.
        raise ValueError(f"chunk_length must be positive, got {chunk_length}")

    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_file)
    transcriptions = []

    # A private temporary directory keeps chunk files out of the working
    # directory and removes them even if export or transcription raises.
    with tempfile.TemporaryDirectory() as chunk_dir:
        for offset_ms in range(0, len(audio), chunk_length):
            chunk_path = os.path.join(chunk_dir, f"chunk_{offset_ms}.wav")
            audio[offset_ms:offset_ms + chunk_length].export(chunk_path, format="wav")

            with sr.AudioFile(chunk_path) as source:
                audio_data = recognizer.record(source)

            transcriptions.append({
                'start_time': offset_ms / 1000,  # Convert to seconds
                'text': transcribe_audio_chunk(recognizer, audio_data),
                'video_url': video_url,
            })

            # Remove each chunk as soon as it is processed to limit disk use.
            os.remove(chunk_path)

    return transcriptions


In [12]:
def _extract_video_id(video_url):
    """Return the YouTube video ID contained in ``video_url``."""
    parsed = urlparse(video_url)

    # Standard watch URLs carry the ID in the `v` query parameter; parsing the
    # query drops any other parameters (e.g. `&t=42s`).
    ids = parse_qs(parsed.query).get('v')
    if ids:
        return ids[0]

    # Otherwise fall back to the last path segment (e.g. youtu.be/<id>).
    tail = parsed.path.rstrip('/').rsplit('/', 1)[-1]
    if tail in ('', 'watch'):
        raise ValueError(f"Cannot extract a video ID from {video_url!r}")
    return tail


def main(video_urls):
    """Download, transcribe and export every video in ``video_urls``.

    For each URL the audio is downloaded, converted to WAV and transcribed.
    The rows are written to ``transcriptions_<video_id>.csv`` and appended to
    ``transcriptions.csv`` in the current working directory. The downloaded
    audio and the WAV file are deleted afterwards, even when a step fails.

    NOTE: rows are appended to ``transcriptions.csv``, so running this again
    on the same URLs adds the same rows a second time.

    Args:
        video_urls: Iterable of YouTube video URLs.

    Raises:
        ValueError: If no video ID can be extracted from a URL.
    """
    main_csv_file = 'transcriptions.csv'

    for video_url in video_urls:
        print(f"Processing video: {video_url}")

        # Extract the ID first so a malformed URL fails before any download.
        video_id = _extract_video_id(video_url)
        individual_csv_file = f'transcriptions_{video_id}.csv'

        audio_file = download_youtube_video(video_url)
        wav_file = None
        try:
            wav_file = convert_audio_to_wav(audio_file)
            transcriptions = transcribe_audio(wav_file, video_url)

            # Save the individual transcription file
            df = pd.DataFrame(transcriptions)
            df.to_csv(individual_csv_file, index=False)

            # Append to the main transcription file. Writing the frame directly
            # avoids re-reading the CSV through locale-encoded text handles.
            if not df.empty:
                needs_header = (
                    not os.path.exists(main_csv_file)
                    or os.path.getsize(main_csv_file) == 0
                )
                df.to_csv(main_csv_file, mode='a', header=needs_header, index=False)
        finally:
            # Remove the temporary files. The set collapses the case where
            # both paths are identical; a missing file is skipped.
            for temp_file in {audio_file, wav_file}:
                if temp_file and os.path.exists(temp_file):
                    os.remove(temp_file)

    print("Transcription completed and saved to individual and main CSV files.")

if __name__ == "__main__":
    # YouTube videos to download and transcribe. Each entry is a watch URL
    # whose `v=` query parameter holds the video ID; main() uses that ID to
    # name the per-video CSV file.
    video_urls = [
        "https://www.youtube.com/watch?v=2Pcrr6I1WLA",
        "https://www.youtube.com/watch?v=6e5IMM1mrTc",
        "https://www.youtube.com/watch?v=9NGpZ3QL7go",
        "https://www.youtube.com/watch?v=n_O_b3Xfvkc",
        "https://www.youtube.com/watch?v=fJqZb5RejUs",
        "https://www.youtube.com/watch?v=ajr_Jc5GpTE"
    ]
    # Process every video; results go to the per-video CSV files and to
    # transcriptions.csv.
    main(video_urls)

Processing video: https://www.youtube.com/watch?v=2Pcrr6I1WLA
Processing video: https://www.youtube.com/watch?v=6e5IMM1mrTc
Processing video: https://www.youtube.com/watch?v=9NGpZ3QL7go
Processing video: https://www.youtube.com/watch?v=n_O_b3Xfvkc
Processing video: https://www.youtube.com/watch?v=fJqZb5RejUs
Processing video: https://www.youtube.com/watch?v=ajr_Jc5GpTE
Transcription completed and saved to individual and main CSV files.


# Add a one-sentence summary of each transcribed chunk with ChatGPT (OpenAI API)

In [3]:
from openai import OpenAI

import time

# Set up the OpenAI client.
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is stored in the notebook.
# SECURITY: a key used to be hardcoded in this cell; treat it as compromised
# and revoke it.
# NOTE(review): if the variable is unset, None is passed to OpenAI(); the
# client is expected to reject a missing key - confirm with the installed
# openai version.
openai_api_key = os.environ.get("OPENAI_API_KEY")

client = OpenAI(api_key=openai_api_key)

In [7]:
def summarize_text(text, max_retries=5, retry_delay=10):
    """Ask the chat model for a one-sentence summary of ``text``.

    Args:
        text: Text to summarise. ``None``, NaN and blank strings are not sent
            to the API.
        max_retries: Maximum number of API attempts (at least one is made).
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The content of the first choice of the chat completion, or an empty
        string when there is no text to summarise.

    Raises:
        Exception: The error of the last attempt, once all attempts failed.
    """
    # `text != text` is only true for NaN, which pandas uses for empty CSV cells.
    is_missing = text is None or (isinstance(text, float) and text != text)
    if is_missing or not str(text).strip():
        return ""

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[{'role':'user', 'content':f"sum up in one sentence the following text: {text}"}])
            return response.choices[0].message.content

        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        except Exception:
            if attempt == attempts:
                # Give up instead of retrying forever on a permanent error.
                raise
            print(f"let's have a {retry_delay}sec. nap, shall we?")
            time.sleep(retry_delay)

In [8]:

def add_summaries_to_transcriptions(file_in, file_out):
    """Add a ``summary`` column to a transcription CSV.

    Args:
        file_in: Path of the CSV to read; it must contain a ``text`` column.
        file_out: Path the CSV with the extra ``summary`` column is written to
            (without the index).
    """
    (
        pd.read_csv(file_in)
        # One summarize_text call per row of the `text` column.
        .assign(summary=lambda frame: frame['text'].apply(summarize_text))
        .to_csv(file_out, index=False)
    )
    print(f"Summaries added and saved to {file_out}.")

# Summarise every chunk of transcriptions.csv and write the result to a new
# file, transcriptions_with_summary.csv.
add_summaries_to_transcriptions('transcriptions.csv', 'transcriptions_with_summary.csv')

Summaries added and saved to transcriptions_with_summary.csv.
