In [None]:
# Mount the user's Google Drive under /content/drive (requires the Colab runtime,
# since the helper comes from the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install git+https://github.com/openai/whisper.git
!pip install pyannote.audio
!pip install torchaudio
!pip install pydub
!pip install yt-dlp

In [None]:

# 📁 Imports
import getpass
import subprocess, json, os
from datetime import datetime

from IPython.display import Video, Audio
from pyannote.audio import Pipeline
from pydub import AudioSegment
import whisper

# Hugging Face access token, passed to the pyannote pipeline loader.
# It is read from the HF_TOKEN environment variable when set and prompted for
# otherwise, so the secret is never stored in the notebook itself.
token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

# Configuration
youtube_url = "https://www.youtube.com/watch?v=XXXXXXXXXXX"  # replace with the video to process
start_time = 0            # clip start, in seconds (passed to ffmpeg -ss)
end_time = 60             # clip end, in seconds (clip duration = end_time - start_time)
archive_title = "ARCHIVE_008"  # stored as "title" in the exported JSON
whisper_model = "large"   # model name given to whisper.load_model
min_speakers = 2          # lower bound handed to the diarization pipeline
max_speakers = 4          # upper bound handed to the diarization pipeline
target_len = 7            # target number of words per output chunk
tolerance = 3             # allowed deviation (in words) around target_len
pause_threshold = 0.8     # gap between two words (seconds) that forces a new chunk
verbose = True            # print warnings about unusual chunks while building the JSON

In [None]:

# Download the source video, then cut the configured range into a video clip
# and a mono 16 kHz WAV file used by the transcription and diarization steps.
video_path = "downloaded_video.mp4"
clipped_video = "clip_video.mp4"
wav_path = "clip_audio.wav"

# Fail early with a clear message instead of letting ffmpeg choke on a
# zero or negative duration.
duration = float(end_time) - float(start_time)
if duration <= 0:
    raise ValueError(
        f"end_time ({end_time}) must be greater than start_time ({start_time})"
    )

subprocess.run(
    [
        "yt-dlp",
        # A watch URL that carries a list= parameter would otherwise download
        # the whole playlist.
        "--no-playlist",
        # Prefer mp4-native streams and force an mp4 container when merging, so
        # the result really is written to `video_path` (yt-dlp otherwise appends
        # the merged container's extension to the requested name).
        "-f", "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4]/bv*+ba/b",
        "--merge-output-format", "mp4",
        "-o", video_path,
        youtube_url,
    ],
    check=True,
)

# Video clip: stream copy (no re-encode) of the requested range.
subprocess.run(
    ["ffmpeg", "-y", "-ss", str(start_time), "-i", video_path,
     "-t", str(duration), "-c", "copy", clipped_video],
    check=True,
)
# Audio clip: same range, downmixed to one channel at 16 kHz, video dropped.
subprocess.run(
    ["ffmpeg", "-y", "-ss", str(start_time), "-i", video_path,
     "-t", str(duration), "-ac", "1", "-ar", "16000", "-vn", wav_path],
    check=True,
)

# The full download is no longer needed once both clips exist.
os.remove(video_path)

# embed=True inlines the clip as a data URI so playback does not depend on the
# frontend being able to fetch a local file (needed on Colab).
display(Video(clipped_video, embed=True))
display(Audio(wav_path))
print(f"✅ Video and audio prepared: {clipped_video}, {wav_path}")

In [None]:

# Transcription with Whisper.
# Word-level timestamps are requested because the chunking step further down
# works on the individual words of each segment.
model = whisper.load_model(name=whisper_model)
result = model.transcribe(audio=wav_path, word_timestamps=True)
segments = result["segments"]
print(f"‚úÖ {len(segments)} segments transcrits")


In [None]:

# Speaker diarization of the audio clip.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=token)
if pipeline is None:
    # from_pretrained reports a download/authorization problem by returning
    # None; stop here with an actionable message instead of failing later with
    # "'NoneType' object is not callable".
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Check that the Hugging Face "
        "token is valid and that the model's user conditions have been accepted."
    )

dz = pipeline(
    {"uri": "audio", "audio": wav_path},
    min_speakers=min_speakers,
    max_speakers=max_speakers
)
print("✅ Diarisation terminée")

# Time index: one {"start", "end", "speaker"} entry per diarized speaker turn,
# in the order yielded by itertracks.
speaker_timeline = [
    {"start": turn.start, "end": turn.end, "speaker": speaker}
    for turn, _, speaker in dz.itertracks(yield_label=True)
]


In [None]:

# Skeleton of the exported JSON document. "segments" and "overlaps" are filled
# in by the dynamic chunking step (~7 words per chunk); the instrumental entries
# are placeholders to be completed by hand.
final_data = dict(
    title=archive_title,
    instrumentals=[
        dict(start=0.0, end=60.0, title="PLACEHOLDER 1", artist="TODO"),
        dict(start=60.0, end=120.0, title="PLACEHOLDER 2", artist="TODO"),
        dict(start=120.0, end=180.0, title="PLACEHOLDER 3", artist="TODO"),
    ],
    reloads=[],
    segments=[],
    overlaps=dict(manual=[], auto=[]),
)

def find_speaker(start, end, timeline=None):
    """Return the speaker label that best matches the interval [start, end].

    The speaker whose turns overlap the interval for the longest total time
    wins; on a tie the speaker encountered first in the timeline is kept.
    Using the overlap (instead of only testing where ``start`` falls) means a
    chunk that begins in a short gap between two turns is still attributed to
    the speaker talking during the rest of it.

    Args:
        start: Interval start time.
        end: Interval end time.
        timeline: Optional list of ``{"start", "end", "speaker"}`` dicts.
            Defaults to the notebook-level ``speaker_timeline``.

    Returns:
        The speaker label, or ``"unknown"`` when no turn matches.
    """
    if timeline is None:
        timeline = speaker_timeline

    # Total overlap per speaker; dict insertion order follows the timeline,
    # which gives max() a deterministic tie-break.
    overlap_by_speaker = {}
    for turn in timeline:
        overlap = min(end, turn["end"]) - max(start, turn["start"])
        if overlap > 0:
            label = turn["speaker"]
            overlap_by_speaker[label] = overlap_by_speaker.get(label, 0.0) + overlap
    if overlap_by_speaker:
        return max(overlap_by_speaker, key=overlap_by_speaker.get)

    # Zero-length intervals have no measurable overlap: fall back to the turn
    # that contains the start time.
    for turn in timeline:
        if turn["start"] <= start < turn["end"]:
            return turn["speaker"]
    return "unknown"

def check_speaker_consistency(start, end, timeline=None):
    """Tell whether at most one speaker is active during [start, end].

    A turn counts as active when it strictly overlaps the interval. An interval
    during which nobody speaks is considered consistent: it must not be reported
    as containing several speakers.

    Args:
        start: Interval start time.
        end: Interval end time.
        timeline: Optional list of ``{"start", "end", "speaker"}`` dicts.
            Defaults to the notebook-level ``speaker_timeline``.

    Returns:
        ``True`` when zero or one distinct speaker overlaps the interval,
        ``False`` when two or more do.
    """
    if timeline is None:
        timeline = speaker_timeline
    active_speakers = {
        turn["speaker"]
        for turn in timeline
        if turn["end"] > start and turn["start"] < end
    }
    return len(active_speakers) <= 1

def split_by_diction(words, target_len=7, tolerance=3, pause_threshold=0.8):
    """Split a sequence of timed words into short chunks following the diction.

    A new chunk is started:
      * before a word that begins more than ``pause_threshold`` after the end
        of the previous word,
      * after a word ending with a punctuation mark,
      * once the current chunk reaches ``target_len + tolerance`` words.

    A trailing remainder shorter than ``max(1, target_len - tolerance)`` words
    is merged into the previous chunk, but only when that neither bridges a
    pause longer than ``pause_threshold`` nor pushes the chunk past the
    maximum length.

    Args:
        words: Iterable of words, each either a dict or an object exposing
            ``word``, ``start`` and ``end``.
        target_len: Target number of words per chunk.
        tolerance: Allowed deviation around ``target_len``.
        pause_threshold: Gap between two words that forces a split.

    Returns:
        A list of chunks, each chunk being a list of the original word items.
    """
    def field(item, name):
        # Words may be plain dicts or attribute-style objects.
        return item[name] if isinstance(item, dict) else getattr(item, name)

    max_words = target_len + tolerance
    min_words = max(1, target_len - tolerance)
    # "\u2026" is the single-character ellipsis, written as an escape so the
    # literal cannot be damaged by an encoding round-trip of the notebook.
    punctuation = ('.', '!', '?', ',', ';', '\u2026')

    chunks = []
    current = []
    last_end = None
    for word in words:
        text = field(word, "word").strip()
        start = field(word, "start")
        # A long silence before this word closes the chunk in progress.
        if current and last_end is not None and start - last_end > pause_threshold:
            chunks.append(current)
            current = []
        current.append(word)
        last_end = field(word, "end")
        if len(current) >= max_words or text.endswith(punctuation):
            chunks.append(current)
            current = []

    if current:
        previous = chunks[-1] if chunks else None
        can_merge = (
            previous is not None
            and len(current) < min_words
            and len(previous) + len(current) <= max_words
            and field(current[0], "start") - field(previous[-1], "end") <= pause_threshold
        )
        if can_merge:
            previous.extend(current)
        else:
            chunks.append(current)
    return chunks

# Cut every Whisper segment into short chunks, label each chunk with a speaker
# and append it to final_data["segments"]. Chunks during which more than one
# speaker is active are also copied to final_data["overlaps"]["auto"].
segment_id = 0
for seg in segments:
    words = seg["words"]
    for word in words:
        word.pop("seek", None)
        word["chorus"] = False
    chunks = split_by_diction(words, target_len=target_len, tolerance=tolerance, pause_threshold=pause_threshold)
    for chunk in chunks:
        # Dedicated names: reusing start_time / end_time here would overwrite
        # the clip range from the configuration cell and break any re-run of
        # the download cell.
        chunk_start = chunk[0]["start"]
        chunk_end = chunk[-1]["end"]
        spk = find_speaker(chunk_start, chunk_end)
        segment = {
            "id": segment_id,
            "start": chunk_start,
            "end": chunk_end,
            # Whisper words already carry their own leading whitespace, so they
            # are concatenated as-is; joining with " " would double every space.
            "text": "".join(w["word"] for w in chunk).strip(),
            "words": chunk,
            "speaker": spk,
            "instrumental": None
        }
        final_data["segments"].append(segment)
        single = check_speaker_consistency(chunk_start, chunk_end)
        if verbose and (len(chunk) > target_len + tolerance or len(chunk) < max(1, target_len - tolerance)):
            print(f'⚠️ segment {segment_id} longueur inhabituelle: {len(chunk)} mots')
        if verbose and not single:
            print(f'⚠️ segment {segment_id} contient plusieurs locuteurs')
        if not single:
            # Shallow copy: the overlap entry shares the word list with the segment.
            final_data["overlaps"]["auto"].append(segment.copy())
        segment_id += 1

# Export the assembled document to a timestamped JSON file in the working directory.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"archive_output_{timestamp}.json"
# Explicit UTF-8 plus ensure_ascii=False keeps accented transcript text readable
# in the file instead of \uXXXX escapes, independent of the platform's default
# encoding.
with open(filename, "w", encoding="utf-8") as f:
    json.dump(final_data, f, indent=2, ensure_ascii=False)

print(f"✅ Fichier exporté : {filename}")
