In [1]:
import os
import re
import glob
import subprocess
import requests
import ast
import webvtt
import json
from dotenv import load_dotenv
from openai import OpenAI
from ast import literal_eval

In [2]:
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", None)

In [3]:
llm_client = OpenAI(api_key=OPENAI_API_KEY)

### Download init files (video + audio)

In [None]:
url = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/init-f3-v1.mp4"

with open(os.path.join('video', 'init-f3-v1.mp4'), 'wb') as out_file:
    response = requests.get(url)
    out_file.write(response.content)

url = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/init-f1-a1.mp4"

with open(os.path.join('video', 'init-f1-a1.mp4'), 'wb') as out_file:
    response = requests.get(url)
    out_file.write(response.content)

### Download all m4s video segments

In [None]:
BASE_URL = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/"
for i in range(1, 233):
    filename = f"seg-{i}-f3-v1.m4s"
    url = os.path.join(BASE_URL, filename)
    response = requests.get(url)
    with open(os.path.join('video/video_segments', filename), 'wb') as out_file:
        out_file.write(response.content)

### Download all m4s audio segments

In [None]:
BASE_URL = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/"
for i in range(1, 233):
    filename = f"seg-{i}-f1-a1.m4s"
    url = os.path.join(BASE_URL, filename)
    response = requests.get(url)
    with open(os.path.join('video/audio_segments', filename), 'wb') as out_file:
        out_file.write(response.content)

### Generate mp4 with audio

In [None]:
# run bash script 'create_clips.sh' in examples/video/video_segments
# add audio segments to /video folder

In [None]:
ffmpeg -i "concat:../init-f3-v1.mp4|seg-1-f3-v1.m4s" -i "concat:../init-f1-a1.mp4|../seg-1-f1-a1.m4s" -c copy -map 0:v -map 1:a ../output_segment1_with_audio.mp4

### Download subtitles

In [None]:
BASE_URL = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/"
for i in range(1, 233):
    filename = f"seg-{i}-f6.vtt"
    url = os.path.join(BASE_URL, filename)
    response = requests.get(url)
    with open(os.path.join('video/subtitle_segments', filename), 'wb') as out_file:
        out_file.write(response.content)

# ---> MAKE CODE PARALLEL + ASYNCIO

# Parse subtitles to find relevant topic passages

In [4]:
transcript = []
subtitle_segments = []
video_segments = []
audio_segments = []
seen = set()
for i in range(1, 233):
    filename = f'./video/subtitle_segments/seg-{i}-f6.vtt'
    
    captions = webvtt.read(filename)
    
    for caption in captions:
        print("START: ", caption.start)
        print("END: ", caption.end)
        print("TEXT: ", caption.text)
        print("----------")
        if caption.text not in seen and not seen.add(caption.text):
            transcript.append(caption)
            subtitle_segments.append(filename)
            video_segments.append(f'./video/video_segments/seg-{i}-f3-v1.m4s')
            audio_segments.append(f'./video/subtitle_segments/seg-{i}-f1-a1.m4s')

START:  00:00:00.000
END:  00:00:13.948
TEXT:  ...
----------
START:  00:00:00.000
END:  00:00:13.948
TEXT:  ...
----------
START:  00:00:00.000
END:  00:00:13.948
TEXT:  ...
----------
START:  00:00:00.000
END:  00:00:13.948
TEXT:  ...
----------
START:  00:00:00.000
END:  00:00:13.948
TEXT:  ...
----------
START:  00:00:14.228
END:  00:00:16.428
TEXT:  -Mesdames, Messieurs,
bonsoir et bienvenue.
----------
START:  00:00:14.228
END:  00:00:16.428
TEXT:  -Mesdames, Messieurs,
bonsoir et bienvenue.
----------
START:  00:00:16.508
END:  00:00:18.868
TEXT:  Insultes, menaces et intimidation,
----------
START:  00:00:18.948
END:  00:00:22.228
TEXT:  c'est ce que vivent
les étudiants de l'EPLF
----------
START:  00:00:22.308
END:  00:00:25.348
TEXT:  qui émettent un avis contraire
au mouvement propalestinien.
----------
START:  00:00:22.308
END:  00:00:25.348
TEXT:  qui émettent un avis contraire
au mouvement propalestinien.
----------
START:  00:00:25.428
END:  00:00:31.388
TEXT:  Le prési

MalformedFileError: Invalid format

In [5]:
print(len(transcript))

459


In [6]:
subtitle_segments

['./video/subtitle_segments/seg-1-f6.vtt',
 './video/subtitle_segments/seg-5-f6.vtt',
 './video/subtitle_segments/seg-6-f6.vtt',
 './video/subtitle_segments/seg-6-f6.vtt',
 './video/subtitle_segments/seg-6-f6.vtt',
 './video/subtitle_segments/seg-7-f6.vtt',
 './video/subtitle_segments/seg-8-f6.vtt',
 './video/subtitle_segments/seg-8-f6.vtt',
 './video/subtitle_segments/seg-8-f6.vtt',
 './video/subtitle_segments/seg-9-f6.vtt',
 './video/subtitle_segments/seg-9-f6.vtt',
 './video/subtitle_segments/seg-9-f6.vtt',
 './video/subtitle_segments/seg-10-f6.vtt',
 './video/subtitle_segments/seg-11-f6.vtt',
 './video/subtitle_segments/seg-11-f6.vtt',
 './video/subtitle_segments/seg-11-f6.vtt',
 './video/subtitle_segments/seg-12-f6.vtt',
 './video/subtitle_segments/seg-12-f6.vtt',
 './video/subtitle_segments/seg-13-f6.vtt',
 './video/subtitle_segments/seg-14-f6.vtt',
 './video/subtitle_segments/seg-14-f6.vtt',
 './video/subtitle_segments/seg-14-f6.vtt',
 './video/subtitle_segments/seg-14-f6.vtt',


In [7]:
video_segments[18]

'./video/video_segments/seg-13-f3-v1.m4s'

In [8]:
video_segments[49]

'./video/video_segments/seg-30-f3-v1.m4s'

In [9]:
for caption in transcript:
    print(caption.text)

...
-Mesdames, Messieurs,
bonsoir et bienvenue.
Insultes, menaces et intimidation,
c'est ce que vivent
les étudiants de l'EPLF
qui émettent un avis contraire
au mouvement propalestinien.
Le président de l'EPFL,
Martin Vetterli, est avec nous.
Une manifestation spectaculaire
contre la géothermie
dans la commune de Haute-Sorne (JU).
Des militants en colère
et des dégâts considérables.
A deux semaines
des élections européennes,
alors qu'on parle
d'un excellent score tout à droite,
l'AfD a été exclu de son groupe
parlementaire à Bruxelles.
Sa tête de liste
enchaîne les dérapages.
Nous percerons un des mystères
qui entourent les pyramides.
Comment ont-elles été construites ?
Il y a une réponse récente.
60 ans et toujours aussi glamour.
L'icône du rock Lenny Kravitz
sort un nouvel album.
Le climat est délétère à l'EPFL
sur fond de guerre
entre Israël et le Hamas.
Les étudiants qui émettent
un avis contraire
au mouvement propalestinien
sont menacés.
Des jeunes ont peur
d'aller sur le campus.


### LLM topic identification

In [11]:
url = "https://il.srgssr.ch/integrationlayer/2.0/mediaComposition/byUrn/urn:rts:video:14926430.json?onlyChapters=false&vector=portalplay"
res = requests.get(url)

In [12]:
metadata = json.loads(res.text)
metadata

{'chapterUrn': 'urn:rts:video:14926430',
 'episode': {'id': '14822978',
  'title': '19h30',
  'publishedDate': '2024-05-24T19:30:00+02:00',
  'imageUrl': 'https://www.rts.ch/2024/05/25/14/48/14926429.image/16x9',
  'imageTitle': '19h30 [RTS]'},
 'show': {'id': '105932',
  'vendor': 'RTS',
  'transmission': 'TV',
  'urn': 'urn:rts:show:tv:105932',
  'title': '19h30',
  'lead': "L'édition du soir du téléjournal.",
  'imageUrl': 'https://www.rts.ch/2023/09/28/17/48/14256748.image/16x9',
  'imageTitle': 'RTS Info - Le 19h30 [RTS]',
  'bannerImageUrl': 'https://www.rts.ch/2023/09/28/17/48/14256748.image/3x1',
  'posterImageUrl': 'https://www.rts.ch/2023/08/22/16/37/14257151.image/2x3',
  'posterImageIsFallbackUrl': False,
  'primaryChannelId': '143932a79bb5a123a646b68b1d1188d7ae493e5b',
  'primaryChannelUrn': 'urn:rts:channel:tv:143932a79bb5a123a646b68b1d1188d7ae493e5b',
  'availableAudioLanguageList': [{'locale': 'fr', 'language': 'Français'}],
  'availableVideoQualityList': ['SD', 'HD'],


In [13]:
segment_list = metadata["chapterList"][0]["segmentList"]
segment_list

[{'id': '14926404',
  'mediaType': 'VIDEO',
  'vendor': 'RTS',
  'urn': 'urn:rts:video:14926404',
  'title': "À l'EPFL, des étudiants pro-palestiniens mais modérés, menacés par les plus radicaux",
  'imageUrl': 'https://www.rts.ch/2024/05/25/14/48/14926395.image/16x9',
  'imageTitle': "À l'EPFL, des étudiants pro-palestiniens mais modérés, menacés par les plus radicaux [RTS]",
  'type': 'CLIP',
  'date': '2024-05-24T19:30:00+02:00',
  'duration': 137200,
  'validFrom': '2024-05-24T20:04:12+02:00',
  'playableAbroad': True,
  'displayable': True,
  'fullLengthUrn': 'urn:rts:video:14926430',
  'position': 1,
  'noEmbed': False,
  'analyticsMetadata': {'media_segment': "À l'EPFL, des étudiants pro-palestiniens mais modérés, menacés par les plus radicaux",
   'media_type': 'Video',
   'media_segment_id': '14926404',
   'media_episode_length': '1831',
   'media_segment_length': '137',
   'media_number_of_segment_selected': '1',
   'media_number_of_segments_total': '13',
   'media_duration_c

In [14]:
for i, segment in enumerate(segment_list):
    print(f"{i}: ", segment["title"])

0:  À l'EPFL, des étudiants pro-palestiniens mais modérés, menacés par les plus radicaux
1:  Le président de l'EPFL Martin Vetterli revient sur le sentiment de peur ressenti par certains étudiants de l'établissement, critiques au mouvement pro-palestinien
2:  Haute-Sorne (JU): la manifestation contre la géothermie profonde a dégénéré
3:  Drogue: un deuxième espace de consommation sécurisé ouvre à Lausanne
4:  Le journaliste Yoan Rithner revient sur la grève entamée vendredi par le personnel de Vetropack à Saint-Prex dans le canton de Vaud
5:  À Neuchâtel, un ex-entraîneur de football amateur condamné pour abus sexuels sur des enfants
6:  Un candidat de l'AFD aux élections européennes crée le malaise avec des propos choquants
7:  Isabelle Ory, correspondante de la RTS auprès de l'UE, analyse l'impact de l'exclusion de l'Afd pour l'extrême droite, après les déclarations chocs de l'un des candidats du parti allemand
8:  Une semaine avant la fin de son procès à New York, le candidat républ

In [15]:
clip_topics = {i:segment["title"] for i, segment in enumerate(segment_list)}
clip_topics

{0: "À l'EPFL, des étudiants pro-palestiniens mais modérés, menacés par les plus radicaux",
 1: "Le président de l'EPFL Martin Vetterli revient sur le sentiment de peur ressenti par certains étudiants de l'établissement, critiques au mouvement pro-palestinien",
 2: 'Haute-Sorne (JU): la manifestation contre la géothermie profonde a dégénéré',
 3: 'Drogue: un deuxième espace de consommation sécurisé ouvre à Lausanne',
 4: 'Le journaliste Yoan Rithner revient sur la grève entamée vendredi par le personnel de Vetropack à Saint-Prex dans le canton de Vaud',
 5: 'À Neuchâtel, un ex-entraîneur de football amateur condamné pour abus sexuels sur des enfants',
 6: "Un candidat de l'AFD aux élections européennes crée le malaise avec des propos choquants",
 7: "Isabelle Ory, correspondante de la RTS auprès de l'UE, analyse l'impact de l'exclusion de l'Afd pour l'extrême droite, après les déclarations chocs de l'un des candidats du parti allemand",
 8: "Une semaine avant la fin de son procès à New

In [16]:
system_prompt = """You are an expert at identifying topics related to: "Palestine".

This topic can be explored through multiple angles, including but not limited to:

    - Israel-Hamas war
    - Israel
    - Hamas
    - PLO
    - Palestine
    - Gaza
    - West Bank
    - Occupied Territories

Given the dictionary of titles below, your task is to identify keys that reference the topic and its angles. Return a list of keys refering to "Palestine".
RETURN ONLY A LIST OF INT.

**Example**: 

TITLES: {{0: "À l'EPFL, des étudiants pro-palestiniens mais modérés, menacés par les plus radicaux",
 1: "Le président de l'EPFL Martin Vetterli revient sur le sentiment de peur ressenti par certains étudiants de l'établissement, critiques au mouvement pro-palestinien",
 2: 'Haute-Sorne (JU): la manifestation contre la géothermie profonde a dégénéré',
 3: 'Drogue: un deuxième espace de consommation sécurisé ouvre à Lausanne',
 4: 'Le journaliste Yoan Rithner revient sur la grève entamée vendredi par le personnel de Vetropack à Saint-Prex dans le canton de Vaud',
 5: 'À Neuchâtel, un ex-entraîneur de football amateur condamné pour abus sexuels sur des enfants',
 6: "Un candidat de l'AFD aux élections européennes crée le malaise avec des propos choquants",
 7: "Isabelle Ory, correspondante de la RTS auprès de l'UE, analyse l'impact de l'exclusion de l'Afd pour l'extrême droite, après les déclarations chocs de l'un des candidats du parti allemand",
 8: "Une semaine avant la fin de son procès à New York, le candidat républicain Donald Trump s'est rendu dans le Bronx pour y chercher des électeurs",
 9: '"Comment ça va la Suisse?", une enquête comme un dialogue sur un quai de gare',
 10: 'Triste tigre", ovni littéraire de Neige Sinno, primé par le choix Goncourt de la Suisse',
 11: "Un bras du Nil aujourd'hui asséché aurait servi de voie d'accès pour construire les pyramides",
 12: 'Le musicien américain Lenny Kravitz a sorti vendredi son douzième album "Blue Electric Light", qui allie le glamour et le rock, principales caractéristiques de l\'artiste'}}
    
TOPICS:[0, 1]

TITLES: {text}
TOPICS:"""

In [17]:
messages = [{"role": "system", "content": system_prompt.format(text=clip_topics)},]
topic_keys = llm_client.chat.completions.create(
                model="gpt-4o",
                stream=False,
                temperature=0,
                top_p=0.95,
                max_tokens=4096,
                messages=messages
            ).choices[0].message.content
print(topic_keys)  

[0, 1]


In [18]:
topic_keys = ast.literal_eval(topic_keys)

In [19]:
transcript_ts = [caption.end_in_seconds for caption in transcript]
transcript_ts

[13,
 16,
 18,
 22,
 25,
 31,
 34,
 37,
 41,
 44,
 47,
 51,
 54,
 61,
 63,
 66,
 69,
 72,
 80,
 82,
 85,
 87,
 97,
 102,
 105,
 108,
 113,
 115,
 123,
 125,
 130,
 133,
 137,
 145,
 148,
 150,
 154,
 158,
 162,
 172,
 175,
 178,
 181,
 184,
 195,
 199,
 201,
 204,
 211,
 215,
 221,
 227,
 234,
 237,
 241,
 244,
 246,
 250,
 254,
 258,
 261,
 265,
 272,
 277,
 281,
 285,
 290,
 292,
 296,
 299,
 304,
 308,
 310,
 314,
 317,
 323,
 327,
 330,
 335,
 339,
 343,
 347,
 349,
 353,
 355,
 361,
 366,
 372,
 375,
 378,
 384,
 387,
 393,
 398,
 402,
 406,
 409,
 415,
 420,
 424,
 428,
 431,
 435,
 439,
 443,
 445,
 448,
 451,
 458,
 460,
 465,
 467,
 469,
 471,
 473,
 479,
 482,
 484,
 489,
 492,
 494,
 496,
 498,
 502,
 504,
 510,
 515,
 519,
 522,
 525,
 528,
 532,
 536,
 543,
 547,
 552,
 555,
 558,
 562,
 564,
 567,
 572,
 574,
 578,
 583,
 588,
 591,
 593,
 596,
 599,
 604,
 607,
 610,
 612,
 614,
 615,
 621,
 624,
 628,
 630,
 633,
 640,
 649,
 652,
 656,
 658,
 660,
 664,
 667,
 670,
 67

In [41]:
transcript

[<Caption start='00:00:00.000' end='00:00:13.948' text='...' identifier=None>,
 <Caption start='00:00:14.228' end='00:00:16.428' text='-Mesdames, Messieurs,\\nbonsoir et bienvenue.' identifier=None>,
 <Caption start='00:00:16.508' end='00:00:18.868' text='Insultes, menaces et intimidation,' identifier=None>,
 <Caption start='00:00:18.948' end='00:00:22.228' text="c'est ce que vivent\\nles étudiants de l'EPLF" identifier=None>,
 <Caption start='00:00:22.308' end='00:00:25.348' text='qui émettent un avis contraire\\nau mouvement propalestinien.' identifier=None>,
 <Caption start='00:00:25.428' end='00:00:31.388' text="Le président de l'EPFL,\\nMartin Vetterli, est avec nous." identifier=None>,
 <Caption start='00:00:32.148' end='00:00:34.788' text='Une manifestation spectaculaire' identifier=None>,
 <Caption start='00:00:34.868' end='00:00:37.549' text='contre la géothermie\\ndans la commune de Haute-Sorne (JU).' identifier=None>,
 <Caption start='00:00:37.589' end='00:00:41.189' text='D

In [44]:
transcript[18].start

'00:01:16.150'

212

In [49]:
start = transcript[relevant_clip_topics[key]["start_idx"]].start_in_seconds
[f"start: {x.start_in_seconds - start} end: {x.end_in_seconds - start} text: {x.text}" for x in transcript[relevant_clip_topics[key]["start_idx"]:relevant_clip_topics[key]["end_idx"]]]

['start: 0 end: 3 text: -Martin Vetterli, bonsoir.',
 'start: 3 end: 9 text: Le Temps révèle que des étudiants\nde votre établissement',
 'start: 9 end: 15 text: sont harcelés car ils pensent\ndifféremment, et parlent de paix.',
 'start: 15 end: 22 text: Un étudiant est menacé de mort\nsur les réseaux sociaux:',
 'start: 22 end: 25 text: "il faut tuer Nicolas et son gang".',
 'start: 25 end: 29 text: Une autre est intimidée:\n"on a vu où tu étais assise".',
 'start: 29 end: 32 text: Quelle est votre réaction ?',
 'start: 32 end: 34 text: - Je suis très triste',
 "start: 34 end: 38 text: qu'il y a une telle tension\nsur le campus.",
 'start: 38 end: 42 text: Ce sont les excès\ndes réseaux sociaux.',
 "start: 42 end: 46 text: Si on n'est pas dans le cadre légal,",
 "start: 46 end: 49 text: par rapport aux lois\nsur le racisme et l'antisémitisme,",
 'start: 49 end: 53 text: on donne\nle support à notre communauté.',
 "start: 53 end: 60 text: Nos étudiants ont des canaux où ils\npeuvent ve

In [55]:
relevant_clip_topics = {}
for key in topic_keys:
    relevant_clip_topics[key] = {"title": clip_topics[key]}
    relevant_clip_topics[key]["markIn"] = segment_list[key]["markIn"]/1000
    relevant_clip_topics[key]["markOut"] = segment_list[key]["markOut"]/1000
    idx = [i for i, ts in enumerate(transcript_ts) if relevant_clip_topics[key]["markIn"] <= ts <= relevant_clip_topics[key]["markOut"]]
    relevant_clip_topics[key]["start_idx"] = idx[0]
    relevant_clip_topics[key]["end_idx"] = idx[-1]+1
    relevant_clip_topics[key]["transcript"] = "".join([x.text for x in transcript[relevant_clip_topics[key]["start_idx"]:relevant_clip_topics[key]["end_idx"]]])
    start = transcript[relevant_clip_topics[key]["start_idx"]].start_in_seconds - 6
    relevant_clip_topics[key]["transcript_ts"] = "\n".join([f"start: {x.start_in_seconds - start} end: {x.end_in_seconds - start} text: {x.text}" for x in transcript[relevant_clip_topics[key]["start_idx"]:relevant_clip_topics[key]["end_idx"]]])
    relevant_clip_topics[key]["video_start"] = video_segments[relevant_clip_topics[key]["start_idx"]]
    relevant_clip_topics[key]["video_end"] = video_segments[relevant_clip_topics[key]["end_idx"]]
    relevant_clip_topics[key]["video_start_idx"] = int(relevant_clip_topics[key]["video_start"].split("seg-")[-1].split("-")[0])
    relevant_clip_topics[key]["video_end_idx"] = int(relevant_clip_topics[key]["video_end"].split("seg-")[-1].split("-")[0])
    relevant_clip_topics[key]["audio_start"] = audio_segments[relevant_clip_topics[key]["start_idx"]]
    relevant_clip_topics[key]["audio_end"] = audio_segments[relevant_clip_topics[key]["end_idx"]]
    relevant_clip_topics[key]["subtitle_start"] = subtitle_segments[relevant_clip_topics[key]["start_idx"]]
    relevant_clip_topics[key]["subtitle_end"] = subtitle_segments[relevant_clip_topics[key]["end_idx"]]

relevant_clip_topics

{0: {'title': "À l'EPFL, des étudiants pro-palestiniens mais modérés, menacés par les plus radicaux",
  'markIn': 75.52,
  'markOut': 212.72,
  'start_idx': 18,
  'end_idx': 49,
  'transcript': 'Le climat est délétère à l\'EPFLsur fond de guerre\nentre Israël et le Hamas.Les étudiants qui émettent\nun avis contraireau mouvement propalestinien\nsont menacés.Des jeunes ont peur\nd\'aller sur le campus.-Si ce groupe de discussion\nsur la guerre au Moyen-Orientpar des étudiants de l\'EPFL\na été depuis supprimé,il en subsiste\nces captures d\'écran.Des menaces de mort dirigées\ncontre des étudiantsqui souhaitaient apaiser\ndes tensions.Ces révélations du journal Le Tempsdécrivent une radicalisation\nsur le campus de l\'EPFL.Cet étudiant visé ne s\'affole pas,\nmais avoue être sur ses gardes.-Une limite a été franchieparce que c\'est un événement\nqui n\'a jamais été vu sur le campus.On a peur,\non regarde un peu derrière nous.-La direction de l\'EPFL\norganisait cet après-midiune discussio

# create clips

In [56]:
import subprocess
import os

bash_script_content = '''#!/bin/bash

# Loop from 1 to 107
for i in {1..107}
do
  # Use ffmpeg to create separate video with audio
  ffmpeg -i "concat:video/init-f3-v1.mp4|video/video_segments/seg-$i-f3-v1.m4s" -i "concat:video/init-f1-a1.mp4|video/audio_segments/seg-$i-f1-a1.m4s" -c copy -map 0:v -map 1:a "video/clips/output_segment${i}_with_audio.mp4" -nostdin

  # Optionally print progress
  echo "Created output_segment${i}_with_audio.mp4"
done
'''

# Write the bash script to a file
with open('video/video_segments/script.sh', 'w') as file:
    file.write(bash_script_content)

In [35]:
# Ensure the script is executable
os.chmod('video/video_segments/script.sh', 0o775)

# Run the bash script using subprocess
result = subprocess.run(['video/video_segments/script.sh'], capture_output=True, text=True)

# Print the output and errors if any
print(result.stdout)
print(result.stderr)

Created output_segment1_with_audio_and_subs.mp4
Created output_segment2_with_audio_and_subs.mp4
Created output_segment3_with_audio_and_subs.mp4
Created output_segment4_with_audio_and_subs.mp4
Created output_segment5_with_audio_and_subs.mp4
Created output_segment6_with_audio_and_subs.mp4
Created output_segment7_with_audio_and_subs.mp4
Created output_segment8_with_audio_and_subs.mp4
Created output_segment9_with_audio_and_subs.mp4
Created output_segment10_with_audio_and_subs.mp4
Created output_segment11_with_audio_and_subs.mp4
Created output_segment12_with_audio_and_subs.mp4
Created output_segment13_with_audio_and_subs.mp4
Created output_segment14_with_audio_and_subs.mp4
Created output_segment15_with_audio_and_subs.mp4
Created output_segment16_with_audio_and_subs.mp4
Created output_segment17_with_audio_and_subs.mp4
Created output_segment18_with_audio_and_subs.mp4
Created output_segment19_with_audio_and_subs.mp4
Created output_segment20_with_audio_and_subs.mp4
Created output_segment21_with

In [36]:
idx_keep = []
for clip in relevant_clip_topics.values():
    idx_keep.extend([i for i in range(clip["video_start_idx"], clip["video_end_idx"]+1)])

In [37]:
for clip in glob.glob("video/clips/*.mp4"):
    idx = int(re.findall(r'segment(\d+)', clip)[0])
    if idx not in idx_keep:
        os.remove(clip)

### Merge clips by topic

In [40]:
import ffmpeg

for i, clip in enumerate(relevant_clip_topics.values()):

    # List of video clips to merge
    input_clips = [f"video/clips/output_segment{i}_with_audio_and_burned_subs.mp4" for i in range(clip["video_start_idx"], clip["video_end_idx"]+1)]
    
    # Create a file containing the list of video clips in the format required by FFmpeg
    with open('file_list.txt', 'w') as f:
        for i_clip in input_clips:
            f.write(f"file '{i_clip}'\n")

    # Use FFmpeg to merge the clips
    (
        ffmpeg
        .input('./file_list.txt', format='concat', safe=0)  # 'concat' format for concatenation
        .output(f'video/final_output/output_{i}.mp4', c='copy')  # Output file
        .run()
    )

    print("Merging complete!")

ffmpeg version 7.0.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.3.9.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.0.1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --

Merging complete!
Merging complete!


ffmpeg version 7.0.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.3.9.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.0.1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --

# Identify tactics

In [57]:
system_prompt = """

In the context of protecting the agenda of those in power, the term "chien de garde" (or "guard dog") in journalism refers to media outlets or journalists who act in the interests of the powerful, rather than serving the public. This concept is the opposite of the traditional watchdog role, as these "guard dogs" may:

- **Promote the interests of the powerful**: They may prioritize stories that align with the agendas of those in power, often neglecting or downplaying issues that could be detrimental to these interests.

- **Suppress critical information**: They may ignore, underreport, or discredit information and perspectives that could challenge or undermine the status quo or the power of influential entities.

- **Propagate propaganda**: They might disseminate biased or misleading information to shape public opinion in favor of the powerful, often blurring the line between journalism and public relations.

- **Maintain the status quo**: By focusing on narratives that reinforce existing power structures and social hierarchies, they contribute to the preservation of these structures.

This concept critiques the potential for media to become complicit in maintaining power dynamics and failing to serve the broader public interest. It highlights the tension between journalistic integrity and the influence of powerful stakeholders on media content and practices.

Media outlets or journalists who act as "chiens de garde" (guard dogs) protecting the agenda of the powerful may use various tactics to shape public perception and influence narratives. Here are some of the common tactics employed:

1. **Passive Voice**

Purpose: To obscure the subject responsible for an action, thereby diminishing accountability.

Example: "Mistakes were made" instead of "The company made mistakes."
   
3. **Guided Questions**

Purpose: To lead the audience to a predetermined conclusion or to frame an issue in a specific way.

Example: "Don't you think that the new policy will benefit the economy?" instead of "What are the potential impacts of the new policy?"

3. **Selective Reporting**

Purpose: To highlight certain information while ignoring other relevant facts, shaping the narrative to favor the powerful.

Example: Focusing extensively on the economic benefits of a new law while ignoring its negative social impacts.

4. **Euphemisms**

Purpose: To downplay or soften the perception of negative actions or outcomes.

Example: Using "collateral damage" to refer to civilian casualties in a military operation.

5. **Framing**

Purpose: To present information in a way that influences interpretation and opinion.

Example: Describing a protest as a "riot" versus a "demonstration" can evoke different responses from the audience.

6. **Omission**

Purpose: To leave out crucial information that might present the powerful in a negative light.

Example: Reporting on a company's financial success without mentioning its environmental violations.

7. **Cherry-Picking Data**

Purpose: To select specific data points that support a particular narrative while ignoring data that contradicts it.

Example: Highlighting a temporary drop in unemployment rates without discussing the overall trend of increasing job insecurity.

8. **Loaded Language**

Purpose: To use emotionally charged words to influence audience perception.

Example: Referring to a policy as "radical" or "extreme" to create a negative impression.

9. **Ad Hominem Attacks**

Purpose: To discredit critics or opposition by attacking their character rather than addressing their arguments.

Example: Labeling a whistleblower as "disgruntled" or "unreliable" instead of addressing the substance of their claims.

10. **False Balance**

Purpose: To present two sides of an issue as equally valid, even when the evidence heavily supports one side.

Example: Giving equal time to climate change scientists and climate change deniers, despite the overwhelming scientific consensus.

11. **Sensationalism**

Purpose: To use shocking or sensational headlines and stories to attract attention, often at the expense of accuracy and depth.

Example: "Government Under Siege by Angry Mobs" instead of "Peaceful Protesters Rally for Change."

12. **Astroturfing**

Purpose: To create the illusion of grassroots support for a cause that is actually orchestrated by powerful interests.

Example: Coordinating fake social media campaigns to simulate public support for a controversial policy.

13. **Repetition**

Purpose: To reinforce a particular message or narrative by repeating it frequently.

Example: Constantly repeating a slogan or catchphrase associated with a policy or leader to embed it in the public consciousness.

14. **Appeal to Authority**

Purpose: To strengthen an argument by citing authoritative figures or institutions that support the powerful's agenda.

Example: "Experts agree that this policy is necessary," without specifying which experts or the basis of their agreement.

15. **False Equivalence**

Purpose: To present two unequal things as being equivalent, often to downplay the negative aspects of one side.

Example: Comparing minor ethical lapses of a critic to major scandals involving those in power.

16. **Scapegoating**

Purpose: To divert blame or criticism by attributing problems to a convenient target, often a minority group or external entity.

Example: Blaming economic woes on immigrants or foreign competitors rather than domestic policy failures.

17. **Straw Man Argument**

Purpose: To misrepresent or oversimplify an opponent's position, making it easier to attack.

Example: "Opponents of this policy want to abandon progress," ignoring the nuanced arguments of critics.

18. **Glittering Generalities**

Purpose: To use vague, emotionally appealing phrases that lack substantive content but create a positive impression.

Example: Promising "a brighter future" without providing specific plans or actions.

19. **Deflection**

Purpose: To avoid addressing uncomfortable questions or criticisms by shifting the focus to a different issue.

Example: When asked about corruption, a politician responds by talking about their achievements in a completely different area.

20. **Manufactured Outrage**

Purpose: To generate emotional responses over trivial or irrelevant issues to distract from more important matters.

Example: Focusing media attention on a celebrity scandal during a significant political crisis.

21. **Control of Information Flow**

Purpose: To limit access to information by controlling who gets to speak, what gets reported, and how it's presented.

Example: Only allowing friendly journalists access to press conferences and events.

22. **Echo Chambers**

Purpose: To reinforce existing beliefs by ensuring that people are exposed only to information that confirms their views.

Example: Media outlets tailoring content to fit the biases and preferences of their audience, creating a feedback loop.

23. **Tokenism**

Purpose: To give the appearance of inclusivity or reform by making superficial changes while maintaining the status quo.

Example: Highlighting a few minority voices or implementing minor policy tweaks while ignoring systemic issues.

24. **Image Management**

Purpose: To carefully craft and control the public image of those in power through PR campaigns, selective appearances, and staged events.

Example: Politicians visiting schools and hospitals for photo ops while their policies may harm those very institutions.

25. **Agenda Setting**

Purpose: To shape the public agenda by prioritizing certain topics over others, thereby influencing what people think about.

Example: Giving extensive coverage to economic growth while underreporting environmental degradation.

26. **Distraction Techniques**

Purpose: To divert attention from important issues by focusing on trivial or sensational topics.

Example: Extensive coverage of celebrity gossip during critical political events.

27. **False Consensus**

Purpose: To create the illusion that the majority of people support a particular viewpoint or policy.

Example: Polling data presented in a misleading way to suggest overwhelming public support.

28. **Minimization**

Purpose: To downplay the severity or significance of an issue or event.

Example: Referring to a major environmental disaster as a "minor spill."

29. **Use of Flawed Studies**

Purpose: To present biased or poorly conducted research as credible to support a specific agenda.

Example: Citing industry-funded studies that show no harm from a product known to be dangerous.

30. **Smokescreening**

Purpose: To obscure the truth by overwhelming the audience with irrelevant information.

Example: Flooding news cycles with minor updates to drown out significant but unfavorable reports.

31. **Appeal to Emotion**

Purpose: To manipulate audience feelings rather than using logical arguments.

Example: Using heart-wrenching stories to garner support for a policy without discussing its broader implications.

32. **Rebranding Negative Concepts**

Purpose: To change the language used to describe a controversial policy to make it more palatable.

Example: Referring to "tax cuts for the wealthy" as "tax relief."

33. **Hasty Generalizations**

Purpose: To make broad claims based on limited evidence to mislead the audience.

Example: Using an isolated incident to justify widespread policy changes.

34. **Marginalization**

Purpose: To sideline or discredit voices that challenge the dominant narrative.

Example: Labeling critics as "fringe" or "extremists" to delegitimize their viewpoints.

35. **Overemphasis on Negatives of Opponents**

Purpose: To focus disproportionately on the flaws or mistakes of those who oppose the agenda of the powerful.

Example: Highlighting every minor gaffe of a political opponent while ignoring substantive policy issues.

36. **Manufactured Statistics**

Purpose: To present misleading or cherry-picked data to support a narrative.

Example: Using selective unemployment statistics that don't account for underemployment or workforce participation rates.

37. **Strategic Ambiguity**

Purpose: To be deliberately vague to avoid accountability or detailed scrutiny.

Example: Announcing a policy with general statements without specifying the concrete measures or implications.

38. **Discrediting Whistleblowers**

Purpose: To undermine the credibility of those who expose misconduct.

Example: Portraying whistleblowers as disgruntled employees or traitors.

39. **False Dilemma**

Purpose: To present a situation as having only two choices, ignoring other viable options.

Example: "Either we implement this policy or face economic ruin," disregarding alternative solutions.

40. **Echoing Talking Points**

Purpose: To repeat specific phrases or ideas consistently across different media outlets to solidify them in public discourse.

Example: Coordinated use of certain buzzwords or slogans in both news articles and political speeches.

41. **Misleading Headlines**

Purpose: To attract attention with sensational or biased headlines that may not accurately reflect the content of the article.

Example: A headline that exaggerates or distorts the findings of a report.

42. **Conflation**

Purpose: To combine two unrelated issues to confuse the audience or obscure the truth.

Example: Linking unrelated policy failures to a particular political ideology to create a false association.

43. **False Balance in Scientific Reporting**

Purpose: To give equal weight to scientific consensus and fringe theories, misleading the public about the level of disagreement.

Example: Presenting climate change deniers' views alongside those of the vast majority of scientists without context.

44. **Strategic Leaks**

Purpose: To release information selectively to control the narrative or distract from other issues.

Example: Releasing favorable news just before a damaging report is due to be published.

45. **Creating Pseudo-Events**

Purpose: To stage events specifically designed to attract media coverage and shape public perception.

Example: Politicians staging visits to disaster areas or community centers to showcase their concern and involvement.

46. **Creating Scapegoats**

Purpose: To divert blame for societal problems onto specific individuals or groups.

Example: Blaming economic downturns on immigrants or minority groups rather than systemic issues.

47. **Amplification of Fear**

Purpose: To use fear to manipulate public opinion and justify certain actions or policies.

Example: Highlighting rare but sensational crimes to justify increased security measures.

48. **Disinformation Campaigns**

Purpose: To deliberately spread false information to confuse or mislead the public.

Example: Spreading fake news or conspiracy theories to discredit opposition or distract from real issues.

49. **Undercover Advertising (Native Advertising)**

Purpose: To present advertisements as genuine news content, blurring the lines between editorial and commercial content.

Example: Sponsored content that looks like a news article but promotes a product or viewpoint.

50. **Use of Surrogates**

Purpose: To have third parties or pundits express views that align with the agenda, providing a veneer of independence.

Example: Using think-tank experts or analysts who have ties to powerful interests to present biased opinions as objective analysis.

51. **Policy Laundering**

Purpose: To introduce controversial policies through less visible channels or secondary legislation to avoid public scrutiny.

Example: Embedding significant policy changes within larger, less controversial bills.

52. **Red Herring**

Purpose: To introduce irrelevant information to distract from the main issue.

Example: Bringing up past scandals or unrelated issues during discussions of current policy failures.

53. **False Attribution**

Purpose: To attribute statements or statistics to credible sources that did not actually provide them.

Example: Claiming that "studies show" a certain fact without referencing specific, credible studies.

54. **Misrepresentation of Opponents’ Views**

Purpose: To distort or mischaracterize the arguments of opponents to make them easier to refute.

Example: Oversimplifying a critic’s nuanced argument to make it seem unreasonable.

55. **Emotional Appeals to Tradition**

Purpose: To argue that certain policies or actions are justified because they are traditional or have always been done that way.

Example: Defending a discriminatory practice by citing historical precedent without addressing current ethical standards.

56. **Pseudo-Expertise**

Purpose: To present individuals with dubious qualifications as experts to lend credibility to a specific narrative.

Example: Featuring "experts" with no relevant background or education to support controversial claims.

57. **Selective Quoting**

Purpose: To quote out of context to misrepresent someone's statement or position.

Example: Using a snippet from a speech that changes its intended meaning.

58. **Deflection with Humor or Satire**

Purpose: To use humor to trivialize serious issues or criticism.

Example: Making jokes about a serious allegation to downplay its significance and avoid addressing it substantively.

59. **Overreliance on Official Sources**

Purpose: To use statements from authorities without questioning or independently verifying them.

Example: Reporting government statements as fact without investigating or seeking alternative viewpoints.

60. **Sensationalizing Successes**

Purpose: To exaggerate the achievements of those in power to build a positive public image.

Example: Extensively covering minor policy successes while ignoring major failures.

61. **Personalization**

Purpose: To focus on individual stories or personalities to distract from systemic issues.

Example: Highlighting a single individual’s success story to imply that systemic barriers do not exist.

62. **Credentialism**

Purpose: To emphasize credentials over the validity of arguments or evidence.

Example: Dismissing valid critiques by pointing out the critic's lack of formal qualifications.

63. **False Clarity**

Purpose: To present complex issues as simple and straightforward to manipulate public understanding.

Example: Reducing a complicated policy debate to a simplistic dichotomy.

64. **Creating Urgency**

Purpose: To invoke a sense of urgency to push through policies without thorough debate or consideration.

Example: Framing a policy decision as needing immediate action to avoid disaster, thereby limiting public discourse.

65. **Using Anecdotal Evidence**

Purpose: To use personal stories or isolated incidents to make broad generalizations.

Example: Using a single positive outcome to argue for a policy’s effectiveness, ignoring broader statistical data.

66. **Normalization of Corruption**

Purpose: To present corrupt practices as standard and unavoidable in order to desensitize the public.

Example: Reporting on corruption as a commonplace and expected part of political life, reducing public outrage.

67. **Demonizing Critics**

Purpose: To portray critics as untrustworthy, malicious, or extremist.

Example: Labeling activists or whistleblowers as anti-progress or enemies of the state.

68. **Creating False Precedents**

Purpose: To cite non-existent or misrepresented precedents to justify current actions.

Example: Claiming that a policy has historical backing when it does not, or misrepresenting past events.

These tactics can subtly manipulate public perception, maintaining the power of influential entities while undermining the role of the media as an independent watchdog.


"""

In [75]:
def generate_text(prompt):
    response = llm_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=4096,  # Adjust this value based on your needs
        n=1,
        stop=None,
        temperature=0,  # Adjust the temperature to control the creativity of the output
    )
    
    return response.choices[0].message.content

# TO DO: add timestamps of tactics for image caption text generation alignment

# TO DO: add tag of people speaking (présentatrice, invité)

In [76]:
tactics = []
for clip in relevant_clip_topics.values():

    # Define the prompt
    prompt = f"""YOU MUST NOW EXTRACT ANY TACTIC YOU FIND IN THE NEWS TRANSCRIPT PROVIDED BELOW. BE VERY SPECIFIC AND ONLY MENTION A TACTIC IF IT IS CLEARLY PRESENT IN THE TRANSCRIPT.
    ALWAYS TAKE INTO ACCOUNT THE PRESENTATOR'S WAY OF ASKING QUESTIONS AS WELL AS THE GUEST's ANSWER IF APPLICABLE (are they oriented?, etc.).
    Base your extraction on the "text" field of the transcript. Consider previous and following lines to extract tactics.
    Output format: Output a list of dict with fields:
        - {{'tactic': extracted tactic,
            'passage': exact retranscription of passage containing tactic,
            'start': start value (int) of passage,
            'end': end value (int) of passage,
            'explanation': why the tactic was extracted
            }}
    
    NEWS TRANSCRIPT: {clip["transcript_ts"]}
    TACTICS:"""

    # Generate the text
    generated_text = generate_text(prompt)
    
    print(clip["transcript_ts"])
    print(generated_text)
    print("-------------")
    tactics.append(generated_text)

start: 6 end: 10 text: Le climat est délétère à l'EPFL
start: 10 end: 12 text: sur fond de guerre
entre Israël et le Hamas.
start: 13 end: 15 text: Les étudiants qui émettent
un avis contraire
start: 15 end: 17 text: au mouvement propalestinien
sont menacés.
start: 17 end: 27 text: Des jeunes ont peur
d'aller sur le campus.
start: 29 end: 32 text: -Si ce groupe de discussion
sur la guerre au Moyen-Orient
start: 32 end: 35 text: par des étudiants de l'EPFL
a été depuis supprimé,
start: 35 end: 38 text: il en subsiste
ces captures d'écran.
start: 41 end: 43 text: Des menaces de mort dirigées
contre des étudiants
start: 43 end: 45 text: qui souhaitaient apaiser
des tensions.
start: 51 end: 53 text: Ces révélations du journal Le Temps
start: 53 end: 55 text: décrivent une radicalisation
sur le campus de l'EPFL.
start: 55 end: 60 text: Cet étudiant visé ne s'affole pas,
mais avoue être sur ses gardes.
start: 62 end: 63 text: -Une limite a été franchie
start: 63 end: 67 text: parce que c'est

In [77]:
parsed_res = []
for tactic in tactics:
    parsed_res.append(ast.literal_eval(tactic.replace("```json", "").replace("```", "")))

parsed_res

[[{'tactic': 'Sensationalism',
   'passage': "Le climat est délétère à l'EPFL sur fond de guerre entre Israël et le Hamas.",
   'start': 6,
   'end': 12,
   'explanation': "The use of the word 'délétère' (deleterious) and the dramatic context of a war sets a sensational tone, potentially exaggerating the severity of the situation."},
  {'tactic': 'Loaded Language',
   'passage': 'Les étudiants qui émettent un avis contraire au mouvement propalestinien sont menacés.',
   'start': 13,
   'end': 17,
   'explanation': "The phrase 'sont menacés' (are threatened) uses emotionally charged language to evoke a strong reaction from the audience."},
  {'tactic': 'Appeal to Emotion',
   'passage': "Des jeunes ont peur d'aller sur le campus.",
   'start': 17,
   'end': 27,
   'explanation': "This statement appeals to the audience's emotions by highlighting the fear of young students, aiming to elicit sympathy and concern."},
  {'tactic': 'Selective Reporting',
   'passage': 'Des menaces de mort dir

In [78]:
parsed_res[0][0]

{'tactic': 'Sensationalism',
 'passage': "Le climat est délétère à l'EPFL sur fond de guerre entre Israël et le Hamas.",
 'start': 6,
 'end': 12,
 'explanation': "The use of the word 'délétère' (deleterious) and the dramatic context of a war sets a sensational tone, potentially exaggerating the severity of the situation."}

# Add blank space to video frames for tactics display

In [79]:
parsed_res

[[{'tactic': 'Sensationalism',
   'passage': "Le climat est délétère à l'EPFL sur fond de guerre entre Israël et le Hamas.",
   'start': 6,
   'end': 12,
   'explanation': "The use of the word 'délétère' (deleterious) and the dramatic context of a war sets a sensational tone, potentially exaggerating the severity of the situation."},
  {'tactic': 'Loaded Language',
   'passage': 'Les étudiants qui émettent un avis contraire au mouvement propalestinien sont menacés.',
   'start': 13,
   'end': 17,
   'explanation': "The phrase 'sont menacés' (are threatened) uses emotionally charged language to evoke a strong reaction from the audience."},
  {'tactic': 'Appeal to Emotion',
   'passage': "Des jeunes ont peur d'aller sur le campus.",
   'start': 17,
   'end': 27,
   'explanation': "This statement appeals to the audience's emotions by highlighting the fear of young students, aiming to elicit sympathy and concern."},
  {'tactic': 'Selective Reporting',
   'passage': 'Des menaces de mort dir

In [None]:
!pip install --upgrade decorator==4.4.2

In [80]:
from moviepy.editor import VideoFileClip
from PIL import Image, ImageDraw
import numpy as np

# Load the original video
video = VideoFileClip("video/final_output/output_0.mp4")

# Function to add blank space to the right of each frame
def add_blank_space(frame):
    # Get frame dimensions
    height, width, _ = frame.shape
    
    # Create a blank white image with the new width (500 pixels more)
    new_width = width + 500
    new_frame = Image.new('RGB', (new_width, height), (255, 255, 255))
    
    # Convert the original frame to a Pillow image
    original_frame = Image.fromarray(frame)
    
    # Paste the original frame into the new frame (aligned to the left)
    new_frame.paste(original_frame, (0, 0))
    
    # Convert back to a NumPy array
    return np.array(new_frame)

# Apply the function to every frame in the video
new_video = video.fl_image(add_blank_space)

# Set the FPS from the input video, or use a default value (e.g., 25)
fps = video.fps if video.fps else 25

new_video = new_video.set_audio(video.audio)

# Write the modified video to a file, using the fps
new_video.write_videofile("insta/in/output_video_with_blank_space.mp4", codec="libx264", fps=fps, audio_codec="aac")


Moviepy - Building video insta/in/output_video_with_blank_space.mp4.
MoviePy - Writing audio in output_video_with_blank_spaceTEMP_MPY_wvf_snd.mp4


                                                                                                                            

MoviePy - Done.
Moviepy - Writing video insta/in/output_video_with_blank_space.mp4



                                                                                                                            

Moviepy - Done !
Moviepy - video ready insta/in/output_video_with_blank_space.mp4


# Overlay text tactics

In [168]:
from moviepy.editor import VideoFileClip, ImageClip, CompositeVideoClip
from PIL import Image, ImageDraw, ImageFont

def wrap_text(text, font, max_width):
    """
    Wrap text into multiple lines so that it fits within the specified width.
    Parameters:
    - text: str - the text to wrap
    - font: PIL.ImageFont - the font object used to measure text width
    - max_width: int - the maximum width in pixels for each line of text
    
    Returns:
    - wrapped_text: str - the input text with line breaks added
    """
    words = text.split(' ')
    lines = []
    current_line = ""

    for word in words:
        # Check the width of the current line + the next word
        line_width = font.getbbox(current_line + " " + word)[2] if current_line else font.getbbox(word)[2]
        if line_width <= max_width:
            current_line += " " + word if current_line else word
        else:
            lines.append(current_line)
            current_line = word

    if current_line:
        lines.append(current_line)

    return "\n".join(lines)

def generate_text_image(text, size=(500, 100), font_size=24, color='white'):
    """ Generates an image with the given text using PIL, ensuring text wraps within the given width. """
    # Create a font object
    font = ImageFont.truetype("~/Library/Fonts/NotoSans-Regular.ttf", font_size)  # Replace with correct path to font if necessary
    
    # Wrap the text so it fits within the width
    wrapped_text = wrap_text(text, font, size[0])  # size[0] is the width of the image
    
    # Estimate height of the image based on number of lines
    lines = wrapped_text.split('\n')
    text_height = font.getbbox(wrapped_text)[3] * len(lines)  # Estimate height based on font size and line count
    
    # Create the image large enough to fit the text
    img = Image.new('RGB', (size[0], text_height + 20), color='black')  # Add padding
    
    # Draw the text
    d = ImageDraw.Draw(img)
    d.text((10, 10), wrapped_text, fill=color, font=font)
    
    # Save the image temporarily
    img.save("temp_text_image.png")
    
    return "temp_text_image.png"

def add_text_to_video(video_path, text_data, output_path):
    """
    Parameters:
    - video_path: str - path to the input video file
    - text_data: list of tuples [(timestamp1, text1), (timestamp2, text2), ...]
    - output_path: str - path to save the output video file
    """
    # Load the video
    video = VideoFileClip(video_path)
    
    # List to hold clips
    clips = [video]

    # Iterate over text data and add text at the specified timestamps
    for timestamp, text in text_data:
        # Generate text image using PIL
        text_img_path = generate_text_image(text)
        
        # Create an ImageClip
        txt_clip = (ImageClip(text_img_path)
                    .set_duration(5)  # Text display duration (5 seconds in this case)
                    .set_start(timestamp))  # Start time for text display

        # Add text clip to the list of clips
        clips.append(txt_clip.set_position((video.w - 500, 'center')))  # Align text in the blank space

    # Composite the video and text clips together
    final_video = CompositeVideoClip(clips, size=video.size)

    # Add original audio back to the final video
    final_video = final_video.set_audio(video.audio)

    # Write the final video to a file
    final_video.write_videofile(output_path, codec='libx264', audio_codec='aac')


In [171]:
video_path = "insta/in/output_video_with_blank_space.mp4"
output_path = "insta/out/output_video_with_text.mp4"

text_data = [(x["start"], "/!\ " + x["tactic"]+": "+x["explanation"]) for x in parsed_res[0]]

In [172]:
add_text_to_video(video_path, text_data, output_path)

Moviepy - Building video insta/out/output_video_with_text.mp4.
MoviePy - Writing audio in output_video_with_textTEMP_MPY_wvf_snd.mp4


                                                                                                                            

MoviePy - Done.
Moviepy - Writing video insta/out/output_video_with_text.mp4



                                                                                                                            

Moviepy - Done !
Moviepy - video ready insta/out/output_video_with_text.mp4


# WITH DIFF FONTS AND SYMBOLS

In [152]:
from moviepy.editor import VideoFileClip, ImageClip, CompositeVideoClip
from PIL import Image, ImageDraw, ImageFont

def generate_text_image_with_styles(text_parts, font_paths, size=(500, 200), font_size=24, color='white'):
    """
    Generates an image with styled text (bold, italic) and highlights parts of the text.
    - text_parts: list of tuples [(text, style)] - style can be 'bold', 'italic', 'highlight', 'normal'
    - font_paths: dict with 'normal', 'bold', 'italic' paths to font files
    - size: (width, height) of the image
    - font_size: size of the font
    - color: text color (default: white)
    """
    img = Image.new('RGB', size, color='black')
    d = ImageDraw.Draw(img)
    
    # Set initial x, y position
    x, y = 10, 10

    for part, style in text_parts:
        if style == "bold":
            font = ImageFont.truetype(font_paths['bold'], font_size)
        elif style == "italic":
            font = ImageFont.truetype(font_paths['italic'], font_size)
        else:
            font = ImageFont.truetype(font_paths['normal'], font_size)

        if style == "highlight":
            # Draw a rectangle for highlight (slightly bigger than text)
            text_bbox = d.textbbox((x, y), part, font=font)  # Get bounding box for text
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]
            d.rectangle([(x - 2, y - 2), (x + text_width + 2, y + text_height + 2)], fill="yellow")

        # Draw the text
        d.text((x, y), part, font=font, fill=color)
        
        # Update x position for the next part of the text
        text_bbox = d.textbbox((x, y), part, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        x += text_width + 5  # Add some padding between words
    
    # Save the image temporarily
    img.save("temp_text_image.png")
    return "temp_text_image.png"

def add_text_to_video_with_styles_and_audio(video_path, text_data, output_path):
    """
    Adds styled text (bold, italic, highlighted, emoji) to video and preserves the original audio.
    - video_path: str - path to the input video file
    - text_data: list of tuples [(timestamp1, text1_parts), (timestamp2, text2_parts), ...]
    - output_path: str - path to save the output video file
    """
    # Load the video
    video = VideoFileClip(video_path)
    
    # Font paths (replace these paths with actual paths to your font files)
    font_paths = {
        'normal': 'NotoSans-Regular.ttf',
        'bold': 'NotoSans-Bold.ttf',
        'italic': 'NotoSans-Italic.ttf',
        'emoji': 'NotoColorEmoji-Regular.ttf'
    }
    
    # List to hold clips
    clips = [video]

    # Iterate over text data and add text at the specified timestamps
    for timestamp, text_parts in text_data:
        # Generate text image with styles (bold, italic, highlights, etc.)
        text_img_path = generate_text_image_with_styles(text_parts, font_paths)
        
        # Create an ImageClip
        txt_clip = (ImageClip(text_img_path)
                    .set_duration(5)  # Text display duration (5 seconds in this case)
                    .set_start(timestamp)
                    .set_position((video.w - 500, 'center')))  # Align text in the blank space

        # Add text clip to the list of clips
        clips.append(txt_clip)

    # Composite the video and text clips together
    final_video = CompositeVideoClip(clips, size=video.size)

    # Add original audio back to the final video
    final_video = final_video.set_audio(video.audio)

    # Write the final video to a file
    final_video.write_videofile(output_path, codec='libx264', audio_codec='aac')

In [163]:
# Define text with parts having different styles
# Use tuple (text, style) where style can be 'bold', 'italic', 'highlight', 'normal'
text_data = [
    (3, [("⚠️", 'normal')]),
    (10, [("Another text here, normal.", 'normal')]),
    (15, [("Final text example.", 'bold')])
]

In [166]:
video_path = "insta/in/output_video_with_blank_space.mp4"
output_path = "insta/out/output_video_with_text.mp4"

text_data = [(x["start"], [("⚠️ " + x["tactic"]+": "+x["explanation"], "bold")]) for x in parsed_res[0]]

In [167]:
add_text_to_video_with_styles_and_audio(video_path, text_data, output_path)

Moviepy - Building video insta/out/output_video_with_text.mp4.
MoviePy - Writing audio in output_video_with_textTEMP_MPY_wvf_snd.mp4


                                                                                                                            

MoviePy - Done.
Moviepy - Writing video insta/out/output_video_with_text.mp4



                                                                                                                            

Moviepy - Done !
Moviepy - video ready insta/out/output_video_with_text.mp4


# Insta clip generation

In [None]:
!pip install mediapipe==0.10.15

In [None]:
import cv2
import mediapipe as mp

In [None]:
mp_face_detection = mp.solutions.face_detection
mp_drawing = mp.solutions.drawing_utils

# For static images:
IMAGE_FILES = ["insta/in/test.png"]
with mp_face_detection.FaceDetection(
    model_selection=1, min_detection_confidence=0.5) as face_detection:
  for idx, file in enumerate(IMAGE_FILES):
    image = cv2.imread(file)
    # Convert the BGR image to RGB and process it with MediaPipe Face Detection.
    results = face_detection.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    # Draw face detections of each face.
    if not results.detections:
      continue
    annotated_image = image.copy()
    for detection in results.detections:
      print('Nose tip:')
      print(mp_face_detection.get_key_point(
          detection, mp_face_detection.FaceKeyPoint.NOSE_TIP))
      mp_drawing.draw_detection(annotated_image, detection)
    cv2.imwrite('./insta/out/annotated_image' + str(idx) + '.png', annotated_image)

In [None]:
detection.location_data.relative_bounding_box

In [None]:
from PIL import Image, ImageDraw, ImageFont

# Load your PNG image
image = Image.open('insta/out/annotated_image0.png')

# Initialize the drawing context with the image as background
draw = ImageDraw.Draw(image)

# Load a font that supports special characters (Arial or DejaVu Sans)
# Use the appropriate path for the font file on your system
#font_path = "~/Library/Fonts/Poppins-ExtraLight.ttf"  # Change to your actual font path
#font_path = "~/Library/Fonts/Supplemental/Arial.ttf"
font_path = "~/Library/Fonts/Symbola.ttf"

font = ImageFont.truetype(font_path, size=20)

# Define the text and position (x, y) coordinates
text = """
⚠️ **Framing**: The way the situation is described (e.g., "climat délétère," "menaces de mort," "radicalisation")
frames the issue in a highly negative light, which can influence the audience's perception of the pro-Palestinian
movement and the overall atmosphere at EPFL.
"""

position = (50, 100)  # (x, y) coordinates
color = (245,35,35)  # White color (R, G, B format)

# Add text to the image
draw.rectangle(((40, 100), (1050, 200)), outline="#f52323", fill="white")
draw.text((50, 100), text, font=ImageFont.truetype(font_path, 20), fill="#f52323")
#draw.text(position, text, font=font, fill=color)

# Save the modified image to a new file
image.save('insta/out/image_with_text_pil.png')

# TO DO: auto position text based on detected face bounding box
# position text left/right of face based on rule
# multiline text with bold/italic fonts
# remove facial detection points
# emotion detection

# Scrap video captions

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver

In [None]:
driver = webdriver.Firefox()

In [None]:
driver.get("https://www.rts.ch/play/tv/19h30/video/19h30?urn=urn:rts:video:14926430")

In [None]:
soup = BeautifulSoup(driver.page_source)

In [None]:
clips = soup.find_all("p", {"class": "subdivision__description"})
clip_durations = soup.find_all("div", {"class": "subdivision__duration"})
topic_timestamps = {c.text:d.text for c,d in zip(clips, clip_durations)}
topic_timestamps

In [None]:
from numpy import cumsum

def time_to_seconds(time_str):
    minutes, seconds = map(int, time_str.split(':'))
    return minutes * 60 + seconds
    
topic_timestamps_s = [time_to_seconds(t) for t in topic_timestamps.values()]
topic_timestamps_s = cumsum(topic_timestamps_s)

for key, new_value in zip(topic_timestamps.keys(), topic_timestamps_s):
    topic_timestamps[key] = int(new_value)

topic_timestamps

# Legacy

In [None]:
!pip install ffmpeg-python

In [None]:
import os
import requests
import ffmpeg

# Set the base folder where your files are stored
base_folder = 'video'

# Function to download M4S segments
def download_segments(m3u8_url, output_file, base_url):
    segments = []
    with open(os.path.join(base_folder, m3u8_url), 'r') as m3u8_file:
        for line in m3u8_file:
            if line.startswith('#') or not line.strip():
                continue
            segments.append(line.strip())
    
    with open(os.path.join(base_folder, output_file), 'wb') as out_file:
        for segment in segments:
            segment_url = os.path.join(base_url, segment)
            response = requests.get(segment_url)
            out_file.write(response.content)

In [None]:
# Download the video and audio segments
video_m3u8 = 'index-f3-v1.m3u8'  # Adjust this to the appropriate video stream
audio_m3u8 = 'index-f1-a1.m3u8'  # Adjust this to the appropriate audio stream
base_url = 'https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/'  # Base URL for the segments

download_segments(video_m3u8, 'video.ts', base_url)
download_segments(audio_m3u8, 'audio.ts', base_url)

# Merge the video and audio using ffmpeg
output_file = 'output.mp4'
ffmpeg.input(os.path.join(base_folder, 'video.ts')).output(os.path.join(base_folder, 'audio.ts')).output(output_file).run()

print(f'Merged video and audio into {output_file}')

In [None]:
import os

# Folder containing the video segments
#folder_path = 'video/video_segments'
folder_path = 'video/audio_segments'

# Check if the folder exists
if not os.path.exists(folder_path):
    print(f"Folder {folder_path} does not exist.")
else:
    # Rename files from "segment_{i}.m4s" to "seg-{i}-f3-v1.m4s"
    for filename in os.listdir(folder_path):
        if filename.startswith('segment_') and filename.endswith('.m4s'):
            # Extract the segment number from the filename
            segment_number = filename.split('_')[1].split('.')[0]
            # Create new filename
            new_filename = f'seg-{segment_number}-f3-v1.m4s'
            # Full paths
            old_file_path = os.path.join(folder_path, filename)
            new_file_path = os.path.join(folder_path, new_filename)
            # Rename the file
            os.rename(old_file_path, new_file_path)
            print(f"Renamed {filename} to {new_filename}")

    print("File renaming completed.")


In [None]:
import os

# Folder containing the video segments
folder_path = 'video/audio_segments'

# Check if the folder exists
if not os.path.exists(folder_path):
    print(f"Folder {folder_path} does not exist.")
else:
    # Rename files from "seg-{i}-f3-v1.m4s" to "seg-{i}-f3-a1.m4s"
    for filename in os.listdir(folder_path):
        if filename.startswith('seg-') and '-f3-v1.m4s' in filename:
            # Create new filename by replacing 'v1' with 'a1'
            new_filename = filename.replace('-f3-v1.m4s', '-f3-a1.m4s')
            # Full paths
            old_file_path = os.path.join(folder_path, filename)
            new_file_path = os.path.join(folder_path, new_filename)
            # Rename the file
            os.rename(old_file_path, new_file_path)
            print(f"Renamed {filename} to {new_filename}")

    print("File renaming completed.")


In [None]:
import os
import subprocess

# Define paths to the segments and output file
base_folder = 'video/video_segments'
video_segments = [os.path.join(base_folder, f'seg-{i}-f3-v1.m4s') for i in range(232)]  # Adjust number of segments
audio_segments = [os.path.join('video/audio_segments', f'seg-{i}-f3-a1.m4s') for i in range(232)]  # Adjust number of segments
output_file = 'output.mp4'

# Concatenate video segments into one file
with open(os.path.join(base_folder, 'video_concat.ts'), 'wb') as video_concat:
    for video_segment in video_segments:
        with open(video_segment, 'rb') as vf:
            video_concat.write(vf.read())

# Concatenate audio segments into one file
with open(os.path.join(base_folder, 'audio_concat.ts'), 'wb') as audio_concat:
    for audio_segment in audio_segments:
        with open(audio_segment, 'rb') as af:
            audio_concat.write(af.read())

# Merge video and audio using ffmpeg
subprocess.run([
    'ffmpeg', '-i', os.path.join(base_folder, 'video_concat.ts'),
    '-i', os.path.join(base_folder, 'audio_concat.ts'),
    '-c:v', 'copy', '-c:a', 'copy', output_file
])

print(f'Merged video and audio into {output_file}')


# Download m4s files

In [None]:
# URL of the .m4s file
url = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/seg-1-f3-v1.m4s"
#url = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/seg-1-f1-a1.m4s"

# File name where the downloaded file will be saved
output_file = "../data/assets/19h30/video/24_05_2024/seg-1-f3-v1.m4s"
#output_file = "../data/assets/19h30/video/24_05_2024/seg-1-f1-a1.m4s"

# Send a GET request to the URL to download the file
response = requests.get(url, stream=True)

# Check if the request was successful
if response.status_code == 200:
    # Open the output file in write-binary mode
    with open(output_file, "wb") as file:
        # Write the content to the file in chunks to avoid loading the entire file into memory
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    print(f"Downloaded: {output_file}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")


# Concatenate files

In [None]:
import os
import requests
import subprocess

# DOWNLOAD M4S SEGMENTS

In [None]:
import os
import requests
from urllib.parse import urljoin

# Function to download .ts or .m4s segments from a .m3u8 file
def download_segments(m3u8_url, output_folder):
    # Extract the base URL from the .m3u8 URL
    base_url = m3u8_url.rsplit('/', 1)[0] + '/'

    response = requests.get(m3u8_url)
    playlist = response.text.splitlines()

    segment_urls = [line for line in playlist if not line.startswith("#")]
    
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    segment_files = []
    
    for i, segment_url in enumerate(segment_urls):
        # Handle relative URLs by joining them with the base URL
        if not segment_url.startswith("http"):
            segment_url = urljoin(base_url, segment_url)
        
        segment_file = os.path.join(output_folder, f"segment_{i}.m4s")
        segment_files.append(segment_file)
        
        # Download the segment
        segment_data = requests.get(segment_url).content
        with open(segment_file, "wb") as f:
            f.write(segment_data)
        
        print(f"Downloaded segment {i + 1}/{len(segment_urls)}")
    
    return segment_files


In [None]:
# VIDEO
video_m3u8_url = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/index-f3-v1.m3u8"  # Adjust this to your actual .m3u8 URL
output_folder = "../data/assets/19h30/video/24_05_2024/video_segments"

download_segments(video_m3u8_url, output_folder)

In [None]:
# AUDIO
video_m3u8_url = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/index-f1-a1.m3u8"  # Adjust this to your actual .m3u8 URL
output_folder = "../data/assets/19h30/video/24_05_2024/audio_segments"

download_segments(video_m3u8_url, output_folder)

# CONCATENATE SEGMENTS

In [None]:
import os
import subprocess

# Path to the directory containing video segments
video_segments_folder = "../data/assets/19h30/video/24_05_2024/video_segments"
audio_segments_folder = "../data/assets/19h30/video/24_05_2024/audio_segments"  # Update if audio is in a separate folder
output_video_file = "output_video.mp4"
final_output_file = "final_output.mp4"

# Step 1: Create a file listing all the video segments to concatenate
def create_segment_list_file(segment_folder, segment_prefix, output_file):
    segment_files = sorted([os.path.join(segment_folder, f) for f in os.listdir(segment_folder) if f.startswith(segment_prefix) and f.endswith('.m4s')])
    
    with open(output_file, "w") as f:
        for segment_file in segment_files:
            f.write(f"file '{segment_file}'\n")
    
    return output_file

# Step 2: Concatenate video segments into a single file
def concatenate_video_segments(segment_list_file, output_file):
    ffmpeg_concat_command = f"ffmpeg -f concat -safe 0 -i {segment_list_file} -c copy {output_file}"
    subprocess.run(ffmpeg_concat_command, shell=True)
    print(f"Video segments concatenated into: {output_file}")

# Step 3: Combine the concatenated video file with audio (if available)
def combine_video_audio(video_file, audio_file, output_file):
    ffmpeg_combine_command = f"ffmpeg -i {video_file} -i {audio_file} -c copy {output_file}"
    subprocess.run(ffmpeg_combine_command, shell=True)
    print(f"Video and audio combined into: {output_file}")

In [None]:
# Create the file listing the video segments
video_list_file = "../data/assets/19h30/video/24_05_2024/video_segment_list.txt"
create_segment_list_file(video_segments_folder, "segment_", video_list_file)

In [None]:
# reorder segments
import os
import re

# Folder containing the .m4s files
video_segments_folder = "../data/assets/19h30/video/24_05_2024/video_segments"
output_list_file = "../data/assets/19h30/video/24_05_2024/video_segment_list.txt"

# Function to extract the numeric index from the filename
def extract_number(filename):
    match = re.search(r'(\d+)', filename)
    return int(match.group(0)) if match else -1  # Return -1 if no number is found (unlikely in your case)

# Step 1: List and sort the .m4s files by their numeric index
segment_files = sorted([f for f in os.listdir(video_segments_folder) if f.endswith(".m4s")], key=extract_number)

# Step 2: Create a list file for FFmpeg with the correctly ordered .m4s files
with open(output_list_file, "w") as f:
    for segment_file in segment_files:
        segment_path = os.path.join(video_segments_folder, segment_file)
        f.write(f"file '{segment_path}'\n")

print(f"Created {output_list_file} with numerically ordered segments.")


In [None]:
output_list_file = "video_segment_list.txt"
output_list_file

In [None]:
output_video_file

In [None]:
# Concatenate the video segments
concatenate_video_segments(output_list_file, output_video_file)

In [None]:
# Combine video with audio if audio segments are available
# You may replace the audio file if it's a different format or path
audio_file = "path_to_audio_file.m4s"  # Adjust this to the actual audio file or segment
combine_video_audio(output_video_file, audio_file, final_output_file)

In [None]:
# Function to concatenate the downloaded .ts files
def concatenate_segments(segment_files, output_file):
    with open("file_list.txt", "w") as file_list:
        for segment in segment_files:
            file_list.write(f"file '{segment}'\n")
    
    # Use ffmpeg to concatenate the files
    ffmpeg_command = f"ffmpeg -f concat -safe 0 -i file_list.txt -c copy {output_file}"
    subprocess.run(ffmpeg_command, shell=True)

In [None]:
# Paths to your video and audio .m3u8 URLs
video_m3u8_url = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/seg-1-f3-v1.m4s"  # Replace with your URL
audio_m3u8_url = "https://rts-vod-amd.akamaized.net/ww/14926430/2fd70dc2-e142-3071-bc7a-a4d0ecdd7765/seg-1-f1-a1.m4s"  # Replace with your URL

In [None]:
# Output folders for video and audio segments
video_output_folder = "video_segments"
audio_output_folder = "audio_segments"

# Download video and audio segments
video_segment_files = download_segments(video_m3u8_url, video_output_folder)
audio_segment_files = download_segments(audio_m3u8_url, audio_output_folder)

In [None]:
# Concatenate video and audio files into one final file each
concatenated_video = "concatenated_video.ts"
concatenated_audio = "concatenated_audio.ts"

concatenate_segments(video_segment_files, concatenated_video)
concatenate_segments(audio_segment_files, concatenated_audio)

# Step 3: Combine video and audio into a single MP4 file using ffmpeg
final_output = "output.mp4"
ffmpeg_command = f"ffmpeg -i {concatenated_video} -i {concatenated_audio} -c copy {final_output}"
subprocess.run(ffmpeg_command, shell=True)

print(f"Video saved as {final_output}")

In [None]:
import os

# Path to the directory containing the .m4s files
video_segments_folder = "../data/assets/19h30/video/24_05_2024/video_segments"
output_file = "all.m4s"

# Remove output file if it exists
if os.path.exists(output_file):
    os.remove(output_file)

# Open the output file in binary append mode
with open(output_file, "ab") as outfile:
    # Sort and concatenate all .m4s files
    for segment_file in sorted(os.listdir(video_segments_folder)):
        if segment_file.endswith(".m4s"):
            segment_path = os.path.join(video_segments_folder, segment_file)
            with open(segment_path, "rb") as infile:
                outfile.write(infile.read())
            print(f"Appended {segment_file}")

print(f"Concatenation complete. Output is in {output_file}")


# Automate m4s scraping

In [None]:
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.devtools.v111 import devtools
from selenium.webdriver.common.devtools.v111.network import Network

In [None]:
# Initialize the Chrome driver
driver = webdriver.Firefox()

In [None]:

# Enable Network domain
driver.execute(devtools.DevTools.send(Network.enable()))

# Navigate to the page
url = 'https://www.rts.ch/play/tv/19h30/video/19h30?urn=urn:rts:video:15072997'
driver.get(url)

# Wait for some time to let the page load completely
driver.implicitly_wait(5)

# Retrieve network logs
logs = driver.execute(devtools.DevTools.send(Network.getResponseBody))

# Extract network logs and save them as a HAR file
har_entries = []

for log in logs:
    log_data = json.loads(log['message'])['message']
    if log_data['method'] == 'Network.requestWillBeSent':
        har_entries.append(log_data)

# Save HAR data to a file
har_data = {
    "log": {
        "version": "1.2",
        "creator": {
            "name": "Selenium",
            "version": "3.141.0"
        },
        "entries": har_entries
    }
}

In [None]:
with open('network.har', 'w') as har_file:
    json.dump(har_data, har_file, indent=4)

# Close the browser
driver.quit()

print("HAR file saved as network.har")

In [None]:
from pydub import AudioSegment

In [None]:
# Load the .m4s file
audio = AudioSegment.from_file("../data/assets/segment.m4s", format="m4s")

In [None]:
# Export to .mp4
audio.export("../data/assets/output.mp4", format="mp4")