The purpose of the following code is to take the video URLs that were fetched in notebook 1 (Scraping the relevant YouTube URLs) and retrieve the metadata of each video. Parts of this code were generated with the help of ChatGPT and adapted to the specific needs of this study. The main tool used to retrieve the metadata was the YouTube Data API v3.

In [None]:
!pip install isodate

Collecting isodate
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate
Successfully installed isodate-0.7.2


In [None]:
import os

# Read the YouTube Data API key from the environment so the real key never has
# to be written into the notebook; the literal is only the placeholder fallback.
api_key = os.environ.get("YOUTUBE_API_KEY", "api_key")  # again the real api key was replaced

# The file holds one URL per line; blank lines are skipped so that the count
# printed below reflects actual URLs.
with open("DW_URLs.txt", "r", encoding="utf-8") as f:
    video_urls_list = [line.strip() for line in f if line.strip()]

print(f"{len(video_urls_list)} URLs loaded")
print(video_urls_list[:5])

4302 URLs loaded
['https://www.youtube.com/watch?v=wLl0QLyZ-Ec', 'https://www.youtube.com/watch?v=5yj3k1xIB68', 'https://www.youtube.com/watch?v=yqqCS-U5QqY', 'https://www.youtube.com/watch?v=eWedRFWb7To', 'https://www.youtube.com/watch?v=9Pp4uXFQKH4']


In [None]:
import pandas as pd
from googleapiclient.discovery import build
from urllib.parse import urlparse, parse_qs
import isodate

def extract_video_id(url):
    """Return the YouTube video ID contained in ``url``, or ``None``.

    Recognised forms:
      * ``https://youtu.be/<id>`` (optionally followed by ``/`` or a query)
      * ``https://[www.|m.|music.]youtube.com/watch?v=<id>``
      * ``https://[www.|m.]youtube.com/{shorts,embed,live,v}/<id>``
      * the same paths on ``youtube-nocookie.com``

    Args:
        url: The URL string to inspect. Non-string values yield ``None``.

    Returns:
        The video ID as a string, or ``None`` when the URL is not a
        recognised YouTube video link or carries no ID.
    """
    if not isinstance(url, str):
        return None

    parsed = urlparse(url.strip())

    # ``hostname`` is already lower-cased by urlparse; drop a leading "www."
    # so that both spellings of each host are treated the same.
    host = parsed.hostname or ""
    if host.startswith("www."):
        host = host[len("www."):]

    # Non-empty path segments, e.g. "/shorts/abc/" -> ["shorts", "abc"].
    segments = [segment for segment in parsed.path.split("/") if segment]

    if host == "youtu.be":
        # Short links carry the ID as the first path segment.
        return segments[0] if segments else None

    if host in {"youtube.com", "m.youtube.com", "music.youtube.com", "youtube-nocookie.com"}:
        video_id = parse_qs(parsed.query).get("v", [None])[0]
        if video_id:
            return video_id
        # Path-style links: /shorts/<id>, /embed/<id>, /live/<id>, /v/<id>.
        if len(segments) >= 2 and segments[0] in {"shorts", "embed", "live", "v"}:
            return segments[1]

    return None

# Column order of the returned DataFrame; also used when no rows were fetched
# so that downstream code can still address the columns by name.
_VIDEO_COLUMNS = (
    "video_id",
    "video_url",
    "title",
    "description",
    "published_at",
    "view_count",
    "like_count",
    "comment_count",
    "video_length",
)

# Number of IDs sent per videos.list request (kept at 50 due to API restrictions).
_BATCH_SIZE = 50


def _count_or_default(stats, key, default):
    """Return ``stats[key]`` as an int, or ``default`` when the key is absent."""
    raw = stats.get(key)
    return int(raw) if raw is not None else default


def _video_item_to_row(item, missing_count=0):
    """Flatten one ``videos.list`` response item into a row dict."""
    snippet = item.get("snippet", {})
    stats = item.get("statistics", {})
    duration_iso = item.get("contentDetails", {}).get("duration")

    # The duration arrives as an ISO 8601 string; store it as whole seconds.
    duration_seconds = (
        int(isodate.parse_duration(duration_iso).total_seconds()) if duration_iso else None
    )

    return {
        "video_id": item["id"],
        "video_url": f"https://www.youtube.com/watch?v={item['id']}",
        "title": snippet.get("title"),
        "description": snippet.get("description"),
        "published_at": snippet.get("publishedAt"),
        "view_count": _count_or_default(stats, "viewCount", missing_count),
        "like_count": _count_or_default(stats, "likeCount", missing_count),
        "comment_count": _count_or_default(stats, "commentCount", missing_count),
        "video_length": duration_seconds,
    }


def scrape_video_metadata(api_key, video_urls, missing_count=0):
    """Fetch metadata for a list of YouTube video URLs via the YouTube Data API v3.

    Args:
        api_key: Developer key passed to the API client.
        video_urls: Iterable of video URLs. URLs from which no video ID can be
            extracted are ignored, and repeated IDs are requested only once.
        missing_count: Value stored in ``view_count`` / ``like_count`` /
            ``comment_count`` when the response omits that statistic. The
            default ``0`` keeps the previous behaviour; pass ``None`` to keep
            "not reported" distinguishable from a real zero.

    Returns:
        A DataFrame with one row per item in the API responses and the columns
        listed in ``_VIDEO_COLUMNS``. IDs that are absent from the responses
        produce no row, so the frame can be shorter than ``video_urls``.

    Raises:
        ValueError: If no video ID could be extracted from ``video_urls``.
    """
    youtube = build("youtube", "v3", developerKey=api_key)

    # Extract the IDs, drop unparseable URLs, and de-duplicate while keeping
    # the input order so that no video is requested (or returned) twice.
    video_ids = list(dict.fromkeys(
        video_id for video_id in map(extract_video_id, video_urls) if video_id
    ))
    if not video_ids:
        raise ValueError("No valid video IDs found.")

    rows = []
    for start in range(0, len(video_ids), _BATCH_SIZE):
        batch_ids = video_ids[start:start + _BATCH_SIZE]
        # maxResults is deliberately not sent: the videos.list reference states
        # it is not supported in conjunction with the id parameter.
        response = youtube.videos().list(
            part="snippet,statistics,contentDetails",
            id=",".join(batch_ids),
        ).execute()
        rows.extend(
            _video_item_to_row(item, missing_count) for item in response.get("items", [])
        )

    return pd.DataFrame(rows, columns=list(_VIDEO_COLUMNS))


In [None]:
# Fetch the metadata for every DW video URL loaded above.
DW_Video_data = scrape_video_metadata(api_key=api_key, video_urls=video_urls_list)

In [None]:
# Report how many videos came back, then write them to disk as CSV.
dw_csv_name = "DW_Video_data.csv"
print(DW_Video_data.shape[0])
DW_Video_data.to_csv(path_or_buf=dw_csv_name, index=False)

# Hand the CSV to the browser (Colab only) and show the frame as cell output.
from google.colab import files
files.download(dw_csv_name)
DW_Video_data

4271


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,video_id,video_url,title,description,published_at,view_count,like_count,comment_count,video_length
0,wLl0QLyZ-Ec,https://www.youtube.com/watch?v=wLl0QLyZ-Ec,Ex-Mitarbeiter von AfD-Politiker Maximilian Kr...,Gegen einen ehemaligen Mitarbeiter des AfD-Pol...,2025-08-05T14:01:25Z,15479,237,299,1001
1,5yj3k1xIB68,https://www.youtube.com/watch?v=5yj3k1xIB68,Ukraine: Wie russische Drohnen die Versorgung ...,Verletzte ukrainische Soldaten sitzen oft im K...,2025-08-05T12:02:57Z,9598,153,60,201
2,yqqCS-U5QqY,https://www.youtube.com/watch?v=yqqCS-U5QqY,Wie Putin Schüler für den Drohnenkrieg benutzt...,Russland überzieht die Ukraine mit massiven Dr...,2025-08-04T16:34:59Z,24052,434,197,871
3,eWedRFWb7To,https://www.youtube.com/watch?v=eWedRFWb7To,Jared Kushner will in Albanien ein Luxus-Resso...,"Trumps Schwiegersohn, Jared Kushner, will in A...",2025-08-01T18:00:01Z,11699,186,43,404
4,9Pp4uXFQKH4,https://www.youtube.com/watch?v=9Pp4uXFQKH4,Iran zwingt Flüchtlinge zur Rückkehr nach Afgh...,"Feindseligkeit, steigende Preise und politisch...",2025-08-01T13:30:08Z,21734,250,165,183
...,...,...,...,...,...,...,...,...,...
4266,Ii7w7VNFo2s,https://www.youtube.com/watch?v=Ii7w7VNFo2s,Coronavirus: Deutsche in Quarantäne | DW Deutsch,Deutschland hat mit der Evakuierung seiner Sta...,2020-02-03T10:01:13Z,30046,254,151,290
4267,Qy9QQgqSQFg,https://www.youtube.com/watch?v=Qy9QQgqSQFg,Coronavirus in China: Deutsche werden aus Wuha...,Die Weltgesundheitsorganisation (WHO) hat ange...,2020-01-31T17:10:25Z,302611,1568,379,589
4268,ojfy9oanvOs,https://www.youtube.com/watch?v=ojfy9oanvOs,Coronavirus in China: Sebastian steckt in Wuha...,British Airways und Lufthansa - die großen eur...,2020-01-30T09:13:14Z,102134,663,323,752
4269,rrFCUxvDV6s,https://www.youtube.com/watch?v=rrFCUxvDV6s,Coronavirus in China: Deutscher Student berich...,Die Zahl der Todesopfer des Corona-Virus steig...,2020-01-29T12:59:34Z,186828,1179,549,649


In [None]:
# Import first so that a missing Colab environment fails before the API calls.
from google.colab import files

# The file holds one URL per line; blank lines are skipped so that the count
# printed below reflects actual URLs.
with open("BILD_URLs.txt", "r", encoding="utf-8") as f:
    BILD_urls_list = [line.strip() for line in f if line.strip()]

print(f"{len(BILD_urls_list)} URLs loaded")

BILD_Video_data = scrape_video_metadata(api_key, BILD_urls_list)
print(len(BILD_Video_data))
# Rich table rendering instead of the line-wrapped plain-text dump of print().
display(BILD_Video_data)
BILD_Video_data.to_csv("BILD_Video_data.csv", index=False)
files.download("BILD_Video_data.csv")

14229 URLs loaded
13884
          video_id                                    video_url  \
0      Jf7BcLknC1I  https://www.youtube.com/watch?v=Jf7BcLknC1I   
1      KFtEWl4s-sk  https://www.youtube.com/watch?v=KFtEWl4s-sk   
2      k5wLZkD9NZM  https://www.youtube.com/watch?v=k5wLZkD9NZM   
3      3j1EFKPSGoE  https://www.youtube.com/watch?v=3j1EFKPSGoE   
4      QkGZYUbNnGE  https://www.youtube.com/watch?v=QkGZYUbNnGE   
...            ...                                          ...   
13879  eJmhwxEZSI0  https://www.youtube.com/watch?v=eJmhwxEZSI0   
13880  f7AU1yQQXhs  https://www.youtube.com/watch?v=f7AU1yQQXhs   
13881  5CGR376LRr8  https://www.youtube.com/watch?v=5CGR376LRr8   
13882  xzOehwU17V0  https://www.youtube.com/watch?v=xzOehwU17V0   
13883  7RPL-IBo03Q  https://www.youtube.com/watch?v=7RPL-IBo03Q   

                                                   title  \
0      Leopard und Puma im Einsatz: Bundeswehr probt ...   
1      Jagd auf jüdische Passagiere: Islamisten-Mob

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Filter only relevant years
def filter_years(df, years=(2020, 2021, 2022, 2023)):
    """Return the rows of ``df`` whose ``published_at`` falls in one of ``years``.

    The input frame is left untouched; the result is an independent copy in
    which ``published_at`` has been converted to datetimes.

    Args:
        df: DataFrame with a ``published_at`` column of timestamp strings
            (or datetimes).
        years: Calendar years to keep. Defaults to 2020-2023.

    Returns:
        A new DataFrame containing only the matching rows, with the original
        index labels preserved. Rows whose timestamp cannot be parsed are
        dropped, because they are coerced to NaT and match no year.
    """
    published = pd.to_datetime(df["published_at"], errors="coerce")
    # NOTE(review): the year is taken in the timestamp's own time zone, not
    # converted to local time - confirm that this is the intended cut-off.
    in_range = published.dt.year.isin(list(years))
    # assign() builds a new frame, so the caller's DataFrame is not mutated;
    # copy() detaches the result so later column writes do not warn.
    return df.assign(published_at=published).loc[in_range].copy()


# Apply the year filter to both outlets' metadata, BILD first and then DW.
BILD_video_data_cleaned, DW_video_data_cleaned = (
    filter_years(frame) for frame in (BILD_Video_data, DW_Video_data)
)


In [None]:
# Output file name -> filtered frame, in the order they are written and downloaded.
cleaned_exports = {
    "BILD_video_data_cleaned.csv": BILD_video_data_cleaned,
    "DW_video_data_cleaned.csv": DW_video_data_cleaned,
}

# Write every CSV first ...
for csv_name, frame in cleaned_exports.items():
    frame.to_csv(csv_name, index=False)

# ... then trigger the browser downloads (uses the Colab `files` helper imported earlier).
for csv_name in cleaned_exports:
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>