# Clean youtube/youtube music activity

In [1]:
import glob
import os

import pandas as pd
import pytz

data_file = "/home/haaksk/Downloads/takeout-20260105T143128Z-3-001/Takeout/My Activity/YouTube/MyActivity.json"

In [2]:
df = pd.read_json(data_file)
oslo_tz = pytz.timezone("Europe/Oslo")
df["time_local"] = pd.to_datetime(df["time"], format="ISO8601", utc=True).dt.tz_convert(oslo_tz)

df["year"] = df["time_local"].dt.year
df["month"] = df["time_local"].dt.month
df["day"] = df["time_local"].dt.day
df["hour"] = df["time_local"].dt.hour

print(df.activityControls.value_counts())
print("")
print(df.header.value_counts())
print("")
print(df.columns)

# extract the first word of the title
print(df.title.str.extract("^(\\w+)").value_counts())

activityControls
[YouTube watch history]                                                69891
[YouTube search history]                                                8275
[Web & App Activity, YouTube watch history]                                9
[Web & App Activity, YouTube watch history, YouTube search history]        1
Name: count, dtype: int64

header
YouTube          48864
YouTube Music    30563
Name: count, dtype: int64

Index(['header', 'title', 'titleUrl', 'subtitles', 'time', 'products',
       'activityControls', 'details', 'time_local', 'year', 'month', 'day',
       'hour'],
      dtype='object')
0         
Watched       67353
Searched       8275
Viewed         2267
Liked           843
Subscribed      317
Voted           171
Dismissed       163
Saved            21
Answered          7
Disliked          6
Joined            3
Visited           1
Name: count, dtype: int64


In [3]:
df_sub = df[df.title.str.startswith("Subscribed")].copy()
df = df[df.title.str.startswith("Watched")]
print(df.shape)

(67353, 13)


In [4]:
df['title'] = df['title'].str.replace(r'^Watched ', '', regex=True)

df = df.drop(columns=["activityControls", "products", "details"], errors="ignore")
df.head(2)

Unnamed: 0,header,title,titleUrl,subtitles,time,time_local,year,month,day,hour
0,YouTube Music,Sunset,https://music.youtube.com/watch?v=kuE-5p7nxxk,"[{'name': 'The Midnight - Topic', 'url': 'http...",2026-01-05T14:19:28.863Z,2026-01-05 15:19:28.863000+01:00,2026,1,5,15
1,YouTube Music,Above the Sky,https://music.youtube.com/watch?v=c_C_1S5s3yQ,"[{'name': 'Majestica - Topic', 'url': 'https:/...",2026-01-05T14:13:37.581Z,2026-01-05 15:13:37.581000+01:00,2026,1,5,15


In [5]:
df_ytm = df[df["header"] == "YouTube Music"].copy()
print(df_ytm.shape)
df_yt = df[df["header"] != "YouTube Music"].copy()
print(df_yt.shape)

(28561, 10)
(38792, 10)


In [6]:
df_ytm.rename(columns={"title":"song"})

def extract_artist(subtitles):
    if pd.isna(subtitles) or not subtitles:
        return None
    artist_full = subtitles[0].get("name", None) if len(subtitles) > 0 else None
    if artist_full and " - " in artist_full:
        return artist_full.rsplit(" - ", 1)[0]
    return artist_full


df_ytm["artist"] = df_ytm["subtitles"].apply(extract_artist)
df_ytm = df_ytm.drop(columns=["titleUrl", "subtitles", "header", "time"], errors="ignore")

print(df_ytm.shape)
df_ytm.head()

(28561, 7)


Unnamed: 0,title,time_local,year,month,day,hour,artist
0,Sunset,2026-01-05 15:19:28.863000+01:00,2026,1,5,15,The Midnight
1,Above the Sky,2026-01-05 15:13:37.581000+01:00,2026,1,5,15,Majestica
2,Ordinary Story,2026-01-05 15:09:21.283000+01:00,2026,1,5,15,In Flames
3,All I Need (with Mahalia & Ty Dolla $ign),2026-01-05 15:05:17.536000+01:00,2026,1,5,15,Jacob Collier
4,Alive,2026-01-05 15:01:52.982000+01:00,2026,1,5,15,Empire of The Sun


# Spotify data

In [7]:
spotify_data_folder = "~/Downloads/MyData/"

In [8]:
spotify_files = glob.glob(os.path.expanduser(spotify_data_folder) + "endsong_*.json")
dfs = []

for file_path in sorted(spotify_files):
    df_temp = pd.read_json(file_path)
    dfs.append(df_temp)

df_s = pd.concat(dfs, ignore_index=True)

In [9]:
df_s["time_local"] = pd.to_datetime(df_s["ts"], format="ISO8601", utc=True).dt.tz_convert(oslo_tz)

df_s["year"] = df_s["time_local"].dt.year
df_s["month"] = df_s["time_local"].dt.month
df_s["day"] = df_s["time_local"].dt.day
df_s["hour"] = df_s["time_local"].dt.hour
df_s = df_s.rename(
    columns={
        "master_metadata_track_name": "song",
        "master_metadata_album_artist_name": "artist",
    }
)

df_s = df_s[["time_local", "year", "month", "day", "hour", "song", "artist"]]

In [10]:
df_s

Unnamed: 0,time_local,year,month,day,hour,song,artist
0,2010-04-29 22:10:28+02:00,2010,4,29,22,Countdown [Designer Drugs Remix],Jupiter One
1,2010-04-29 22:17:10+02:00,2010,4,29,22,These Woods Breathe Evil,Swallow The Sun
2,2010-04-29 22:20:46+02:00,2010,4,29,22,Falling World,Swallow The Sun
3,2010-04-29 22:20:47+02:00,2010,4,29,22,...and Heavens Cried Blood,Swallow The Sun
4,2010-04-29 22:20:48+02:00,2010,4,29,22,Sleepless Swans,Swallow The Sun
...,...,...,...,...,...,...,...
281068,2018-03-13 13:25:43+01:00,2018,3,13,13,III Ways to Epica,Kamelot
281069,2018-03-13 13:28:23+01:00,2018,3,13,13,Sea Of Machines,Ayreon
281070,2018-03-13 13:30:55+01:00,2018,3,13,13,One Night Stand,Vidar Villa
281071,2018-03-13 13:31:57+01:00,2018,3,13,13,Herregud,Vidar Villa


# Final merge of spotify and youtube music data

In [11]:
print(df_s.shape)
print(df_ytm.shape)

df_tot = pd.concat([df_s, df_ytm])

df_tot.to_parquet("~/Workspace/EchoVault/data/music_history.parquet")

(281073, 7)
(28561, 7)


# Youtube subscription

In [12]:
print(df_sub.shape)
df_sub.head()

(317, 13)


Unnamed: 0,header,title,titleUrl,subtitles,time,products,activityControls,details,time_local,year,month,day,hour
2524,YouTube,Subscribed to Just Alex,https://www.youtube.com/channel/UCQgL66VBbwihX...,,2025-11-30T12:08:58.820Z,[YouTube],,,2025-11-30 13:08:58.820000+01:00,2025,11,30,13
2565,YouTube,Subscribed to ArjanCodes,https://www.youtube.com/channel/UCVhQ2NnY5Rskt...,,2025-11-29T20:06:06.459Z,[YouTube],,,2025-11-29 21:06:06.459000+01:00,2025,11,29,21
2636,YouTube,Subscribed to Christian Lempa,https://www.youtube.com/channel/UCZNhwA1B5YqiY...,,2025-11-28T21:29:01.017Z,[YouTube],,,2025-11-28 22:29:01.017000+01:00,2025,11,28,22
4009,YouTube,Subscribed to Cannons,https://www.youtube.com/channel/UCq5ePOCJ5iYJE...,,2025-11-13T21:29:47.725Z,[YouTube],,,2025-11-13 22:29:47.725000+01:00,2025,11,13,22
4159,YouTube,Subscribed to GoatsInAPond,https://www.youtube.com/channel/UC5RR0wEGE1OLi...,,2025-11-11T09:54:07.417Z,[YouTube],,,2025-11-11 10:54:07.417000+01:00,2025,11,11,10


In [13]:
df_sub.title = df_sub.title.str.replace("^Subscribed to ", "", regex=True)
df_sub = df_sub.rename(
    columns={
        "title": "channel",
    }
)
df_sub = df_sub[["channel", "time_local", "year", "month", "day", "hour"]]
df_sub.head()

df_sub.to_parquet("~/Workspace/EchoVault/data/subscriptions.parquet")

# Youtube 

In [14]:
df_yt.head()

Unnamed: 0,header,title,titleUrl,subtitles,time,time_local,year,month,day,hour
16,YouTube,1990s carpentry work in the United States #han...,https://www.youtube.com/watch?v=p_j9gzmfKJ0,"[{'name': 'Mister DIY', 'url': 'https://www.yo...",2026-01-05T12:45:58.977Z,2026-01-05 13:45:58.977000+01:00,2026,1,5,13
17,YouTube,"THIS ONE IS PERSONAL - Redlands, California vs...",https://www.youtube.com/watch?v=iagjGsVliP0,"[{'name': 'Phil Gaimon', 'url': 'https://www.y...",2026-01-05T12:36:43.920Z,2026-01-05 13:36:43.920000+01:00,2026,1,5,13
18,YouTube,Ideapad Slim 5 15 inch gen 10 AMD,https://www.youtube.com/watch?v=4bxFD2l723k,"[{'name': 'Lenovo Latam', 'url': 'https://www....",2026-01-05T12:22:05.897Z,2026-01-05 13:22:05.897000+01:00,2026,1,5,13
19,YouTube,Intel's making BIG progress - Lenovo IdeaPad P...,https://www.youtube.com/watch?v=I0k-hY3jSoA,"[{'name': 'ShortCircuit', 'url': 'https://www....",2026-01-05T12:15:16.860Z,2026-01-05 13:15:16.860000+01:00,2026,1,5,13
20,YouTube,Lenovo IdeaPad Pro 5 - The BEST One so Far!,https://www.youtube.com/watch?v=kxNk4g7pxms,"[{'name': 'Matthew Moniz', 'url': 'https://www...",2026-01-05T12:15:10.186Z,2026-01-05 13:15:10.186000+01:00,2026,1,5,13


In [15]:
def extract_channel_name(subtitles):
    if pd.isna(subtitles) or not subtitles:
        return None

    return subtitles[0].get("name", None) if len(subtitles) > 0 else None


def extract_channel_id(subtitles):
    if pd.isna(subtitles) or not subtitles:
        return None

    channel_id_full = subtitles[0].get("url", None) if len(subtitles) > 0 else None
    return channel_id_full.rsplit("channel/", 1)[1]


df_yt["channel_name"] = df_yt["subtitles"].apply(extract_channel_name)
df_yt["channel_id"] = df_yt["subtitles"].apply(extract_channel_id)

df_yt = df_yt[["title", "time_local", "channel_name", "channel_id", "year", "month", "day"]]
df_yt.shape

(38792, 7)

In [16]:
df_yt.to_parquet("~/Workspace/EchoVault/data/youtube_history.parquet")

# Stats

In [17]:
df_tot.year.value_counts().sort_index()

year
2010     8958
2011     7736
2012    21937
2013    20567
2014    24948
2015    27403
2016    25406
2017    27289
2018    20577
2019    22379
2020    20547
2021    22729
2022    22862
2023     7754
2024    13180
2025    15091
2026      271
Name: count, dtype: int64

In [18]:
df_yt.year.value_counts().sort_index()

year
2020        7
2021      361
2022     6765
2023     9453
2024    12254
2025     9871
2026       81
Name: count, dtype: int64