In [156]:
import getpass
import os
import re
from datetime import timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [157]:
# Read the watch-history HTML export and turn every entry into one record.
with open("/content/watch-history.html", "r", encoding="utf-8") as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, "html.parser")

watched_videos = []

# Each history entry is rendered as one "outer-cell" card.
videos = soup.find_all("div", class_="outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp")

for video in videos:
    video_info = video.find("div", class_="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1")
    if video_info is None:
        # Card without the expected body cell: nothing to extract.
        continue

    # Look the links up once: the first is used as the video link, the
    # second (when present) as the channel link.
    links = video_info.find_all("a")
    if not links:
        continue

    video_title_url = links[0]
    video_url = video_title_url["href"]
    video_title = video_title_url.text

    channel_info = links[1] if len(links) > 1 else None
    channel_name = channel_info.text if channel_info else None
    channel_url = channel_info["href"] if channel_info else None

    # The last non-empty text fragment of the cell is taken as the timestamp.
    text_nodes = list(video_info.stripped_strings)
    time_watched = text_nodes[-1] if text_nodes else None

    # Entries whose caption cell mentions "From Google Ads" are flagged as ads;
    # a missing caption cell simply means "not an ad" instead of crashing.
    ad_info = video.find("div", class_="content-cell mdl-cell mdl-cell--12-col mdl-typography--caption")
    is_ad = ad_info is not None and "From Google Ads" in ad_info.text

    watched_videos.append({
        "videoURL": video_url,
        "title": video_title,
        "channelName": channel_name,
        "channelURL": channel_url,
        "timeWatched": time_watched,
        "isAd": is_ad
    })

watch_history_df = pd.DataFrame(watched_videos)

In [158]:
# Drop rows that have no channel name and are not ads
# (presumably videos that are no longer available — confirm).
missing_channel = watch_history_df["channelName"].isnull()
not_ad = ~watch_history_df["isAd"]
unavailable_videos = watch_history_df.loc[missing_channel & not_ad].index
watch_history_df.drop(index=unavailable_videos, inplace=True)

In [159]:
# Parse the scraped timestamp text into timezone-aware datetimes. The format
# expects strings shaped like "Dec 31, 2024, 11:58:03 PM GMT+03:00".
# NOTE(review): no errors="coerce" here, so a value that does not match the
# format is expected to raise rather than become NaT — confirm this is intended.
watch_history_df["timeWatched"] = pd.to_datetime(watch_history_df["timeWatched"], format="%b %d, %Y, %I:%M:%S %p GMT%z")

In [160]:
# isAd is a boolean column, so its sum is the number of ad entries.
ads_watched = watch_history_df["isAd"].sum()
print("Total number of ads watched:", ads_watched)

Total number of ads watched: 2670


In [161]:
# Keep only non-ad entries watched in July through December (inclusive), then
# discard the now-constant isAd column and renumber the rows from 0.
# NOTE(review): only the month is checked, not the year — if the export spans
# several years, Jul-Dec of every year is kept. Confirm this is intended.
in_second_half = watch_history_df["timeWatched"].dt.month.between(7, 12)
not_ad = ~watch_history_df["isAd"]
watch_history_df = (
    watch_history_df[in_second_half & not_ad]
    .drop(columns=["isAd"])
    .reset_index(drop=True)
)
print(watch_history_df.shape)

(4096, 5)


In [162]:
# Summarise column dtypes and non-null counts of the filtered frame.
watch_history_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4096 entries, 0 to 4095
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype                    
---  ------       --------------  -----                    
 0   videoURL     4096 non-null   object                   
 1   title        4096 non-null   object                   
 2   channelName  4096 non-null   object                   
 3   channelURL   4096 non-null   object                   
 4   timeWatched  4096 non-null   datetime64[ns, UTC+03:00]
dtypes: datetime64[ns, UTC+03:00](1), object(4)
memory usage: 160.1+ KB


In [163]:
# Pull the value of the "v" query parameter out of each watch URL. The pattern
# is anchored on "?" or "&" and stops at the next "&" or "#", so extra
# parameters (e.g. "&t=30s", "&list=...") never end up inside the ID and
# unrelated substrings that merely contain "v=" are not matched.
watch_history_df["videoId"] = watch_history_df["videoURL"].str.extract(
    r"[?&]v=([^&#]+)", expand=False
)

# URLs without a "v" parameter yield NaN; those rows cannot be looked up, so
# drop them and renumber the remaining rows from 0.
watch_history_df = watch_history_df.dropna(subset=["videoId"]).reset_index(drop=True)

# Never hardcode credentials in a notebook: they leak through version control
# and shared copies. Read the key from the YOUTUBE_API_KEY environment
# variable, falling back to an interactive (non-echoing) prompt.
# NOTE(review): a key was previously committed on this line; it should be
# revoked/rotated in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY") or getpass.getpass("YouTube Data API key: ")

def get_video_details(video_id, timeout=10):
    """Fetch snippet, statistics and contentDetails for one video.

    Args:
        video_id: YouTube video ID to look up.
        timeout: Seconds to wait for the API before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON response (a dict) on HTTP 200, otherwise ``None``.
        Failures are reported with ``print`` rather than raised so that a
        single bad lookup does not abort a long enrichment loop.
    """
    try:
        # Let requests build and encode the query string instead of
        # interpolating values into the URL by hand.
        response = requests.get(
            "https://www.googleapis.com/youtube/v3/videos",
            params={
                "part": "snippet,statistics,contentDetails",
                "id": video_id,
                "key": API_KEY,
            },
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can contain the
        # request URL, which includes the API key.
        print(f"Error: {video_id}: request failed ({type(exc).__name__})")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error: {video_id}: {response.status_code}, {response.text}")
    return None

# Metadata columns added from the API response, in output order.
detail_columns = [
    "kind",
    "publishedAt",
    "tags",
    "categoryId",
    "defaultAudioLanguage",
    "duration",
    "definition",
    "viewCount",
    "likeCount",
]

# A video may appear many times in the history; look each distinct ID up only
# once and collect plain records instead of writing into the frame cell by
# cell while iterating over it.
video_records = []
for video_id in watch_history_df["videoId"].unique():
    video_details = get_video_details(video_id)

    # Skip failed requests and responses that contain no items.
    if not video_details or not video_details.get("items"):
        continue

    item = video_details["items"][0]
    snippet = item.get("snippet", {})
    content_details = item.get("contentDetails", {})
    statistics = item.get("statistics", {})

    video_records.append({
        "videoId": video_id,
        "kind": item.get("kind"),
        "publishedAt": snippet.get("publishedAt"),
        # Tags arrive as a list; store them as one comma-separated string.
        "tags": ",".join(snippet.get("tags", [])),
        "categoryId": snippet.get("categoryId"),
        "defaultAudioLanguage": snippet.get("defaultAudioLanguage"),
        "duration": content_details.get("duration"),
        "definition": content_details.get("definition"),
        "viewCount": statistics.get("viewCount"),
        "likeCount": statistics.get("likeCount"),
    })

# Passing `columns` guarantees every metadata column exists even when no
# lookup succeeded, so later cells never fail on a missing column.
video_details_df = pd.DataFrame(video_records, columns=["videoId"] + detail_columns)

# Left-merge keeps every history row in its original order. Dropping any
# previously added metadata columns first makes the cell safe to re-run.
watch_history_df = (
    watch_history_df
    .drop(columns=detail_columns, errors="ignore")
    .merge(video_details_df, on="videoId", how="left")
)

In [164]:
# Convert the publication timestamps to datetimes; unparseable or missing
# values become NaT instead of raising.
watch_history_df["publishedAt"] = pd.to_datetime(watch_history_df["publishedAt"], errors="coerce")

# Videos without tags were stored as "" — turn those into missing values.
# The dict form is used on purpose: `replace("", None)` with a scalar and an
# explicit None was handled differently before pandas 1.4 (it fell back to
# forward-filling), while the mapping form is unambiguous.
watch_history_df["tags"] = watch_history_df["tags"].replace({"": None})

In [165]:
def duration_to_timedelta(duration):
    """Convert an ISO 8601 duration string to a ``timedelta``.

    Supports the ``PT#H#M#S`` form as well as the ``P#DT#H#M#S`` form used
    for durations of a day or more. Every component is optional.

    Args:
        duration: Duration string such as ``"PT3M56S"`` or ``"P1DT2H"``.
            A value that is already a ``timedelta`` is returned unchanged so
            the conversion can be applied twice without losing data.

    Returns:
        The corresponding ``timedelta``, or ``None`` when the value is not a
        string (e.g. ``None``/NaN for videos without details) or does not
        match the expected format.
    """
    if isinstance(duration, timedelta):
        return duration
    if not isinstance(duration, str):
        return None

    # fullmatch (rather than match) rejects strings with trailing garbage.
    match = re.fullmatch(
        r"P((?P<days>\d+)D)?(T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+)S)?)?",
        duration,
    )

    if not match:
        return None

    # Components absent from the string are captured as None and count as 0.
    parts = {name: int(value) if value else 0 for name, value in match.groupdict().items()}

    return timedelta(**parts)

# Replace the raw duration strings (e.g. "PT3M56S") with timedelta objects.
watch_history_df["duration"] = watch_history_df["duration"].apply(duration_to_timedelta)

In [166]:
def get_video_categories(region_code="US", timeout=10):
    """Fetch the YouTube video categories as an id -> title mapping.

    Args:
        region_code: Region whose category list is requested (sent as the
            ``regionCode`` query parameter). Defaults to ``"US"``.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        dict mapping each category ``id`` to its ``snippet.title``.

    Raises:
        RuntimeError: If the API does not answer with HTTP 200. The message
            carries the status code and response body; the request URL is
            deliberately left out because it contains the API key.
    """
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videoCategories",
        params={"part": "snippet", "regionCode": region_code, "key": API_KEY},
        timeout=timeout,
    )

    # Fail with a readable error instead of a KeyError on a missing "items".
    if response.status_code != 200:
        raise RuntimeError(
            f"videoCategories request failed: {response.status_code}, {response.text}"
        )

    categories = response.json()
    return {
        category["id"]: category["snippet"]["title"]
        for category in categories.get("items", [])
    }

# Fetch the id -> title mapping once, then translate each row's categoryId
# into a readable category name (ids absent from the mapping become NaN).
video_categories = get_video_categories()
watch_history_df["categoryName"] = watch_history_df["categoryId"].map(video_categories)

In [167]:
# Display the enriched frame (the rendered output is truncated for long frames).
watch_history_df

Unnamed: 0,videoURL,title,channelName,channelURL,timeWatched,videoId,kind,publishedAt,tags,categoryId,defaultAudioLanguage,duration,definition,viewCount,likeCount,categoryName
0,https://www.youtube.com/watch?v=HKskNNr1nQs,Squid Game but it's FUNNIER than you remember,bobbletea,https://www.youtube.com/channel/UCkA-f9q79ydO5...,2024-12-31 23:58:03+03:00,HKskNNr1nQs,youtube#video,2021-10-22 13:58:52+00:00,"squid game,squid game funny,squid game edit,sq...",24,en,0 days 00:03:56,hd,2044342,76394,Entertainment
1,https://www.youtube.com/watch?v=mOjPGtvwR5Q,Idols lost visual?! #kpop #whatthekpop #twice ...,What The Kpop,https://www.youtube.com/channel/UCv56Iq3nT_8ce...,2024-12-31 23:57:59+03:00,mOjPGtvwR5Q,youtube#video,2024-12-07 16:10:49+00:00,,22,,0 days 00:00:57,hd,882564,58338,People & Blogs
2,https://www.youtube.com/watch?v=2JRFZAyn_ic,Larray & Quen Blackwell - “Scared” by BeatKing...,larrisgrace,https://www.youtube.com/channel/UCDW9iakqWF04I...,2024-12-31 23:57:10+03:00,2JRFZAyn_ic,youtube#video,2024-12-13 23:55:56+00:00,,24,en-US,0 days 00:00:15,hd,1181255,97246,Entertainment
3,https://www.youtube.com/watch?v=UyXfBI508m0,& now they’re going on tour together👑🤝👑 #kendr...,GROOVE ORDER,https://www.youtube.com/channel/UCZRM2X0hRbALZ...,2024-12-31 23:33:33+03:00,UyXfBI508m0,youtube#video,2024-12-18 20:00:23+00:00,,10,,0 days 00:01:00,hd,556936,44364,Music
4,https://www.youtube.com/watch?v=L7kyGbDICZ4,bambam and jessi talking about freezing eggs😂 ...,kuebskubs,https://www.youtube.com/channel/UCjmw_M1nFozrg...,2024-12-31 23:33:32+03:00,L7kyGbDICZ4,youtube#video,2023-10-29 15:31:37+00:00,,22,,0 days 00:00:53,hd,5718613,413125,People & Blogs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4089,https://www.youtube.com/watch?v=K2b9iGtQ5nA,Last Week of College Ever,Fred Liu,https://www.youtube.com/channel/UCCKidnYibSJ76...,2024-07-02 00:13:54+03:00,K2b9iGtQ5nA,youtube#video,2024-07-01 06:37:57+00:00,,22,zh-Hans,0 days 00:13:15,hd,653426,39099,People & Blogs
4090,https://www.youtube.com/watch?v=oB3023Cbe2I,Kenan Yıldız Ropörtaj,Futbol Gazetesi,https://www.youtube.com/channel/UCacYqWTI_FeQz...,2024-07-02 00:12:37+03:00,oB3023Cbe2I,youtube#video,2024-06-19 22:28:43+00:00,,22,,0 days 00:00:21,hd,190999,4874,People & Blogs
4091,https://www.youtube.com/watch?v=umr_q7XGqMk,Speed’in İstanbulda Yaşadığı Olay! #ishowspeed...,BucsDen,https://www.youtube.com/channel/UCefNXxWZDJB3y...,2024-07-02 00:08:57+03:00,umr_q7XGqMk,youtube#video,2023-12-20 16:11:33+00:00,"messi,ronaldo,lionel messi,messi skills,messi ...",22,tr,0 days 00:00:44,hd,7076796,244173,People & Blogs
4092,https://www.youtube.com/watch?v=DLn7GB30BnQ,These MITES live on your FACE!,Walt (oneminmicro),https://www.youtube.com/channel/UCGmfsRVSM9FG5...,2024-07-01 00:00:57+03:00,DLn7GB30BnQ,youtube#video,2023-01-26 13:08:14+00:00,,22,,0 days 00:00:54,hd,467734,30364,People & Blogs


In [168]:
# Persist the enriched history as UTF-8 CSV, without writing the row index.
watch_history_df.to_csv("youtube_watch_history.csv", index=False, encoding="utf-8")