In [None]:
from bs4 import BeautifulSoup
import pandas as pd

In [33]:
# Read the exported watch-history page and parse it into a BeautifulSoup tree.
# NOTE(review): the path is absolute (looks like a Colab upload location) —
# adjust it when running elsewhere.
with open('/content/watch_history.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

# One dict per history entry; turned into a DataFrame once at the end.
watched_videos = []

# Every history entry is looked up as a <div> carrying exactly this class string.
videos = soup.find_all('div', class_='outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp')

for video in videos:
    video_info = video.find('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')
    if video_info is None:
        # Card without the body cell: there is nothing to extract from it.
        continue

    # Collect the links once: the first is the video, the second (if any) the channel.
    links = video_info.find_all('a')
    if not links:
        # Entries without any link carry no video URL and are skipped.
        continue

    video_title_url = links[0]
    video_url = video_title_url['href']
    video_title = video_title_url.text.strip()

    channel_info = links[1] if len(links) > 1 else None
    channel_name = channel_info.text.strip() if channel_info else None
    channel_url = channel_info['href'] if channel_info else None

    # The last non-empty text fragment of the body cell is taken as the timestamp.
    text_nodes = list(video_info.stripped_strings)
    time_watched = text_nodes[-1].strip() if text_nodes else None

    # The caption cell is searched for the Turkish "Google Reklamlar'dan" marker,
    # which flags the entry as an ad. A missing caption cell counts as "not an ad".
    ad_info = video.find('div', class_='content-cell mdl-cell mdl-cell--12-col mdl-typography--caption')
    is_ad = ad_info is not None and 'Google Reklamlar\'dan' in ad_info.text

    watched_videos.append({
        'videoURL': video_url,
        'title': video_title,
        'channelName': channel_name,
        'channelURL': channel_url,
        'timeWatched': time_watched,
        'isAd': is_ad
    })

# Passing the column list keeps the schema stable even when no entry was parsed,
# so later cells can still reference the columns by name.
watch_history_df = pd.DataFrame(
    watched_videos,
    columns=['videoURL', 'title', 'channelName', 'channelURL', 'timeWatched', 'isAd'],
)

In [41]:
# Rows that have no channel name and are not flagged as ads are treated as
# unavailable videos and removed from the frame.
has_no_channel = watch_history_df['channelName'].isna()
is_not_ad = ~watch_history_df['isAd']
unavailable_videos = watch_history_df.loc[has_no_channel & is_not_ad].index

# Drop those rows in place, then renumber the remaining rows from 0.
watch_history_df.drop(index=unavailable_videos, inplace=True)
watch_history_df.reset_index(inplace=True, drop=True)

print(watch_history_df.shape)

(5980, 6)


In [43]:
# Preview the first five rows (five is the default for head()).
watch_history_df.head()

Unnamed: 0,videoURL,title,channelName,channelURL,timeWatched,isAd
0,https://www.youtube.com/watch?v=VQaKS0qGdwo,"Yeni Ülker Fındık Rüyası, Her Kaşıkta Haz Her ...",,,30 Kas 2024 13:59:27 GMT+03:00,True
1,https://www.youtube.com/watch?v=lItlvq5gNr4,Sabrina Carpenter asked a fan to teach her how...,Celebrity Gossip Central,https://www.youtube.com/channel/UCjqGg6y3H2G_A...,30 Kas 2024 13:48:58 GMT+03:00,False
2,https://www.youtube.com/watch?v=XGTN6lrGIU0,Who wants an XG REMIXX from @jacksonwang & @ci...,XG,https://www.youtube.com/channel/UC12HMtO5MYph9...,30 Kas 2024 13:48:51 GMT+03:00,False
3,https://www.youtube.com/watch?v=2LViqEbuQI0,Güçlü ve zinde olmak için neye ihtiyacım olduğ...,,,30 Kas 2024 13:48:41 GMT+03:00,True
4,https://www.youtube.com/watch?v=qOKjojFolDo,"%75 indirimle başla, yılın EN BÜYÜK indirimiyl...",,,30 Kas 2024 13:19:29 GMT+03:00,True


In [46]:
# Count the rows whose title is exactly the empty string.
untitled_count = watch_history_df['title'].eq('').sum()
print(f"Number of videos without titles: {untitled_count}")

Number of videos without titles: 0


In [52]:
# Turkish three-letter month abbreviations mapped to their English counterparts.
# All twelve months are listed so timestamps from January-May are translated too;
# 'Mar' and 'May' are spelled the same in both languages and map to themselves.
month_abbreviations = {
    'Oca': 'Jan', 'Şub': 'Feb', 'Mar': 'Mar', 'Nis': 'Apr', 'May': 'May',
    'Haz': 'Jun', 'Tem': 'Jul', 'Ağu': 'Aug', 'Eyl': 'Sep', 'Eki': 'Oct',
    'Kas': 'Nov', 'Ara': 'Dec'
}

def map_month_name(timestamp):
    """Replace Turkish month abbreviations in a timestamp string with English ones.

    Args:
        timestamp: Timestamp text such as '30 Kas 2024 13:59:27 GMT+03:00'.

    Returns:
        The same text with every key of ``month_abbreviations`` replaced by its
        English value. Values that are not strings (e.g. ``None``) are returned
        unchanged instead of raising.
    """
    if not isinstance(timestamp, str):
        return timestamp
    # Plain substring replacement: every occurrence of an abbreviation is swapped.
    for turkish_month, english_month in month_abbreviations.items():
        timestamp = timestamp.replace(turkish_month, english_month)
    return timestamp

# Translate the month abbreviations first, then parse the resulting strings
# into timezone-aware datetimes using the fixed export format.
translated_timestamps = watch_history_df['timeWatched'].apply(map_month_name)
watch_history_df['timeWatched'] = pd.to_datetime(
    translated_timestamps,
    format='%d %b %Y %H:%M:%S GMT%z',
)

In [54]:
# Show the dtype of every column (presumably to confirm that `timeWatched`
# is now a datetime column after the conversion above).
watch_history_df.dtypes

Unnamed: 0,0
videoURL,object
title,object
channelName,object
channelURL,object
timeWatched,"datetime64[ns, UTC+03:00]"
isAd,bool
