# Parsing (Cleaning)

In [23]:
import pandas as pd
import json

In [24]:
INPUT = './data/history.json'
OUTPUT = './data/cleanHistory.csv'

In [25]:
with open(INPUT, 'r', encoding='utf-8') as f:
    data = json.load(f)
len(data)

7600

In [26]:
parsedData = []

In [27]:
for history in data:
    if 'titleUrl' not in history:
        continue
    row = {
        'title': (history.get('title', 'Unknown')).replace('Watched ', "", 1),
        'date': history.get('time'),
        'video_url': history.get('titleUrl'),
        'channel': 'Unknown',
    }

    if 'subtitles' in history and isinstance(history['subtitles'], list):
        if len(history['subtitles'])>0:
            row['channel'] = history['subtitles'][0].get("name", 'Unknown')
    parsedData.append(row)

len(parsedData)

7599

In [28]:
df = pd.DataFrame(parsedData)
df.head(10)

Unnamed: 0,title,date,video_url,channel
0,Ara Ke Othlali Mein - Slowed And Reverb | Pawa...,2026-01-29T17:22:46.857Z,https://www.youtube.com/watch?v=MHQ7YSO28ZY,Lofi Roxx
1,"Sajde - (16D Audio ""Not 8D"") | Faheem Abdullah...",2026-01-29T17:20:19.647Z,https://www.youtube.com/watch?v=NY4PJAuqCMY,ROYAL JAAT ♪
2,"Saiyaara Title Song (16D Audio ""Not 8D"") | Tan...",2026-01-29T17:14:05.588Z,https://www.youtube.com/watch?v=nNcrLdmuTg8,ROYAL JAAT ♪
3,"Barbaad (16D Audio ""Not 8D"") | Saiyaara | Ahaa...",2026-01-29T17:08:03.933Z,https://www.youtube.com/watch?v=OK2cQg7xbQc,ROYAL JAAT ♪
4,"GURU RANDHAWA - DOPAMINE (16D Audio ""Not 8D"") ...",2026-01-29T17:05:23.447Z,https://www.youtube.com/watch?v=TRfQMVuMzHs,ROYAL JAAT ♪
5,"Lutt Le Gaya (16D Audio ""Not 8D"") Dhurandhar |...",2026-01-29T17:01:11.335Z,https://www.youtube.com/watch?v=Rp6DOypeBtk,ROYAL JAAT ♪
6,TWITTER SENTIMENT ANALYSIS (NLP) | Machine Lea...,2026-01-29T16:43:44.061Z,https://www.youtube.com/watch?v=4YGkfAd2iXM,GeeksforGeeks
7,"Unstructured Data, Natural Language Processing...",2026-01-29T15:39:34.332Z,https://www.youtube.com/watch?v=fxAvc_1md44,Dr. Ahmad Bukhari
8,Introduction to R (Continued),2026-01-29T14:28:06.918Z,https://www.youtube.com/watch?v=Rftn5XkYAhk,NPTEL-NOC IITM
9,Low-Level Design Interview: Design Amazon Lock...,2026-01-29T14:09:23.342Z,https://www.youtube.com/watch?v=s6nGkoGJhXk,Hello Interview - SWE Interview Preparation


In [29]:
df['date'] = pd.to_datetime(df['date'], format="mixed")
df.head(10)

Unnamed: 0,title,date,video_url,channel
0,Ara Ke Othlali Mein - Slowed And Reverb | Pawa...,2026-01-29 17:22:46.857000+00:00,https://www.youtube.com/watch?v=MHQ7YSO28ZY,Lofi Roxx
1,"Sajde - (16D Audio ""Not 8D"") | Faheem Abdullah...",2026-01-29 17:20:19.647000+00:00,https://www.youtube.com/watch?v=NY4PJAuqCMY,ROYAL JAAT ♪
2,"Saiyaara Title Song (16D Audio ""Not 8D"") | Tan...",2026-01-29 17:14:05.588000+00:00,https://www.youtube.com/watch?v=nNcrLdmuTg8,ROYAL JAAT ♪
3,"Barbaad (16D Audio ""Not 8D"") | Saiyaara | Ahaa...",2026-01-29 17:08:03.933000+00:00,https://www.youtube.com/watch?v=OK2cQg7xbQc,ROYAL JAAT ♪
4,"GURU RANDHAWA - DOPAMINE (16D Audio ""Not 8D"") ...",2026-01-29 17:05:23.447000+00:00,https://www.youtube.com/watch?v=TRfQMVuMzHs,ROYAL JAAT ♪
5,"Lutt Le Gaya (16D Audio ""Not 8D"") Dhurandhar |...",2026-01-29 17:01:11.335000+00:00,https://www.youtube.com/watch?v=Rp6DOypeBtk,ROYAL JAAT ♪
6,TWITTER SENTIMENT ANALYSIS (NLP) | Machine Lea...,2026-01-29 16:43:44.061000+00:00,https://www.youtube.com/watch?v=4YGkfAd2iXM,GeeksforGeeks
7,"Unstructured Data, Natural Language Processing...",2026-01-29 15:39:34.332000+00:00,https://www.youtube.com/watch?v=fxAvc_1md44,Dr. Ahmad Bukhari
8,Introduction to R (Continued),2026-01-29 14:28:06.918000+00:00,https://www.youtube.com/watch?v=Rftn5XkYAhk,NPTEL-NOC IITM
9,Low-Level Design Interview: Design Amazon Lock...,2026-01-29 14:09:23.342000+00:00,https://www.youtube.com/watch?v=s6nGkoGJhXk,Hello Interview - SWE Interview Preparation


In [30]:
df.describe()

Unnamed: 0,title,date,video_url,channel
count,7599,7599,7599,7599
unique,6093,,6205,3486
top,California Love,,https://music.youtube.com/watch?v=eD8lmkBCma4,Unknown
freq,22,,22,397
mean,,2025-08-04 22:36:15.977662+00:00,,
min,,2025-02-01 17:08:23.913000+00:00,,
25%,,2025-04-29 17:25:40.273000+00:00,,
50%,,2025-07-19 17:18:26.651000+00:00,,
75%,,2025-12-05 10:55:36.368500+00:00,,
max,,2026-01-29 17:22:46.857000+00:00,,


In [31]:
df = df[df['title']!='Unknown']

In [32]:
f"Total videos watched: {len(df)}"

'Total videos watched: 7599'

In [33]:
f"Date Range: {df['date'].min()} to {df['date'].max()}"

'Date Range: 2025-02-01 17:08:23.913000+00:00 to 2026-01-29 17:22:46.857000+00:00'

In [34]:
f"Unique channels: {df['channel'].nunique()}"

'Unique channels: 3486'

In [35]:
df.to_csv(OUTPUT, index=False)