In [15]:
import pandas as pd
import numpy as np
import json
import requests
import os

In [16]:
# load the playlists data from the csv
playlists = pd.read_csv('playlist-clean.csv')
playlists.head()

Unnamed: 0.1,Unnamed: 0,url,tweet_created_at,twitter_user_id,list_id
0,0,https://youtube.com/playlist?list=PL6BT4U83o4J...,2021-06-04 12:58:58 UTC,1.4e+18,PL6BT4U83o4Je3TDUWwkPYazR7FY9CtySw
1,1,https://www.youtube.com/playlist?list=PLDVYaMO...,2018-12-14 05:27:20 UTC,1.07e+18,PLDVYaMOAufIDGuZ2p50Q9EO5f1gNUcUIZ
2,2,https://www.youtube.com/playlist?list=PLEeIGZu...,2022-10-17 03:51:24 UTC,1.58e+18,PLEeIGZuBXtsuRJRtQH2P8xUGi6AR_MN4w
3,3,https://www.youtube.com/playlist?list=PLhOQAKe...,2019-04-01 02:30:05 UTC,1.11e+18,PLhOQAKeRPuGkhS5mSDgdXmFV-h9mRMf24
4,4,https://www.youtube.com/playlist?list=PLEeIGZu...,2022-10-20 04:51:09 UTC,1.58e+18,PLEeIGZuBXtstpI4hORi6_exCErOMyl-9_


In [17]:
# Droping the extra column and renaming the twitter user id and created at column
playlists.drop(['Unnamed: 0'], axis=1, inplace=True)
playlists.rename(columns = {'id':'twitter_user_id', 'created_at': 'tweet_created_at'}, inplace = True)
playlists.head()

Unnamed: 0,url,tweet_created_at,twitter_user_id,list_id
0,https://youtube.com/playlist?list=PL6BT4U83o4J...,2021-06-04 12:58:58 UTC,1.4e+18,PL6BT4U83o4Je3TDUWwkPYazR7FY9CtySw
1,https://www.youtube.com/playlist?list=PLDVYaMO...,2018-12-14 05:27:20 UTC,1.07e+18,PLDVYaMOAufIDGuZ2p50Q9EO5f1gNUcUIZ
2,https://www.youtube.com/playlist?list=PLEeIGZu...,2022-10-17 03:51:24 UTC,1.58e+18,PLEeIGZuBXtsuRJRtQH2P8xUGi6AR_MN4w
3,https://www.youtube.com/playlist?list=PLhOQAKe...,2019-04-01 02:30:05 UTC,1.11e+18,PLhOQAKeRPuGkhS5mSDgdXmFV-h9mRMf24
4,https://www.youtube.com/playlist?list=PLEeIGZu...,2022-10-20 04:51:09 UTC,1.58e+18,PLEeIGZuBXtstpI4hORi6_exCErOMyl-9_


In [18]:
playlists.isna().sum()

url                 0
tweet_created_at    0
twitter_user_id     0
list_id             0
dtype: int64

In [19]:
playlists.isnull().sum()

url                 0
tweet_created_at    0
twitter_user_id     0
list_id             0
dtype: int64

In [20]:
# the data is clean
# urls_with_empty_id = playlists[playlists['list_id'].isnull() | playlists['list_id'].eq('')]['url']
# print(urls_with_empty_id)

In [21]:
# playlists.dropna(subset=['list_id'], inplace=True)
# playlists

In [22]:
playlists['list_id'][0:3]

0    PL6BT4U83o4Je3TDUWwkPYazR7FY9CtySw
1    PLDVYaMOAufIDGuZ2p50Q9EO5f1gNUcUIZ
2    PLEeIGZuBXtsuRJRtQH2P8xUGi6AR_MN4w
Name: list_id, dtype: object

In [23]:
# Creating subsets of data - list_id
ids = playlists['list_id']

p1 = ids[0:1]
p3 = ids[0:3]
p10 = ids[0:10]
p100 = ids[0:100]

d1 = ids[0:1000]
d2 = ids[1000:2000]
d3 = ids[2000:3000]
d4 = ids[3000:]

p = playlists
p1

0    PL6BT4U83o4Je3TDUWwkPYazR7FY9CtySw
Name: list_id, dtype: object

In [66]:
# yt data api_key
with open('api_key.txt', 'r') as f:
    api_key = f.read()

In [50]:
''' Function that takes a list of playlist_ids as input and an api_key and saves all the data in a 
    playlist_data.json file. It uses a try except block for error handling and also checks for a 
    id which is already fetched making less api calls '''

def fetch_playlist_videos_data(playlist_ids, api_key):
    data_file = 'playlist_data.json'
    all_data = {}

    # Load existing data from the JSON file
    if os.path.exists(data_file):
        with open(data_file, 'r') as file:
            all_data = json.load(file)

    for playlist_id in playlist_ids:
        # Check if playlist data already exists in the JSON
        if (playlist_id in all_data) and all_data[playlist_id] != None:
            print(f"Playlist ID {playlist_id} data already fetched. Skipping...")
            continue

        playlist_data = []
        next_page_token = None

        try:
            while True:
                url = f"https://www.googleapis.com/youtube/v3/playlistItems?part=snippet%2CcontentDetails%2Cid&maxResults=50&playlistId={playlist_id}&key={api_key}"
                if next_page_token:
                    url += f"&pageToken={next_page_token}"

                response = requests.get(url)
                data = json.loads(response.text)

                if 'error' in data and data['error']['message'] == 'This playlist type is unviewable.':
                    print(f"Playlist ID {playlist_id} is unviewable. Storing null value...")
                    all_data[playlist_id] = None
                    break

                for item in data['items']:
                    video_data = {}
                    video_id = item['contentDetails']['videoId']
                    video_data['snippet'] = item['snippet']
                    # Add any additional data you want to include for each video

                    playlist_data.append({video_id: video_data})

                next_page_token = data.get('nextPageToken')
                if not next_page_token:
                    break

        except Exception as e:
            print(f"Error fetching data for playlist ID {playlist_id}: {str(e)}")
            all_data[playlist_id] = None
            continue

        all_data[playlist_id] = playlist_data

        # Save the updated data to the JSON file
        with open(data_file, 'w') as file:
            json.dump(all_data, file, indent=4)

    return all_data


In [51]:
# testing for 1st id - the playlist has 6 videos
one = {}
one = fetch_playlist_videos_data(p1, api_key=api_key)
len(one[p1[0]])

Playlist ID PL6BT4U83o4Je3TDUWwkPYazR7FY9CtySw data already fetched. Skipping...


6

In [52]:
# testing for three ids
three = {}
three = fetch_playlist_videos_data(p3, api_key=api_key)

Playlist ID PL6BT4U83o4Je3TDUWwkPYazR7FY9CtySw data already fetched. Skipping...
Playlist ID PLDVYaMOAufIDGuZ2p50Q9EO5f1gNUcUIZ data already fetched. Skipping...
Playlist ID PLEeIGZuBXtsuRJRtQH2P8xUGi6AR_MN4w data already fetched. Skipping...


In [53]:
# testing for ten ids
ten = {}
ten = fetch_playlist_videos_data(p10, api_key=api_key)

Playlist ID PL6BT4U83o4Je3TDUWwkPYazR7FY9CtySw data already fetched. Skipping...
Playlist ID PLDVYaMOAufIDGuZ2p50Q9EO5f1gNUcUIZ data already fetched. Skipping...
Playlist ID PLEeIGZuBXtsuRJRtQH2P8xUGi6AR_MN4w data already fetched. Skipping...
Playlist ID PLhOQAKeRPuGkhS5mSDgdXmFV-h9mRMf24 data already fetched. Skipping...
Playlist ID PLEeIGZuBXtstpI4hORi6_exCErOMyl-9_ data already fetched. Skipping...
Error fetching data for playlist ID OLAK5uy_lNmQS3xZB7drt-VYb_N2R3J0r0hF9LE-A: 'items'
Error fetching data for playlist ID PLuxOyO4_YL7dJNjTiWG3YV0jKSPVQw2mC: 'items'
Playlist ID PLtO3qLgs5z6ZKmCtEZKkZ8prFz_fyxZVH data already fetched. Skipping...
Playlist ID OLAK5uy_k-OqjWpYJMrr50-l_9tOhdA7omj5Ce8yw data already fetched. Skipping...
Playlist ID PLMe77kzo_oJ1tKAG07Bb3qp7HUUSoU7wd data already fetched. Skipping...


In [54]:
# testing for 100 ids - the playlists have
hund = {}
hund = fetch_playlist_videos_data(p100, api_key=api_key)
len(hund)

Playlist ID PL6BT4U83o4Je3TDUWwkPYazR7FY9CtySw data already fetched. Skipping...
Playlist ID PLDVYaMOAufIDGuZ2p50Q9EO5f1gNUcUIZ data already fetched. Skipping...
Playlist ID PLEeIGZuBXtsuRJRtQH2P8xUGi6AR_MN4w data already fetched. Skipping...
Playlist ID PLhOQAKeRPuGkhS5mSDgdXmFV-h9mRMf24 data already fetched. Skipping...
Playlist ID PLEeIGZuBXtstpI4hORi6_exCErOMyl-9_ data already fetched. Skipping...
Error fetching data for playlist ID OLAK5uy_lNmQS3xZB7drt-VYb_N2R3J0r0hF9LE-A: 'items'
Error fetching data for playlist ID PLuxOyO4_YL7dJNjTiWG3YV0jKSPVQw2mC: 'items'
Playlist ID PLtO3qLgs5z6ZKmCtEZKkZ8prFz_fyxZVH data already fetched. Skipping...
Playlist ID OLAK5uy_k-OqjWpYJMrr50-l_9tOhdA7omj5Ce8yw data already fetched. Skipping...
Playlist ID PLMe77kzo_oJ1tKAG07Bb3qp7HUUSoU7wd data already fetched. Skipping...
Playlist ID PL_s0J6qvOS_gtLDqmtBg2MDfWez4L2SmQ data already fetched. Skipping...
Playlist ID PLG7EoBMUD1JwbD5-MQpFRGvadGWtf-4yD data already fetched. Skipping...
Playlist ID PLV4

93

In [40]:
len(hund)

93

In [56]:
# Refetching the Null List because there are playlists which have videos but are not fetched
null_list = [id for id in all_data.keys() if all_data[id] == None]
null_list

['OLAK5uy_lNmQS3xZB7drt-VYb_N2R3J0r0hF9LE-A',
 'PLuxOyO4_YL7dJNjTiWG3YV0jKSPVQw2mC',
 'RDp_ZQwfcBT_A',
 'PL22J3VaeABQC6oShltYc6rMUDAGJ7_G5e',
 'PLHskeNSWVhbXCU-gdcDUZ6mcHGDk4XkPp',
 'PLpcouECH0Gx7ahhvWFymhW4O9eMQ9dL5k',
 'RDY9PFjhbpipc',
 'RD-mAKYa3yu0g',
 'PLuxOyO4_YL7cl0S7k0glLcI9sBRGOzs_p',
 'PLKG2bf2iksPrEv2fgFNmfDk_UgaY92Zm0',
 'PLieBmJXd4EnAdPrdYyRsQ4oV8zqmHrwRm',
 'PL7WuOfC7AFSxIskBPpmrA-Ns4NRjFBqiQ',
 'PLD9D50924B5301E95',
 'RDJb15vFuw6xA',
 'OLAK5uy_lGXW0aofOl-_6fh_hX9ZLQEnKneJs0qoA',
 'PLMe77kzo_oJ1Pz8lcgq_bedGK-C9u-at1',
 'PL-ujIE4tLGbEv2nmpcjaLxOtTb6hWle_C',
 'PLuyu8cy367LdmSAoHibh2oVmfINV94Lp5',
 'PLTplBPPoWdX2dsFq7tcFw9xPNqn8JMp9k',
 'UUqH_xse717F6_W3U5MZOZhQ',
 'PLGxYMgBcfMmqzcW49Lrp0Nlh4IgNMIpr9',
 'OLAK5uy_kK6XKS4P27sPw1-txp0Hdh2L-9S5M7HoM',
 'PLzt9cBFpA-06vBAIFNdPWFH-d4JC9PBwW',
 'RD4DF8LCgiWAk',
 'PLQ2XHcwJDoGPOeDn9sQ8Q910iPwgzc_8X',
 'PL4GxhQfhgMEHjPnKY818MxYuAcXRIj9oC',
 'PLhxITUni9OWes4XXblfTGR0RcbKgoGGKB',
 'PLHKEMR7Ly5naYB7NfVT1haeMStP0m25VG',
 'PL8Ia5e5Q_mteAWS

In [57]:
len(null_list)

32

In [58]:
null_list_data = {}
null_list_data = fetch_playlist_videos_data(null_list, api_key=api_key)

Error fetching data for playlist ID OLAK5uy_lNmQS3xZB7drt-VYb_N2R3J0r0hF9LE-A: 'items'
Error fetching data for playlist ID PLuxOyO4_YL7dJNjTiWG3YV0jKSPVQw2mC: 'items'
Error fetching data for playlist ID RDp_ZQwfcBT_A: 'items'
Error fetching data for playlist ID PL22J3VaeABQC6oShltYc6rMUDAGJ7_G5e: 'items'
Error fetching data for playlist ID PLHskeNSWVhbXCU-gdcDUZ6mcHGDk4XkPp: 'items'
Error fetching data for playlist ID PLpcouECH0Gx7ahhvWFymhW4O9eMQ9dL5k: 'items'
Error fetching data for playlist ID RDY9PFjhbpipc: 'items'
Error fetching data for playlist ID RD-mAKYa3yu0g: 'items'
Error fetching data for playlist ID PLuxOyO4_YL7cl0S7k0glLcI9sBRGOzs_p: 'items'
Error fetching data for playlist ID PLKG2bf2iksPrEv2fgFNmfDk_UgaY92Zm0: 'items'
Error fetching data for playlist ID PLieBmJXd4EnAdPrdYyRsQ4oV8zqmHrwRm: 'items'
Error fetching data for playlist ID PL7WuOfC7AFSxIskBPpmrA-Ns4NRjFBqiQ: 'items'
Error fetching data for playlist ID PLD9D50924B5301E95: 'items'
Error fetching data for playlist

In [60]:
# To fetch the data from teh aove 32 videos we need an authorization token
headers = {
    'Authorization': 'YOUR_AUTH_HEADER',
    'Content-Type': 'application/json'
}
# response = requests.get(url, headers=headers)
# data = json.loads(response.text)

In [62]:
# this script is written to change/replace all the empty lists [] by None

# data_file = 'playlist_data.json'
# all_data = {}

# # Load existing data from the JSON file
# if os.path.exists(data_file):
#     with open(data_file, 'r') as file:
#         all_data = json.load(file)
# print('Read File')

# for id in all_data.keys():
#     if all_data[id] == []:
#         all_data[id] = None
# print('changes made')

# with open(data_file, 'w') as file:
#     json.dump(all_data, file, indent=4)
# print('write done')