# Importing Libraries

In [1]:
from googleapiclient.discovery import build
import pandas as pd
import pickle
import os
from dotenv import load_dotenv

# Establishing API Connection

In [3]:
# Load variables from a .env file (if one is found) into the process environment
load_dotenv()

# Read the YouTube Data API key from the environment rather than hardcoding it
api_key = os.getenv('API_KEY')
if not api_key:
    # Fail fast with a clear message; otherwise the missing key would only
    # surface later as a harder-to-diagnose error from the API client.
    raise RuntimeError("API_KEY is not set; define it in the environment or in a .env file")

# Client object for version 3 of the YouTube Data API, authenticated with the developer key
youtube = build('youtube', 'v3', developerKey=api_key)

# Importing Chosen Playlist IDs

In [6]:
def chosen_playlist_ids(path='chosen_playlist_ids.pkl'):
    """Load the pickled playlist IDs from disk.

    Parameters
    ----------
    path : str or os.PathLike, optional
        Location of the pickle file. Defaults to 'chosen_playlist_ids.pkl'
        (resolved relative to the current working directory).

    Returns
    -------
    object
        Whatever object was stored in the pickle file
        (presumably a collection of playlist ID strings -- TODO confirm).

    Raises
    ------
    FileNotFoundError
        If `path` does not exist.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file, so only point this at pickle files you produced yourself.
    with open(path, 'rb') as file:
        return pickle.load(file)

In [7]:
# Load the pickled playlist IDs (read from chosen_playlist_ids.pkl by default)
playlist_ids = chosen_playlist_ids()

# Grabbing Video IDs

In [9]:
def mass_video_ids(youtube, wanted_playlist_ids):
    """Collect the video ID of every item in each of the given playlists.

    For each playlist, every result page of ``playlistItems().list`` is
    requested in turn by following ``nextPageToken`` until no further page
    is reported.

    Parameters
    ----------
    youtube : object
        YouTube Data API client exposing ``playlistItems().list(...).execute()``.
    wanted_playlist_ids : iterable of str
        Playlist IDs to walk through.

    Returns
    -------
    pandas.DataFrame
        One row per playlist item with the columns ``playlist_id`` and
        ``video_id``. Both columns are present even when nothing was found,
        so downstream column lookups keep working on an empty result.
    """
    columns = ['playlist_id', 'video_id']
    all_data = []

    for playlist_id in wanted_playlist_ids:
        # A page token of None asks for the first page of the playlist
        page = None

        while True:
            response = youtube.playlistItems().list(
                part='contentDetails',
                playlistId=playlist_id,
                maxResults=50,
                pageToken=page).execute()

            # Record every video ID found on the current page
            for item in response.get('items', []):
                all_data.append(dict(playlist_id=playlist_id,
                                     video_id=item['contentDetails']['videoId']))

            # Continue only while the API reports another page
            page = response.get('nextPageToken')
            if not page:
                break

    return pd.DataFrame(all_data, columns=columns)

In [10]:
# Build the playlist_id / video_id table by paging through every chosen playlist
video_ids = mass_video_ids(youtube, playlist_ids)

In [11]:
# Number of (playlist, video) rows collected
len(video_ids)

9322

In [17]:
# Preview the first rows of the playlist/video ID table
video_ids.head()

Unnamed: 0,playlist_id,video_id
0,UUvZnwzmc3m1Eush-Or8Z6DA,3fqTNzXY5tg
1,UUvZnwzmc3m1Eush-Or8Z6DA,bgVu5WVR9SE
2,UUvZnwzmc3m1Eush-Or8Z6DA,4we3smhjAB8
3,UUvZnwzmc3m1Eush-Or8Z6DA,lpF5SSgczeE
4,UUvZnwzmc3m1Eush-Or8Z6DA,cRVM-LTe3fI


# Getting Video Stats

In [12]:
# Plain Python list of the video IDs, in the order they were collected
video_ids_list = video_ids['video_id'].tolist()

In [13]:
def get_video_stats(youtube, list_of_video_ids):
    """Fetch snippet, content details and statistics for the given videos.

    The IDs are sent to ``videos().list`` in batches of 50 per request.
    Every field is read defensively, so a video that lacks a field (or a
    whole part such as ``statistics``) yields ``None`` for that column
    instead of raising ``KeyError``; ``tags`` defaults to an empty list.

    Parameters
    ----------
    youtube : object
        YouTube Data API client exposing ``videos().list(...).execute()``.
    list_of_video_ids : list of str
        Video IDs to look up.

    Returns
    -------
    pandas.DataFrame
        One row per video returned by the API. All columns are present even
        when no video was returned, so downstream column lookups keep
        working on an empty result.
    """
    columns = ['video_id', 'channel_id', 'published_at', 'title', 'description',
               'tags', 'category_id', 'duration', 'caption', 'licensed_content',
               'default_language', 'content_rating', 'view_count', 'like_count',
               'favourite_count', 'comment_count']
    batch_size = 50  # number of video IDs sent per request
    all_data = []

    for start in range(0, len(list_of_video_ids), batch_size):
        batch = list_of_video_ids[start:start + batch_size]

        response = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(batch)).execute()

        for video in response.get('items', []):
            # Fall back to empty dicts so a missing part cannot raise KeyError
            snippet = video.get('snippet', {})
            details = video.get('contentDetails', {})
            stats = video.get('statistics', {})

            all_data.append(dict(
                video_id=video['id'],
                channel_id=snippet.get('channelId'),
                published_at=snippet.get('publishedAt'),
                title=snippet.get('title'),
                description=snippet.get('description'),
                tags=snippet.get('tags', []),  # not every video has tags
                category_id=snippet.get('categoryId'),
                duration=details.get('duration'),
                caption=details.get('caption'),
                licensed_content=details.get('licensedContent'),
                default_language=snippet.get('defaultLanguage'),
                content_rating=details.get('contentRating'),
                view_count=stats.get('viewCount'),
                like_count=stats.get('likeCount'),  # not every video reports likes
                favourite_count=stats.get('favoriteCount'),
                comment_count=stats.get('commentCount')))

    return pd.DataFrame(all_data, columns=columns)

In [14]:
# Fetch metadata and statistics for every collected video ID (requested in batches of 50)
video_stats_df = get_video_stats(youtube, video_ids_list)

In [15]:
# Preview the first rows of the per-video statistics table
video_stats_df.head()

Unnamed: 0,video_id,channel_id,published_at,title,description,tags,category_id,duration,caption,licensed_content,default_language,content_rating,view_count,like_count,favourite_count,comment_count
0,3fqTNzXY5tg,UCvZnwzmc3m1Eush-Or8Z6DA,2023-02-19T14:00:02Z,Using Code and GPT-3 to Learn Faster,Thanks to ProjectPro.io for their support: htt...,[],27,PT18M6S,False,True,,{},6871,184,0,23
1,bgVu5WVR9SE,UCvZnwzmc3m1Eush-Or8Z6DA,2022-11-04T03:32:38Z,Data Analyst MENTORSHIP - Q&A (while I drink ...,⬇️⬇️⬇️Check here prior to asking your question...,[],27,PT29M22S,False,True,,{},3723,184,0,9
2,4we3smhjAB8,UCvZnwzmc3m1Eush-Or8Z6DA,2022-11-01T16:30:09Z,How Data Science ACTUALLY Works,Check out Deepnote for the easiest way to prac...,[],27,PT26M50S,False,True,,{},85152,2647,0,136
3,lpF5SSgczeE,UCvZnwzmc3m1Eush-Or8Z6DA,2022-10-25T14:00:07Z,Does Instagram think you live in an influentia...,Request this and many other datasets @: https:...,[],27,PT1H24M8S,False,True,,{},4470,158,0,12
4,cRVM-LTe3fI,UCvZnwzmc3m1Eush-Or8Z6DA,2022-10-07T03:36:26Z,Data Analyst MENTORSHIP - Q&A (while I drink ...,⬇️⬇️⬇️Check here prior to asking your question...,[],27,PT26M55S,False,True,,{},3078,104,0,4


# Creating Channel Stats Dataframe

In [24]:
def get_channel_stats(youtube, channel_ids):
    """Fetch name, statistics and uploads-playlist ID for the given channels.

    The IDs are sent to ``channels().list`` in batches of 50 per request,
    consistent with ``get_video_stats``, instead of in one request whose
    size grows with the number of channels.

    Parameters
    ----------
    youtube : object
        YouTube Data API client exposing ``channels().list(...).execute()``.
    channel_ids : iterable of str
        Channel IDs to look up.

    Returns
    -------
    list of dict
        One dict per channel returned by the API with the keys
        ``channel_name``, ``channel_id``, ``subscribers``, ``total_views``,
        ``total_videos`` and ``playlist_id``. Statistics that the API does
        not report for a channel are ``None``. An empty list is returned
        when no IDs are given or no channel is found.
    """
    channel_ids = list(channel_ids)
    batch_size = 50  # number of channel IDs sent per request
    all_data = []

    for start in range(0, len(channel_ids), batch_size):
        batch = channel_ids[start:start + batch_size]

        response = youtube.channels().list(
            part='snippet,contentDetails,statistics',
            id=','.join(batch)).execute()

        # A response without matching channels may carry no 'items' key at all
        for channel in response.get('items', []):
            stats = channel.get('statistics', {})
            all_data.append(dict(
                channel_name=channel['snippet']['title'],
                channel_id=channel['id'],
                # .get(): a statistic may be absent (e.g. subscriberCount for a
                # channel that hides it -- see the API reference)
                subscribers=stats.get('subscriberCount'),
                total_views=stats.get('viewCount'),
                total_videos=stats.get('videoCount'),
                playlist_id=channel['contentDetails']['relatedPlaylists']['uploads']))

    return all_data

In [20]:
# Distinct channel IDs among the collected videos, in order of first appearance
channel_ids_list = video_stats_df['channel_id'].drop_duplicates().tolist()

In [22]:
# Number of distinct channels the collected videos belong to
len(channel_ids_list)

22

In [27]:
# Turn the list of per-channel dicts into a DataFrame (one row per channel)
channel_stats_df = pd.DataFrame(
    data=get_channel_stats(youtube=youtube, channel_ids=channel_ids_list)
)

In [29]:
# Display the per-channel statistics table
channel_stats_df

Unnamed: 0,channel_name,channel_id,subscribers,total_views,total_videos,playlist_id
0,Keith Galli,UCq6XkhO5SZ66N04IcPbqNcw,209000,14309460,80,UUq6XkhO5SZ66N04IcPbqNcw
1,DataCamp,UC79Gv3mYp6zKiSwYemEik9A,155000,22950468,1532,UU79Gv3mYp6zKiSwYemEik9A
2,Great Learning,UCObs0kLIrDjX2LLSybqNaEA,809000,63094287,1857,UUObs0kLIrDjX2LLSybqNaEA
3,Data Professor,UCV8e2g4IWQqK71bbzGDEI4Q,171000,5502842,325,UUV8e2g4IWQqK71bbzGDEI4Q
4,Shashank Kalanithi,UCvZnwzmc3m1Eush-Or8Z6DA,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA
5,Sundas Khalid,UCteRPiisgIoHtMgqHegpWAQ,185000,9376789,116,UUteRPiisgIoHtMgqHegpWAQ
6,Tech Classes,UCPvDKIsrjA_h3g5yZJwCIHA,30000,1954008,156,UUPvDKIsrjA_h3g5yZJwCIHA
7,Thu Vu data analytics,UCJQJAI7IjbLcpsjWdSzYz0Q,179000,5981075,79,UUJQJAI7IjbLcpsjWdSzYz0Q
8,Krish Naik,UCNU_lfiiWBdtULKOw6X0Dig,851000,86329667,1758,UUNU_lfiiWBdtULKOw6X0Dig
9,Nicholas Renotte,UCHXa4OpASJEwrHrLeIzw7Yg,220000,14187082,303,UUHXa4OpASJEwrHrLeIzw7Yg


# Combining DataFrames

In [30]:
# Left join: keep every video row and attach the statistics of its channel
df = video_stats_df.merge(channel_stats_df, how='left', on='channel_id')

In [31]:
# Display the merged video + channel table
df

Unnamed: 0,video_id,channel_id,published_at,title,description,tags,category_id,duration,caption,licensed_content,...,content_rating,view_count,like_count,favourite_count,comment_count,channel_name,subscribers,total_views,total_videos,playlist_id
0,3fqTNzXY5tg,UCvZnwzmc3m1Eush-Or8Z6DA,2023-02-19T14:00:02Z,Using Code and GPT-3 to Learn Faster,Thanks to ProjectPro.io for their support: htt...,[],27,PT18M6S,false,True,...,{},6871,184,0,23,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA
1,bgVu5WVR9SE,UCvZnwzmc3m1Eush-Or8Z6DA,2022-11-04T03:32:38Z,Data Analyst MENTORSHIP - Q&A (while I drink ...,⬇️⬇️⬇️Check here prior to asking your question...,[],27,PT29M22S,false,True,...,{},3723,184,0,9,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA
2,4we3smhjAB8,UCvZnwzmc3m1Eush-Or8Z6DA,2022-11-01T16:30:09Z,How Data Science ACTUALLY Works,Check out Deepnote for the easiest way to prac...,[],27,PT26M50S,false,True,...,{},85152,2647,0,136,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA
3,lpF5SSgczeE,UCvZnwzmc3m1Eush-Or8Z6DA,2022-10-25T14:00:07Z,Does Instagram think you live in an influentia...,Request this and many other datasets @: https:...,[],27,PT1H24M8S,false,True,...,{},4470,158,0,12,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA
4,cRVM-LTe3fI,UCvZnwzmc3m1Eush-Or8Z6DA,2022-10-07T03:36:26Z,Data Analyst MENTORSHIP - Q&A (while I drink ...,⬇️⬇️⬇️Check here prior to asking your question...,[],27,PT26M55S,false,True,...,{},3078,104,0,4,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9317,4rfr6A3lO-Y,UC7cs8q-gJRlGwj4A8OmCmXg,2020-01-30T14:07:55Z,Data Analyst Resume | Reviewing My Resume! | F...,Data Analyst Resume | Reviewing My Resume! | F...,"[Data Analyst, How to become a data analyst, D...",27,PT7M33S,false,True,...,{},66316,1574,0,64,Alex The Analyst,647000,27519398,270,UU7cs8q-gJRlGwj4A8OmCmXg
9318,OTq2NRy_AGs,UC7cs8q-gJRlGwj4A8OmCmXg,2020-01-25T16:38:39Z,Working at a Big Company Vs Small Company | To...,Working at a Big Company Vs Small Company | To...,"[Data Analyst, How to become a Data Analyst, B...",22,PT5M50S,false,True,...,{},14527,397,0,20,Alex The Analyst,647000,27519398,270,UU7cs8q-gJRlGwj4A8OmCmXg
9319,ya28cb3zFGE,UC7cs8q-gJRlGwj4A8OmCmXg,2020-01-23T03:16:09Z,Data Analyst Salary | 100k with No Experience,Data Analyst Salary | 100k with No Experience ...,"[Data Analyst Salary, Data analyst with no exp...",22,PT5M3S,false,True,...,{},62166,2153,0,227,Alex The Analyst,647000,27519398,270,UU7cs8q-gJRlGwj4A8OmCmXg
9320,Hsi2BG0SOiQ,UC7cs8q-gJRlGwj4A8OmCmXg,2020-01-21T03:52:15Z,Truth About Big Companies | Told by a Fortune ...,Truth About Big Companies // There are a ton o...,"[Working at a big company, Big company data an...",22,PT5M45S,false,True,...,{},8157,305,0,18,Alex The Analyst,647000,27519398,270,UU7cs8q-gJRlGwj4A8OmCmXg


# Saving DataFrame

In [32]:
# Serialize the merged DataFrame to new_raw_data.pkl
with open('new_raw_data.pkl', mode='wb') as out_handle:
    pickle.dump(df, out_handle)