In [1]:
import os
from pymongo import MongoClient
import pandas as pd
import re
import json
from googleapiclient.discovery import build
from datetime import timedelta

#### Importing the videos CSV

In [None]:
# Load the most recently updated videos dataset from its CSV export.
videos_df = pd.read_csv(filepath_or_buffer='./csvs/modified_videos_xx.csv')

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes) to
# see which columns have missing values before filtering on `duration`.
videos_df.info()

#### Getting the list of `youtube_id` values for available videos that are missing a duration in the live dataset

In [None]:
# Boolean mask selecting the rows whose `duration` is missing.
missing_duration_mask = videos_df['duration'].isna()

# YouTube ids of those rows, as a plain Python list.
videos_YTids_noDuration_list = list(videos_df.loc[missing_duration_mask, 'youtube_id'])

# Preview the first few ids.
videos_YTids_noDuration_list[:5]

### YouTube API key

In [None]:
# The API key is read from the environment so it never appears in the notebook.
developer_key = os.getenv('DEVELOPER_KEY')

# YouTube Data API v3 client (project-0127 Data Cleaning).
youtube = build('youtube', 'v3', developerKey=developer_key)

#### Fetching duration of videos

In [None]:
# Ids to look up: every video whose duration is missing in the dataset.
videos_youtube_id = videos_YTids_noDuration_list

# Resource part passed as `part=` to the videos().list() request.
part_string = 'contentDetails'

In [None]:
# Compiled patterns that capture the numeric hour, minute and second
# components of a duration string such as 'PT1H2M3S'.
hours_pattern, minutes_pattern, seconds_pattern = map(
    re.compile,
    (r'(\d+)H', r'(\d+)M', r'(\d+)S'),
)

In [None]:
# One numeric component of an ISO 8601 duration, e.g. ('1', 'D') or ('30', 'M').
# Days are included because the API reports videos of a day or longer as
# P#DT#H#M#S; ignoring the D component would under-count them.
_duration_component = re.compile(r'(\d+)([DHMS])')
_unit_to_timedelta_kwarg = {
    'D': 'days',
    'H': 'hours',
    'M': 'minutes',
    'S': 'seconds',
}


def _duration_to_seconds(duration):
    """Convert a duration string such as 'PT1H2M3S' or 'P1DT2H' to seconds.

    Parameters
    ----------
    duration : str
        Duration string made of `<number><unit>` components, where unit is
        one of D, H, M or S. Components that are absent count as zero.

    Returns
    -------
    float
        Total number of seconds (the value of `timedelta.total_seconds()`).
    """
    parts = {}
    for amount, unit in _duration_component.findall(duration):
        # Keep only the first occurrence of each unit.
        parts.setdefault(_unit_to_timedelta_kwarg[unit], int(amount))
    return timedelta(**parts).total_seconds()


# Maps each YouTube id to its duration in seconds, or None when the video was
# not returned by the API, had an empty duration, or the request failed.
videos_duration_dict = {}
for youtube_id in videos_youtube_id:
    videos_seconds = None

    try:
        response = youtube.videos().list(
            part=part_string,
            id=youtube_id,
        ).execute()

        # An empty `items` list means the API returned nothing for this id.
        items = response.get('items') or []
        duration = items[0]['contentDetails']['duration'] if items else ''

        if duration:
            videos_seconds = _duration_to_seconds(duration)

    except Exception as e:
        # Best effort: report the failing id and carry on with the next one.
        print("An exception occurred while fetching", youtube_id)
        print(e)
        videos_seconds = None

    videos_duration_dict[youtube_id] = videos_seconds

In [None]:
# Append the fetched durations to the local dump file as one JSON object.
# The `with` block closes the file even if the write raises, and the trailing
# newline keeps the objects from successive runs on separate lines instead of
# being glued together into unparseable text.
with open('./youtube_api/videos_duration.txt', "a") as duration_file:
    duration_file.write(json.dumps(videos_duration_dict))
    duration_file.write("\n")

### Connection to MongoDB

In [3]:
# Connection settings come from the environment so no credentials live in the
# notebook.
# NOTE(review): if MONGODB_URI is unset, None is passed to MongoClient, which
# presumably falls back to its default host — confirm this is intended.
mongodb_uri = os.environ.get('MONGODB_URI')
database_name = os.environ.get('DATABASE_NAME')

# Fail early with a clear message instead of an opaque error from indexing the
# client with a missing database name.
if not database_name:
    raise RuntimeError("DATABASE_NAME environment variable is not set")

client = MongoClient(mongodb_uri)
database = client[database_name]

#### Connecting to videos collection

In [None]:
# Handle to the most recently updated videos collection.
videos = database.get_collection('modified_videos_xx')

#### Updating video duration

In [None]:
# Write every fetched duration back to the collection, matching documents by
# `youtube_id`. None values are written as-is, so `duration` becomes null for
# ids whose lookup produced no result.
# The loop stops at the first failure; the counter reports how far it got.
processed_ids = 0
try:
    for youtube_id, seconds in videos_duration_dict.items():
        videos.update_many(
            {"youtube_id": youtube_id},
            {"$set": {"duration": seconds}},
        )
        processed_ids += 1
except Exception as e:
    print("Update aborted after", processed_ids, "of", len(videos_duration_dict), "ids")
    # Print the caught exception; the original referenced an undefined name
    # here, which raised NameError and hid the real error.
    print(e)