# nyt_youtube_api_project  - Python

## By: Gabriela Sanchez

In [1]:
# Importing essential libraries
from googleapiclient.discovery import build
import pandas as pd
from IPython.display import JSON
from datetime import datetime
import isodate

#Data viz packages
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

#NLP
from wordcloud import WordCloud
from nltk.corpus import stopwords

In [3]:
# Get credentials and create an API client
import os

# Read the API key from the YOUTUBE_API_KEY environment variable so the secret
# is not stored in the notebook itself. Falls back to an empty string (the
# previous placeholder value) when the variable is not set.
api_key = os.environ.get('YOUTUBE_API_KEY', '')

# Channel(s) whose statistics are requested below
channel_ids = ['UCqnbDFdCpuN8CMEg0VuEBqA']

api_service_name = "youtube"
api_version = "v3"

youtube = build(
    api_service_name, api_version, developerKey=api_key)

In [None]:
# Function that returns channel-level statistics as a DataFrame

def get_channel_stats(youtube, channel_ids):
    """Fetch summary statistics for one or more YouTube channels.

    Args:
        youtube: API client object exposing ``channels().list(...).execute()``.
        channel_ids: Sequence of channel ID strings; they are sent in a single
            request as one comma-separated ``id`` parameter.

    Returns:
        pandas.DataFrame with one row per channel in the response and the
        columns ``Channel_Name``, ``Subscribers``, ``Views``, ``Total_Videos``
        and ``Playlist_Id``. Values are stored exactly as found in the
        response. When the response contains no channels the frame is empty
        but still carries these columns.
    """
    columns = ['Channel_Name', 'Subscribers', 'Views', 'Total_Videos', 'Playlist_Id']

    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel_ids)
    )
    response = request.execute()

    # A response without an 'items' key is treated as "no channels found"
    # instead of raising KeyError.
    all_data = [
        {
            'Channel_Name': item['snippet']['title'],
            'Subscribers': item['statistics']['subscriberCount'],
            'Views': item['statistics']['viewCount'],
            'Total_Videos': item['statistics']['videoCount'],
            'Playlist_Id': item['contentDetails']['relatedPlaylists']['uploads'],
        }
        for item in response.get('items', [])
    ]

    # Passing columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(all_data, columns=columns)

In [None]:
# Request the statistics for every channel listed in channel_ids
channel_stats = get_channel_stats(youtube, channel_ids)

In [None]:
# channel_stats holds the DataFrame returned for the requested channel IDs;
# evaluating it as the last expression of the cell displays it

channel_stats

In [None]:
#get_video_ids retrieves all video IDs from a specified YouTube playlist


# Playlist whose videos are analysed below.
# NOTE(review): this looks like the uploads playlist of the channel in
# channel_ids (same ID with the 'UC' prefix replaced by 'UU'), i.e. the value
# get_channel_stats returns in its Playlist_Id column - confirm.
playlist_id = 'UUqnbDFdCpuN8CMEg0VuEBqA'


def get_video_ids(youtube, playlist_id):
    """Collect the ID of every video in a playlist, following pagination.

    Args:
        youtube: API client object exposing
            ``playlistItems().list(...).execute()``.
        playlist_id: ID string of the playlist to read.

    Returns:
        list of video ID strings, in the order the pages return them.
    """
    video_ids = []
    next_page_token = None

    while True:
        # Only contentDetails is read below, so that is the only part
        # requested; 50 items are requested per page.
        params = {
            'part': "contentDetails",
            'playlistId': playlist_id,
            'maxResults': 50,
        }
        # The first request carries no page token.
        if next_page_token is not None:
            params['pageToken'] = next_page_token

        response = youtube.playlistItems().list(**params).execute()

        # A page without an 'items' key contributes no IDs instead of
        # raising KeyError.
        video_ids.extend(
            item['contentDetails']['videoId']
            for item in response.get('items', [])
        )

        # No nextPageToken in the response means this was the last page.
        next_page_token = response.get('nextPageToken')
        if next_page_token is None:
            break

    return video_ids

In [None]:
# Collect the IDs of all videos in the playlist defined above
video_ids = get_video_ids(youtube, playlist_id)

In [None]:
# Retrieve YouTube statistics for each video ID


def get_video_details(youtube, video_ids):
    """Fetch selected snippet, statistics and content details for videos.

    The IDs are requested in batches of 50 per ``videos().list`` call.

    Args:
        youtube: API client object exposing ``videos().list(...).execute()``.
        video_ids: Sequence of video ID strings.

    Returns:
        pandas.DataFrame with one row per video in the responses. Columns are
        ``video_id`` followed by the fields listed in ``stats_to_keep``; a
        field that is absent from a video is stored as ``None``. The columns
        are present even when no videos are returned.
    """
    # Fields copied from each section of a video resource. Defined once,
    # outside the loops, because it never changes.
    stats_to_keep = {
        'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
        'statistics': ['viewCount', 'likeCount', 'favoriteCount', 'commentCount'],
        'contentDetails': ['duration', 'definition', 'caption']
    }
    columns = ['video_id'] + [
        field for fields in stats_to_keep.values() for field in fields
    ]
    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute()

        # A response without an 'items' key contributes no rows.
        for video in response.get('items', []):
            video_info = {'video_id': video['id']}

            for section, fields in stats_to_keep.items():
                section_data = video.get(section, {})
                for field in fields:
                    # Missing sections or fields become None.
                    video_info[field] = section_data.get(field)

            all_video_info.append(video_info)

    # Explicit columns keep the schema stable when there are no rows.
    return pd.DataFrame(all_video_info, columns=columns)


In [None]:
# Build the per-video DataFrame (one row per video ID) and display it
video_df = get_video_details(youtube, video_ids) 
video_df

## Data Pre-processing

In [None]:
# Check, column by column, whether any value is null
video_df.isnull().any()

In [None]:
# Inspect the column dtypes before converting them in the next cell
video_df.dtypes

In [None]:
#transforming columns
numeric_cols = ['viewCount','likeCount','favoriteCount','commentCount']
# Convert each count column to a numeric dtype; values that cannot be parsed
# become NaN. The conversion runs column by column (apply's default axis), so
# every column gets its own dtype and no Python-level loop over rows is needed.
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors='coerce')
video_df['publishedAt'] = pd.to_datetime(video_df['publishedAt'], format="%Y-%m-%dT%H:%M:%SZ")
# Keep only videos published in 2024. .copy() makes the result an independent
# DataFrame, so the column assignments in later cells do not trigger pandas'
# SettingWithCopyWarning.
video_df = video_df[video_df['publishedAt'].dt.year == 2024].copy()

In [None]:
#Converting duration to seconds

# Parse the duration strings with isodate, then store the length as a plain
# number of seconds. astype('timedelta64[s]') produces float seconds only on
# pandas < 2; on pandas >= 2 it keeps a timedelta dtype, so total_seconds()
# is used to get a numeric column on every version.
video_df['durationSecs'] = pd.to_timedelta(
    video_df['duration'].apply(isodate.parse_duration)
).dt.total_seconds()

In [None]:
# Verify columns: show the converted durationSecs next to the original duration
video_df[['durationSecs','duration']]

In [None]:
# Add tag count: number of tags per video, 0 when the tags value is None
video_df['tagCount'] = video_df['tags'].map(
    lambda tags: len(tags) if tags is not None else 0
)
video_df

## EDA

### Best performing videos

In [None]:
# Bar chart of the nine videos with the highest view counts
top_videos = video_df.sort_values('viewCount', ascending=False).head(9)
ax = sns.barplot(data=top_videos, x='title', y='viewCount')

# Rotate the title labels so they stay readable
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

# Show the y axis in millions of views
millions_formatter = ticker.FuncFormatter(
    lambda value, pos: '{:,.1f}M'.format(value / 1e6)
)
ax.yaxis.set_major_formatter(millions_formatter)

ax.set_title("Top 9 Most Viewed New York Times YouTube Videos in 2024", fontsize=16)

### Worst performing videos

In [None]:
# Bar chart of the nine videos with the lowest view counts
bottom_videos = video_df.sort_values('viewCount', ascending=True).head(9)
ax = sns.barplot(data=bottom_videos, x='title', y='viewCount')

# Rotate the title labels so they stay readable
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

# Show the y axis in thousands of views
thousands_formatter = ticker.FuncFormatter(
    lambda value, pos: '{:,.0f}K'.format(value / 1000)
)
ax.yaxis.set_major_formatter(thousands_formatter)

ax.set_title("9 Least Viewed New York Times YouTube Videos in 2024", fontsize=16)

## Views vs. likes and comments

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# One entry per panel, left to right:
# (x column, panel title, x-axis label, y-axis label)
panels = [
    ('commentCount', "Views vs. Comments", "Comment Count", "View Count (Millions)"),
    ('likeCount', "Views vs. Likes", "Like Count", ""),
]

for panel_ax, (x_column, panel_title, x_label, y_label) in zip(ax, panels):
    sns.scatterplot(data=video_df, x=x_column, y='viewCount', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Show the y axis in millions of views
    panel_ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _: '{:.1f}M'.format(value / 1e6))
    )
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)

# Adjust layout
plt.tight_layout()
plt.show()

## Video Duration

In [None]:
# Histogram of the durationSecs column, split into 30 bins

sns.histplot(data = video_df, x = 'durationSecs', bins=30)

## Word Cloud

In [None]:
import re

import matplotlib.pyplot as plt
import nltk
from wordcloud import WordCloud

# Download the stopwords resource used by stopwords.words() below
nltk.download('stopwords')

from nltk.corpus import stopwords

# Create a set of English stopwords
stop_words = set(stopwords.words('english'))

# Strip the trailing section label from a title and drop "U.S."
def clean_title(title):
    """Return *title* without its trailing suffix and without "U.S.".

    The title is cut at the first ``|`` or at the first dash (hyphen, en dash
    or em dash) that is preceded by whitespace and followed by whitespace or
    the end of the string, whichever comes first. Dashes inside a word (for
    example "Spider-Man") are kept, so hyphenated words are no longer
    truncated. Every occurrence of "U.S." is then removed.

    Args:
        title: Title string to clean.

    Returns:
        The cleaned title string. Whitespace left around a removed "U.S." is
        not collapsed.
    """
    without_suffix = re.sub(r'\s*\|.*$|\s+[–—-](?:\s.*)?$', '', title)
    return re.sub(r'U\.S\.', '', without_suffix)

video_df['cleaned_title'] = video_df['title'].apply(clean_title)


# Split each cleaned title on whitespace and keep the words whose lower-case
# form is not in stop_words
video_df['title_no_stopwords'] = video_df['cleaned_title'].apply(
    lambda text: [word for word in str(text).split() if word.lower() not in stop_words]
)

# Flatten the per-title word lists, then join the words with single spaces
all_words = [
    word
    for title_words in video_df['title_no_stopwords'].tolist()
    for word in title_words
]
all_words_str = ' '.join(all_words)

def plot_cloud(wordcloud):
    """Display *wordcloud* as an image in a new 30x20 figure with the axes hidden.

    Args:
        wordcloud: Object that ``plt.imshow`` can render; the caller in this
            cell passes the generated WordCloud.
    """
    plt.figure(figsize=(30, 20))
    plt.imshow(wordcloud, interpolation='bilinear')  # bilinear interpolation for smoother display
    plt.axis("off")
    plt.show()  # render the figure

# Configure the word cloud, then build it from the combined title words
cloud_builder = WordCloud(
    width=2000,
    height=1000,
    random_state=1,
    background_color='white',
    colormap='gist_earth',
    collocations=False,
)
wordcloud = cloud_builder.generate(all_words_str)

# Display the result
plot_cloud(wordcloud)


In [None]:
import re

import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

nltk.download('stopwords')

# Words from NYT sections that are dropped in addition to the English stopwords
custom_stopwords = {'nyt', 'opinion', 'scene', 'anatomy', 'watch', 'op', 'docs'}
stop_words = set(stopwords.words('english')) | custom_stopwords


def clean_title(title):
    """Return *title* without its trailing suffix and without "U.S.".

    The title is cut at the first ``|`` or at the first dash (hyphen, en dash
    or em dash) that is preceded by whitespace and followed by whitespace or
    the end of the string, whichever comes first. Dashes inside a word (for
    example "Spider-Man") are kept, so hyphenated words are no longer
    truncated. Every occurrence of "U.S." is then removed.

    Args:
        title: Title string to clean.

    Returns:
        The cleaned title string. Whitespace left around a removed "U.S." is
        not collapsed.
    """
    without_suffix = re.sub(r'\s*\|.*$|\s+[–—-](?:\s.*)?$', '', title)
    return re.sub(r'U\.S\.', '', without_suffix)

video_df['cleaned_title'] = video_df['title'].apply(clean_title)


# Re-tokenise the titles against the extended stopword set to get trending
# topics: split on whitespace and keep words whose lower-case form is not in
# stop_words
video_df['title_no_stopwords'] = video_df['cleaned_title'].apply(
    lambda text: [word for word in str(text).split() if word.lower() not in stop_words]
)


# Flatten the per-title word lists, then join the words with single spaces
all_words = [
    word
    for title_words in video_df['title_no_stopwords'].tolist()
    for word in title_words
]
all_words_str = ' '.join(all_words)

def plot_cloud(wordcloud):
    """Display *wordcloud* as an image in a new 30x20 figure with the axes hidden.

    Args:
        wordcloud: Object that ``plt.imshow`` can render; the caller in this
            cell passes the generated WordCloud.
    """
    plt.figure(figsize=(30, 20))
    plt.imshow(wordcloud, interpolation='bilinear')  
    plt.axis("off")
    plt.show() 

# Configure the word cloud, then build it from the combined title words
cloud_builder = WordCloud(
    width=2000,
    height=1000,
    random_state=1,
    background_color='white',
    colormap='gist_earth',
    collocations=False,
)
wordcloud = cloud_builder.generate(all_words_str)

# Display the result
plot_cloud(wordcloud)

## Upload Schedule

In [None]:
#Creating upload schedule chart

video_df['publishedAt'] = pd.to_datetime(video_df['publishedAt'])

# Count the videos published on each weekday, then put the counts in calendar
# order; weekdays with no uploads get a count of 0.
day_counts = video_df['publishedAt'].dt.day_name().value_counts()
weekdays = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
day_df = day_counts.reindex(weekdays, fill_value=0)

# Plot the Series directly, with the weekday index as the x axis. The column
# names produced by reset_index() depend on the pandas version (value_counts
# names its result 'count' since pandas 2.0), so selecting x='index' and
# y='publishedAt' raises KeyError there.
ax = day_df.plot.bar(rot=0, legend=False)
ax.set_xlabel("Day of the Week")
ax.set_ylabel("Number of Videos Published")
ax.set_title("Videos Published by Day of the Week")
plt.show()