In [None]:
!pip install --upgrade google-api-python-client

In [None]:
from googleapiclient.discovery import build
from dateutil import parser
import pandas as pd
from IPython.display import JSON

# Data viz packages
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud
api_service_name = "youtube"
api_version = "v3"

In [None]:
from dotenv import load_dotenv
import os

# Read the API key from the environment (optionally populated from a local
# .env file by load_dotenv) so the secret does not have to be typed into the
# notebook. Falls back to the original placeholder when YOUTUBE_API_KEY is unset.
load_dotenv()
api_key = os.getenv("YOUTUBE_API_KEY", "ADD YOUR KEY")

# Channels to analyse.
channel_ids = ["UCfLuT3JwLx8rvHjHfTymekw",
               # add more
               ]

scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

api_service_name = "youtube"
api_version = "v3"

# Create an API client authenticated with the developer key
youtube = build(
    api_service_name, api_version, developerKey=api_key)

In [None]:
import pandas as pd


def get_channel_stats(youtube, channel_ids):
    """Fetch summary statistics for one or more YouTube channels.

    Parameters
    ----------
    youtube : object
        API client exposing ``channels().list(...).execute()``.
    channel_ids : list of str
        Channel IDs; they are sent as one comma-separated ``id`` filter.

    Returns
    -------
    pandas.DataFrame
        One row per returned channel with the columns ``channelName``,
        ``subscribers``, ``views``, ``totalVideos`` and ``playlistId``.
        Values are passed through unchanged from the response; a statistic
        that is missing from the response is stored as ``None``. The frame
        has these columns even when no channel is returned.
    """
    columns = ['channelName', 'subscribers', 'views', 'totalVideos', 'playlistId']

    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel_ids)
    )
    response = request.execute()

    all_data = []
    # 'items' may be absent from the response (e.g. no channel matched).
    for item in response.get('items', []):
        statistics = item.get('statistics', {})
        all_data.append({
            'channelName': item['snippet']['title'],
            # May be missing, e.g. when a channel hides its subscriber count.
            'subscribers': statistics.get('subscriberCount'),
            'views': statistics.get('viewCount'),
            'totalVideos': statistics.get('videoCount'),
            'playlistId': item['contentDetails']['relatedPlaylists']['uploads'],
        })

    return pd.DataFrame(all_data, columns=columns)

def get_video_ids(youtube, playlist_id):
    """Return the IDs of all videos in a playlist, following pagination.

    Parameters
    ----------
    youtube : object
        API client exposing ``playlistItems().list(...).execute()``.
    playlist_id : str
        ID of the playlist to enumerate.

    Returns
    -------
    list of str
        Video IDs in the order the pages return them; empty when the
        playlist has no items.
    """
    video_ids = []
    next_page_token = None

    while True:
        # Only contentDetails is read below, so that is the only part requested
        # for every page (first page included).
        params = {
            'part': 'contentDetails',
            'playlistId': playlist_id,
            'maxResults': 50,
        }
        if next_page_token is not None:
            params['pageToken'] = next_page_token

        response = youtube.playlistItems().list(**params).execute()

        video_ids.extend(
            item['contentDetails']['videoId']
            for item in response.get('items', [])
        )

        # A response without 'nextPageToken' is the last page.
        next_page_token = response.get('nextPageToken')
        if next_page_token is None:
            break

    return video_ids
    
    
def get_video_details(youtube, video_ids):
    """Fetch snippet, statistics and content details for a list of videos.

    Parameters
    ----------
    youtube : object
        API client exposing ``videos().list(...).execute()``.
    video_ids : list of str
        Video IDs; they are requested in batches of 50.

    Returns
    -------
    pandas.DataFrame
        One row per returned video with the columns ``video_id``,
        ``channelTitle``, ``title``, ``description``, ``tags``,
        ``publishedAt``, ``viewCount``, ``likeCount``, ``favouriteCount``,
        ``commentCount``, ``duration``, ``definition`` and ``caption``.
        Values are passed through unchanged; a field missing from the
        response is stored as ``None``.
    """
    # Fields to copy from each part of the video resource.
    stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                     'statistics': ['viewCount', 'likeCount', 'favoriteCount', 'commentCount'],
                     'contentDetails': ['duration', 'definition', 'caption']
                    }
    # The API spells the statistic 'favoriteCount'. The output column keeps the
    # 'favouriteCount' name this function has always produced so that code
    # selecting that column keeps working.
    column_names = {'favoriteCount': 'favouriteCount'}

    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute()

        for video in response.get('items', []):
            video_info = {'video_id': video['id']}

            for part, fields in stats_to_keep.items():
                # A whole part can be absent; treat it as having no fields.
                part_data = video.get(part) or {}
                for field in fields:
                    video_info[column_names.get(field, field)] = part_data.get(field)

            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)
# Fetch channel-level statistics for the configured channel IDs and display
# the resulting DataFrame as the cell output.
channel_stats = get_channel_stats(youtube, channel_ids)
channel_stats


In [None]:
# Playlist whose videos are analysed in the following cells.
# NOTE(review): this value equals the channel ID configured earlier with its
# "UC" prefix replaced by "UU" — presumably that channel's uploads playlist
# (the value get_channel_stats stores under 'playlistId'). Confirm, and keep
# it in sync if channel_ids changes.
playlist_id = "UUfLuT3JwLx8rvHjHfTymekw"

In [None]:
# Get video IDs
video_ids = get_video_ids(youtube, playlist_id)
len(video_ids)

In [None]:
# Get video details
video_df = get_video_details(youtube, video_ids)
video_df

In [None]:
# Check for NULL values
video_df.isnull().any()

In [None]:
# Check data types
video_df.dtypes

In [None]:
# Convert count columns to numeric, one column at a time (pd.to_numeric works
# on a whole Series); values that cannot be parsed or are missing become NaN.
numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors = 'coerce')
# Publish day in the week.
# Only strings are parsed, so re-running this cell after the column already
# holds datetime objects leaves it unchanged instead of raising a TypeError.
video_df['publishedAt'] = video_df['publishedAt'].apply(lambda x: parser.parse(x) if isinstance(x, str) else x)
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A"))
# convert duration to seconds
# import isodate

# video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
# video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')
# video_df[['durationSecs', 'duration']] 

# Add tag count (videos without tags have None in 'tags' and count as 0)
video_df['tagCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))
video_df


In [None]:
# Bar chart of the nine videos with the highest view counts; the y axis is
# labelled in thousands ("K").
most_viewed = video_df.sort_values('viewCount', ascending=False)[0:9]
ax = sns.barplot(data = most_viewed, x = 'title', y = 'viewCount')
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
thousands_formatter = ticker.FuncFormatter(lambda x, pos:'{:,.0f}'.format(x/1000) + 'K')
ax.yaxis.set_major_formatter(thousands_formatter)

In [None]:
# Bar chart of the nine videos with the lowest view counts; the y axis is
# labelled in thousands ("K").
least_viewed = video_df.sort_values('viewCount', ascending=True)[0:9]
ax = sns.barplot(data = least_viewed, x = 'title', y = 'viewCount')
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
thousands_formatter = ticker.FuncFormatter(lambda x, pos:'{:,.0f}'.format(x/1000) + 'K')
ax.yaxis.set_major_formatter(thousands_formatter)

In [None]:
# Distribution of view counts per channel.
sns.violinplot(data = video_df, x = 'channelTitle', y = 'viewCount')

In [None]:
# Side-by-side scatter plots of views against comment count (left panel)
# and like count (right panel).
fig, ax = plt.subplots(1,2)
for panel, x_col in zip(ax, ['commentCount', 'likeCount']):
    sns.scatterplot(data = video_df, x = x_col, y = 'viewCount', ax = panel)

In [None]:
# Remove English stop words from every title. The NLTK stop-word list is
# lowercase, so each word is lowercased for the membership test only; the
# words that are kept retain their original casing.
stop_words = set(stopwords.words('english'))
video_df['title_no_stopwords'] = video_df['title'].apply(
    lambda x: [item for item in str(x).split() if item.lower() not in stop_words])

# Flatten the per-title word lists into one space-separated string.
all_words = [word for words in video_df['title_no_stopwords'] for word in words]
all_words_str = ' '.join(all_words)

def plot_cloud(wordcloud):
    """Draw ``wordcloud`` as an image on a new 30x20 figure with the axes hidden.

    ``wordcloud`` is handed straight to ``plt.imshow``; in this notebook it is
    a generated ``WordCloud`` instance.
    """
    plt.figure(figsize=(30, 20))
    plt.imshow(wordcloud) 
    plt.axis("off")

# Configure the word cloud, populate it from the combined title text
# (generate() fills the instance in place), then render it.
wordcloud = WordCloud(
    width = 2000,
    height = 1000,
    random_state=1,
    background_color='black',
    colormap='viridis',
    collocations=False,
)
wordcloud.generate(all_words_str)
plot_cloud(wordcloud)

In [None]:
# Count uploads per weekday and plot them in calendar order; weekdays with no
# uploads are shown with a count of 0.
weekdays = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_counts = video_df['pushblishDayName'].value_counts().reindex(weekdays, fill_value=0)
# Name the values and the index explicitly: the default names produced by
# value_counts() differ between pandas versions, so the column names used for
# plotting below must not depend on them.
day_df = day_counts.rename('pushblishDayName').rename_axis('index').to_frame()
ax = day_df.reset_index().plot.bar(x='index', y='pushblishDayName', rot=0)

## Comment Analysis


In [None]:
# 8F801HMrdp0
import requests
video_id = "8F801HMrdp0"

In [None]:

# Fetch the snippet metadata of the selected video from the videos endpoint.
# Query parameters are passed via `params` so that requests URL-encodes them,
# and a timeout keeps the cell from hanging indefinitely on a stalled connection.
video_info_url = "https://www.googleapis.com/youtube/v3/videos"
video_info_response = requests.get(
    video_info_url,
    params={"part": "snippet", "id": video_id, "key": api_key},
    timeout=30,
)
video_info_data = video_info_response.json()

In [None]:
video_info_data

In [None]:
# Fetch the top-level comment threads of the selected video.
# The video is selected with the `videoId` filter; the `id` filter expects
# comment-thread IDs, so passing a video ID there does not return the video's
# comments. Parameters go through `params` so requests URL-encodes them, and a
# timeout keeps the cell from hanging indefinitely on a stalled connection.
comment_url = "https://www.googleapis.com/youtube/v3/commentThreads"
comment_response = requests.get(
    comment_url,
    params={"part": "snippet", "videoId": video_id, "key": api_key},
    timeout=30,
)
comment_data = comment_response.json()

In [None]:
comment_data

In [None]:
# Collect the original text of every top-level comment in the response.
top_level_snippets = (item["snippet"]["topLevelComment"]["snippet"] for item in comment_data["items"])
comments = [snippet["textOriginal"] for snippet in top_level_snippets]

In [None]:
print(comments)

In [None]:
!pip install textblob

In [None]:
from textblob import TextBlob
def get_comment_sentiment(comment):
    """Label a comment as positive, neutral or negative.

    The label is derived from the polarity TextBlob computes for the text:
    greater than zero is positive, exactly zero is neutral, and anything
    below zero is negative.

    Parameters
    ----------
    comment : str
        Text of the comment.

    Returns
    -------
    str
        One of ``"  + positive + "``, ``" neutral :| "`` or
        ``"  - negative - "``.
    """
    polarity = TextBlob(comment).sentiment.polarity
    if polarity > 0:
        return "  + positive + "
    if polarity == 0:
        return " neutral :| "
    return "  - negative - "

In [None]:
for comment in comments:
    print(f"{comment}: {get_comment_sentiment(comment)}")

In [None]:
# Pair every comment with its sentiment label, echoing the comment and then
# its label as each one is processed.
comment_text, comment_label = [], []
for comment in comments:
    comment_text.append(comment)
    print(comment)
    label = get_comment_sentiment(comment)
    print(label)
    comment_label.append(label)

In [None]:
pd.DataFrame({'comments': comment_text, 'sentiment': comment_label})

In [None]:
# Assemble the labelled comments into a DataFrame and write it to
# youtube.csv in the current working directory.
df = pd.DataFrame(dict(comments=comment_text, sentiment=comment_label))
df.to_csv("youtube.csv")