<h1 align="center">Youtube data construction</h1>
<h2 align=center> What did I watch on youtube ? </h2>
<br>
<center>The purpose of this script is to create several dataframes containing information about my YouTube watch history.</center>
<center>A second notebook will be used to analyse this data.</center>

The goals here are to:
* load and clean the watch-history export (my-youtube-watch-history) downloaded from Google,
* retrieve video details and channel information from the YouTube API.

In [None]:
import os
from datetime import time

import pandas as pd
from googleapiclient.discovery import build

<h2 align="center">Raw Data</h2>

In [None]:
# Load the watch-history export into a dataframe.
# Replace '***' with the folder that contains the file.
raw_data = pd.read_json(path_or_buf='***/watch-history.json')

In [None]:
# Keep an untouched spreadsheet copy of the export in the working directory.
raw_data.to_excel(excel_writer='raw_data.xlsx')

<h2 align="center">Cleaning of data</h2>

<h3> columns cleaning </h3>

In [None]:
# Build the working table (wt) as a new frame so raw_data stays intact,
# leaving out the columns that are not used below.
wt = raw_data.drop(
    columns=['header', 'products', 'activityControls', 'description']
)

In [None]:
# Preview the first rows of the working table.
wt.head(n=3)

<h3> columns organization </h3>

In [None]:
# Title: remove the French "Vous avez regardé " ("You watched ") prefix.
wt['title'] = wt['title'].str.replace('Vous avez regardé ', '')

# Video ID: the part of the watch URL that follows the first '='.
wt['video_ID'] = wt['titleUrl'].str.split('=').str.get(1)

# Timestamp: turn the 'T' separator into a space, cut everything from the
# first '.' onwards (fractional seconds), remove the 'Z' marker, then parse
# the remaining text into a datetime column.
wt['time'] = pd.to_datetime(
    wt['time']
    .str.replace('T', ' ')
    .str.split('.')
    .str.get(0)
    .str.replace('Z', '')
)

<h3>Move the advertisements into a separate dataframe</h3>

* Dataframe tableau_pub: only the Google advertisements
* Dataframe working_table (wt): the watch history with the advertisements removed

In [None]:
# Advertisement rows are selected as those with no 'subtitles' value but with
# a 'details' value. The video_ID column is dropped from this table.
tableau_pub = wt.loc[wt['subtitles'].isna() & wt['details'].notna()]
tableau_pub = tableau_pub.drop(columns=['video_ID'])
tableau_pub.head(n=3)

In [None]:
# Keep only the rows that carry a 'subtitles' value, then drop the 'details'
# column, which is not used from here on.
wt = wt.loc[wt['subtitles'].notna()].drop(columns=['details'])
wt.head()

<h3> Fetch data from the subtitles column </h3>

In [None]:
# Each 'subtitles' value is indexed at position 0 and that entry's "name" and
# "url" fields become the creator and channel_url columns.
author = [entry[0]["name"] for entry in wt['subtitles']]
channel_url = [entry[0]["url"] for entry in wt['subtitles']]
wt['creator'] = author
wt['channel_url'] = channel_url

In [None]:
# Check the two new columns.
wt.head(n=3)

<h3> Index reset </h3>

In [None]:
# Drop the now-extracted 'subtitles' column and renumber the rows 0..n-1,
# since filtering left gaps in the index.
wt = wt.drop(columns=['subtitles']).reset_index(drop=True)
wt.head(n=3)

<h3>Fetch channel_id from the channel_url </h3>

In [None]:
# The channel ID is the channel URL without its fixed prefix.
# regex=False makes the prefix a literal string on every pandas version:
# it contains '.' characters, which would otherwise act as regex wildcards
# when the default is regex=True.
wt['channel_id'] = wt['channel_url'].str.replace(
    'https://www.youtube.com/channel/', '', regex=False
)
wt.head(3)

<h2 align=center> Retrieving channel data from the YouTube API </h2>

* identify the channels from which I watched more than 10 videos
* convert their channel_id values to a list
* pass the list to the YouTube API and retrieve their data

In [None]:
# Number of watched videos per channel, keeping only channels with more
# than 10 views; the cell displays how many channels remain.
count = wt['channel_id'].value_counts()
count = count.loc[count > 10]
len(count)

In [None]:
# Turn the per-channel counts into a two-column table.
# The column names are set explicitly ('channel_id' for the former index,
# 'count' for the values) instead of relying on the names value_counts()
# attaches to its result, which differ between pandas versions. The cells
# below read both columns by these names.
channel_scraping = count.rename_axis('channel_id').reset_index(name='count')

In [None]:
# API key for the YouTube Data API.
# Prefer setting the YOUTUBE_API_KEY environment variable so the key never
# gets saved inside the notebook; if it is not set, the '****' placeholder
# below is used and must be replaced by hand.
api_key = os.environ.get('YOUTUBE_API_KEY', '****')
youtube = build('youtube', 'v3', developerKey=api_key)

In [None]:
# Plain Python list of the channel IDs to look up.
channel_ids = channel_scraping['channel_id'].to_list()

In [None]:
def get_channel_stats(youtube, channel_ids):
    """Fetch basic information and statistics for a list of channels.

    The IDs are sent to ``youtube.channels().list`` in batches of 50 per
    request (presumably the API's per-request limit -- TODO confirm).

    Parameters
    ----------
    youtube : API client object exposing ``channels().list(...).execute()``.
    channel_ids : list of str
        Channel IDs to look up. An empty list produces no request.

    Returns
    -------
    list of dict
        One dict per channel returned by the API, with the keys
        ``Channel_name``, ``channel_id``, ``Channel_date``, ``Subscribers``,
        ``Views``, ``country`` and ``Total_videos``. ``Subscribers``,
        ``Views``, ``country`` and ``Total_videos`` are ``None`` when the
        response does not contain the corresponding field.
    """
    all_data = []

    for start in range(0, len(channel_ids), 50):
        request = youtube.channels().list(
            part="snippet,contentDetails,statistics,id",
            id=','.join(channel_ids[start:start + 50]))
        response = request.execute()

        # A response without an 'items' key is treated as "no channels found"
        # instead of raising KeyError.
        for item in response.get('items', []):
            snippet = item['snippet']
            # Statistics fields are read with .get() so that a channel whose
            # response lacks one (e.g. a subscriber count that is not
            # reported) yields None rather than aborting the whole run.
            statistics = item.get('statistics', {})
            all_data.append(dict(
                Channel_name=snippet['title'],
                channel_id=item['id'],
                Channel_date=snippet['publishedAt'],
                Subscribers=statistics.get('subscriberCount'),
                Views=statistics.get('viewCount'),
                country=snippet.get('country'),
                Total_videos=statistics.get('videoCount')))

    return all_data

In [None]:
# Query the API for the selected channels and tabulate the returned records.
channel_statitics = get_channel_stats(youtube, channel_ids)
channel_data = pd.DataFrame(data=channel_statitics)

In [None]:
# Attach my per-channel view count to the API data (only channels present in
# both tables are kept) and list the most-watched channels first.
data_channel = (
    channel_data
    .merge(channel_scraping, how='inner', on='channel_id')
    .sort_values('count', ascending=False)
)
data_channel

In [None]:
# Numeric columns: convert to numbers; values that cannot be parsed
# become NaN (errors='coerce').
numeric_cols = ['Subscribers','Views','Total_videos']
data_channel = data_channel.assign(
    **{name: pd.to_numeric(data_channel[name], errors='coerce')
       for name in numeric_cols}
)

# Creation date: same text clean-up as for the watch timestamps
# ('T' -> space, cut at the first '.', remove 'Z'), then parse.
data_channel['Channel_date'] = pd.to_datetime(
    data_channel['Channel_date']
    .str.replace('T', ' ')
    .str.split('.')
    .str.get(0)
    .str.replace('Z', '')
)

# Show the resulting column types.
data_channel.dtypes

In [None]:
# Preview the cleaned channel table.
data_channel.head(n=3)

<h2 align = center> Retrieving data for all the videos in my history </h2>

In [None]:
# How many times each video appears in the history; the cell displays the
# number of distinct videos.
count_video = wt['video_ID'].value_counts()
count_video.size

In [None]:
# Turn the per-video counts into a two-column table.
# The column names are set explicitly ('video_ID' for the former index,
# 'count' for the values) instead of relying on the names value_counts()
# attaches to its result, which differ between pandas versions. The next
# cell reads the 'video_ID' column by name.
video_scraping = count_video.rename_axis('video_ID').reset_index(name='count')
video_scraping.head(3)

In [None]:
# Plain Python list of the video IDs to look up.
video_ids = video_scraping['video_ID'].to_list()

In [None]:
def get_video_details(youtube, video_ids):
    """Fetch snippet and statistics information for a list of videos.

    The IDs are sent to ``youtube.videos().list`` in batches of 50 per
    request (presumably the API's per-request limit -- TODO confirm).

    Parameters
    ----------
    youtube : API client object exposing ``videos().list(...).execute()``.
    video_ids : list of str
        Video IDs to look up. An empty list produces no request.

    Returns
    -------
    list of dict
        One dict per video returned by the API, with the keys ``title``,
        ``Published_date``, ``Views``, ``Like``, ``Comments``,
        ``Videocategory`` and ``video_id``. ``Views``, ``Like`` and
        ``Comments`` are ``None`` when the response does not contain them.
        ``video_id`` is the ID reported by the API, so callers can match a
        record to the ID they asked for without relying on the title.
    """
    all_videos_stats = []

    for start in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            # Comma-separated without spaces, same format as the channel query.
            part='snippet,statistics',
            id=','.join(video_ids[start:start + 50]))
        response = request.execute()

        # A response without an 'items' key is treated as "no videos found"
        # instead of raising KeyError.
        for video in response.get('items', []):
            snippet = video['snippet']
            # A video without a 'statistics' section yields None counters.
            statistics = video.get('statistics', {})
            all_videos_stats.append(dict(
                title=snippet['title'],
                Published_date=snippet['publishedAt'],
                Views=statistics.get('viewCount'),
                Like=statistics.get('likeCount'),
                Comments=statistics.get('commentCount'),
                Videocategory=snippet['categoryId'],
                video_id=video['id']))

    return all_videos_stats

In [None]:
# Query the API for every watched video and tabulate the returned records.
video_statitics = get_video_details(youtube, video_ids)
video_data = pd.DataFrame(data=video_statitics)
video_data.head(n=3)

In [None]:
# Combine the API data with the watch history, matching rows by title and
# keeping only titles present in both tables.
# NOTE(review): the join key is the title text. Rows only match when the title
# from the API equals the one stored in the history, and a title shared by
# several rows multiplies the matches. Joining on a video ID would be more
# reliable -- confirm whether that is wanted.
data_video = video_data.merge(wt, how='inner', on='title')
data_video.info()

In [None]:
# Numeric columns: convert to numbers; values that cannot be parsed
# become NaN (errors='coerce').
numeric_cols = ['Views','Like','Comments']
data_video = data_video.assign(
    **{name: pd.to_numeric(data_video[name], errors='coerce')
       for name in numeric_cols}
)

# Publication date: keep only the part before 'T' (the calendar date)
# and parse it.
data_video['Published_date'] = pd.to_datetime(
    data_video['Published_date'].str.split('T').str.get(0)
)

# Show the resulting column types.
data_video.dtypes

In [None]:
# Number of rows per category ID, to see which IDs need a label.
data_video['Videocategory'].value_counts()

In [None]:
# Readable label for each category ID (IDs are strings), listed in
# numeric order.
my_categories = {
    '1': 'Film&Animation',
    '2': 'Autos',
    '10': 'Music',
    '15': 'Pets&Animal',
    '17': 'Sports',
    '19': 'Travel&Events',
    '20': 'Gaming',
    '22': 'People&Blog',
    '23': 'Comedy',
    '24': 'Entertainment',
    '25': 'News&Politics',
    '26': 'HowTo&Style',
    '27': 'Education',
    '28': 'Science&Technology',
    '29': 'NonProfits&Activism',
    '44': 'Trailers',
}

In [None]:
# Replace each category ID by its label; IDs that are not in my_categories
# become NaN.
data_video = data_video.assign(
    Videocategory=data_video['Videocategory'].map(my_categories)
)

<h2 align='center'> Save Data to Excel </h2>

In [None]:
# Write the four tables to one workbook, one sheet per table, without the
# index column.
sheets = {
    'working_data': wt,
    'pub_data': tableau_pub,
    'channel_data': data_channel,
    'video_data': data_video,
}
with pd.ExcelWriter("youtube_data.xlsx") as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)