### Web Scraping Youtube

In [1]:
import requests
import re
import pandas as pd
import numpy as np
import random
import operator
from functools import reduce
from fuzzywuzzy import process, fuzz
import pickle
import sys
sys.path.append("../")

import src.support_youtube as sp


from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException


from time import sleep



In [5]:
df_meta = pd.read_csv("../data/csv/all_games_unique.csv")
df_meta.head(1)

Unnamed: 0.1,Unnamed: 0,name,meta_score,user_review,release_date
0,0,#DRIVE,69.0,6.990846,2021-02-16


In [8]:
df_twitch = pd.read_csv("../data/twitch/twitch.csv")
df_twitch.head(1)

Unnamed: 0.1,Unnamed: 0,game,watch_time_(hours),stream_time_(hours),peak_viewers,peak_channels,streamers,average_viewers,average_channels,average_viewer_ratio,fecha
0,0,League of Legends,88389049.0,1217250.0,553165,2945,113251,122933,1692,72.61,2016-04-01


In [10]:
# We choose one month from twitch dataset to web scrape Youtube. Due to the limitation of queries, we'll
# study only 10 well-known games.
df_games = pd.read_csv("../data/twitch/twitch_raw_data/Most watched games on Twitch 2020 may - SullyGnome.csv")
games = list(df_games.Game.unique()[1:12])
df_study = pd.DataFrame(games)
df_study.rename({0: "name"}, axis = 1, inplace=True)

# We create a new dataframe to only have the release_date for each game. 
df_dates = df_study.merge(df_meta,how="inner", on="name")
df_dates.drop(["Unnamed: 0", "meta_score", "user_review"], axis=1,inplace=True)
df_dates.drop_duplicates(subset=["name"], inplace=True)
df_dates.reset_index(inplace=True)
df_dates.drop("index",axis=1, inplace=True)
df_dates

Unnamed: 0,name,release_date
0,League of Legends,2009-10-27
1,Fortnite,2017-07-25
2,Grand Theft Auto V,2013-09-17
3,Call of Duty: Warzone,2020-03-10
4,Counter-Strike: Global Offensive,2012-08-21
5,Dota 2,2013-07-09
6,Minecraft,2011-11-18
7,FIFA 20,2019-09-24
8,World of Warcraft,2004-11-23
9,Apex Legends,2019-02-04


In [12]:
# We create a pickle file for this top games we will use later in the analysis.

with open ('../data/top10_games.pickle', "wb") as game:
    pickle.dump(games, game)

In [225]:
# We create a dict so each game will have a range of dates between their release_date (or a minimum of 2012) and 
# year 2022
dicc = {}
for i in range(df_dates.shape[0]):
    dicc[df_dates.loc[i].tolist()[0]] = pd.date_range(max(df_dates.loc[i].tolist()[1], '2012-01-01'),'2022-12-31', freq='MS').strftime("%Y-%m-%d").tolist()

In [226]:
# Now, we are going to create the query for Youtube. Thanks to advanced query formulas for Youtube, we can
# manipulate the query without the need of using the Youtube API.
# For the name of the game we are going to include in quotes "game" and "game gameplay"
# We are also going to look for videos that incluide "game" or "game gameplay" in the title of the video
# By last, we are going to include the date we created before in the dictionary, so each query will give us
# youtube videos that contain the 2 first things we asked for in the period of one month, starting in the
# release_date (or 2012) and ending in 2022-12-31.

research = []
for juego in dicc.keys():
    i = 1
    while (i<len(dicc[juego])):
        fecha_inicial = dicc[juego][i-1]
        fecha_final = dicc[juego][i]
        research.append(f'("{juego}" OR "{juego} gameplay" OR intitle:"{juego}" OR intitle:"{juego} gameplay") after:{fecha_inicial} before:{fecha_final}')
        i+=1
        if fecha_final == '2022-12-01':
            fecha_inicial = '2022-12-01'
            fecha_final = '2022-12-31'
            research.append(f'("{juego}" OR "{juego} gameplay" OR intitle:"{juego}" OR intitle:"{juego} gameplay") after:{fecha_inicial} before:{fecha_final}')
            i+=1

In [227]:
# For this list of Youtube queries, we have to make some changes so the query looks like exactly as the 
# url we have to insert in Youtube 

research_youtube = []
for busqueda in research:
    research_youtube.append(busqueda.replace(" ","+").replace(":","%3A"))
len(research_youtube)

927

In [276]:
# Now we are able to use these urls to web scrape each youtube url finding the 100 youtube videos most viewed
# with the game name or the word "gameplay" within the video description or the title for each month of the year.

# This lines will search the youtube video ids in the html code of the web page.
# We will use them later when calling the Youtube API.

options = webdriver.ChromeOptions()

options.add_experimental_option("detach", True)
options.add_argument("--disable-extensions")
options.add_argument("--disable-notifications")
options.add_argument("--disable-Advertisement")
options.add_argument("--disable-popup-blocking")
options.add_argument("start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--no-sandbox")
options.add_argument("--log-level=3")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--no-default-browser-check")
options.add_argument("--no-first-run")
options.add_argument("--no-proxy-server")
options.add_argument("--disable-blink-features=AutomationControlled")
exp_opt = [
    'enable-automation'
]

urls = []
for i in range(len(research_youtube)):
    try:
        driver = webdriver.Chrome(executable_path='chromedriver')
        url=f'https://www.youtube.com/results?search_query={research_youtube[i]}'
        driver.get(url)
        sleep(random.randint(1,3))
        driver.find_element("css selector", "#content > div.body.style-scope.ytd-consent-bump-v2-lightbox > div.eom-buttons.style-scope.ytd-consent-bump-v2-lightbox > div:nth-child(1) > ytd-button-renderer:nth-child(2) > yt-button-shape > button > yt-touch-feedback-shape > div > div.yt-spec-touch-feedback-shape__fill").click()
        sleep(random.randint(1,3))
        driver.find_element("css selector", "#container > ytd-toggle-button-renderer > yt-button-shape > button > yt-touch-feedback-shape > div > div.yt-spec-touch-feedback-shape__fill").click()
        sleep(random.randint(1,3))
        driver.find_element("xpath", '//*[@title="Ordenar por recuento de visualizaciones"]/yt-formatted-string').click()
        sleep(random.randint(1,3))
        driver.execute_script("window.scrollTo(0, 2000000);")
        sleep(random.randint(1,3))
        driver.execute_script("window.scrollTo(0, 2000000);")
        sleep(random.randint(1,3))
        driver.execute_script("window.scrollTo(0, 2000000);")
        sleep(random.randint(1,3))
        driver.execute_script("window.scrollTo(0, 2000000);")
        sleep(random.randint(1,3))
        data = driver.find_elements(By.ID, "video-title")

        for dato in data:
            if dato.get_attribute("aria-label") == None:
                pass
            else:
                urls.append(dato.get_attribute("aria-label"))
        driver.close()

    except NoSuchElementException:
        research_youtube.append(research_youtube[i])
        driver.close()
    except ElementClickInterceptedException:
        research_youtube.append(research_youtube[i])
        driver.close()


In [494]:
len(urls)

84421

In [None]:
df = pd.DataFrame(urls)

In [None]:
df.to_csv("enlaces_youtube")

### API Youtube

Now that we have all the urls, we are going to clean this csv so we only keep the ID for each video, needed
for the queries in the Youtube API.

In [3]:
df = pd.read_csv("../data/youtube/enlaces_youtube")
df.drop("Unnamed: 0", axis=1, inplace=True)
df.columns = ["enlace"]
df.head(2)

Unnamed: 0,enlace
0,https://www.youtube.com/watch?v=B18qApcVCRg
1,https://www.youtube.com/watch?v=kNabvLyUXzI


In [4]:
df.value_counts().reset_index().head()

Unnamed: 0,enlace,0
0,cambio de juego,858
1,"no hay data para: (""Grand+Theft+Auto+V""+OR+""Gr...",2
2,https://www.youtube.com/watch?v=cxGXf7rQGeo,2
3,"no hay data para: (""FIFA+20""+OR+""FIFA+20+gamep...",2
4,"no hay data para: (""Dota+2""+OR+""Dota+2+gamepla...",2


In [5]:
# We get rid of the rows we don't need and we create a new df for short videos because their url is different
# from usual videos.

lista_shorts = []
for i in range(df.shape[0]):
    if df.loc[i][0] == "cambio de juego":
        df.drop(i, inplace=True)
    elif df.loc[i][0].startswith("no hay"):
        df.drop(i, inplace=True)
    elif "shorts" in df.loc[i][0]:
        lista_shorts.append(df.loc[i][0])
        df.drop(i, inplace=True)

len(lista_shorts)

1615

In [6]:
df.value_counts().reset_index().head(3)

Unnamed: 0,enlace,0
0,https://www.youtube.com/watch?v=xXKsaWeQt3c,2
1,https://www.youtube.com/watch?v=kA0Ywk7sQLg,2
2,https://www.youtube.com/watch?v=K2sOCo7dWqo,2


In [7]:
# We clean the urls from short videos

dff = pd.DataFrame(lista_shorts)
dff.columns = ["enlace"]
dff["enlace"] = dff["enlace"].apply(lambda x: x.strip("https://www.youtube.com/shorts/"))
dff.head(2)

Unnamed: 0,enlace
0,aGM_9wvWbUQ
1,Uwglp7WtHyM


In [8]:
df_final = pd.concat([df,dff], axis=0, ignore_index=True)
df_final.shape

(90699, 1)

In [9]:
# Now we can clean the other videos
df_final["enlace"] = df_final["enlace"].apply(lambda x: x.lstrip("https://www.youtube.com/watch?v="))
df_final.head(3)

Unnamed: 0,enlace
0,B18qApcVCRg
1,kNabvLyUXzI
2,No2MzRYs1-8


In [10]:
df_final.to_csv("codigos_YT")

In [11]:
df_final = pd.read_csv("../data/youtube/api/codigos_YT")

In [15]:
# During the web scraping, we haven't associated each url to their correspondant videogame. That's why we need 
# to manually group the different IDs.

df_lol_1 = df_final.loc[0:10001]
df_lol_2 = df_final.loc[10001:11557]
df_fortnite = df_final.loc[11557:17233]
df_gta = df_final.loc[17233:27356]
df_callofduty = df_final.loc[27356:30197]
df_csgo = df_final.loc[30197:41713]
df_csgo_2 = df_final.loc[40156:41713]

list_fortnite = df_fortnite["enlace"].to_list()
list_lol_1 = df_lol_1["enlace"].to_list()
list_lol_2 = df_lol_2["enlace"].to_list()
list_callofduty = df_callofduty["enlace"].to_list()
list_gta = df_gta["enlace"].to_list()
list_csgo = df_csgo["enlace"].to_list()
list_csgo_2 = df_csgo_2["enlace"].to_list()

In [6]:
# Now we can call the Youtube API with these IDs. For each day we have 10.000 queries for free. 

import os
from dotenv import load_dotenv

load_dotenv()

True

In [17]:
sp.youtube(list_csgo_2, "Counter-Strike: Global Offensive", "csgo_2")

403
Something went wrong with the ID 86pVtqn_kk
403
Something went wrong with the ID Ne8qtN5S4mI
403
Something went wrong with the ID newLJydMww
403
Something went wrong with the ID Yht2ooedJo
403
Something went wrong with the ID O74loZNkLuU
403
Something went wrong with the ID 9z3NZ-RV2VE
403
Something went wrong with the ID PdMh349TDxk
403
Something went wrong with the ID TzRrFCsVjA
403
Something went wrong with the ID k-7NGyOX8
403
Something went wrong with the ID QpCT1GVk4
403
Something went wrong with the ID rEKT5oUVM6g
403
Something went wrong with the ID A5xis-CImys
403
Something went wrong with the ID KDulGs4OoiA&pp=ugMICgJlcxABGAE%3D
403
Something went wrong with the ID E9Ac9GmJTls&pp=ugMICgJlcxABGAE%3D
403
Something went wrong with the ID GPZeMjM62k0
403
Something went wrong with the ID 9D5z2A-130o&pp=ugMICgJlcxABGAE%3D


KeyboardInterrupt: 

### Cleaning YT

In [29]:
sp.cleaning("../data/youtube/api/csgo_raw", "csgo")