Author: CBell, KMonzella <br>
Date: 2021/10/30 <br>
Subject: Pull YouTube data <br>
Overview: In our 01 program, we output a combined list of YouTube links from which we need to pull data. This program accesses those links, scrapes the data, and adds the information to a single output file. It also runs very preliminary data cleaning that could not be done easily in Spark in future programs.

In [1]:
# --- program set-up
import os
import glob
import spacy
import langdetect
import langid
import shutil
import youtube_dl
import youtube_transcript_api
import pandas                            as pd
from   pytube                            import YouTube
from   tqdm                              import tqdm
from   time                              import sleep
from   youtube_transcript_api            import YouTubeTranscriptApi
from   youtube_transcript_api.formatters import TextFormatter
from   spacy_cld                         import LanguageDetector
from   spacy.language                    import Language

# --- pathway to output
default_directory = "/Users/kelleymonzella1/Desktop/Graduate_school_work/Big_Data/Final_project/"


# --- define functions to be used
# ----- saving out table of data
def make_table_data_filepath():
    return os.path.join(default_directory, 'Data', 'output_youtube_data.csv')

def save(data):
    fp = make_table_data_filepath()
    df = pd.DataFrame([data])
    if not os.path.exists(os.path.dirname(fp)):
        os.makedirs(os.path.dirname(fp))
    df.to_csv(fp, mode='a', header=not os.path.exists( fp ), index=False )

    
def make_error_data_filepath():
    return os.path.join(default_directory, 'Data', 'output_youtube_errors.csv')

def saveerr(data):
    fp = make_error_data_filepath()
    df = pd.DataFrame([data])
    if not os.path.exists(os.path.dirname(fp)):
        os.makedirs(os.path.dirname(fp))
    df.to_csv(fp, mode='a', header=not os.path.exists( fp ), index=False )
    
# ----- saving out videos     
def make_filepath(video_path):
    return os.path.join(default_directory, 'videos', os.path.basename(video_path))

    
# ----- downloading the data    
def download(url):
    
    if not isinstance(url, list):
        url = [url]
        
    for i in tqdm(range(len(url)), desc = 'tqdm() Progress Bar'):   
        u = url[i]
        
        video = YouTube(u)
        try:
            video_id   = u.replace('https://www.youtube.com/watch?v=', '')
            try:
                t = YouTubeTranscriptApi.get_transcript(video_id)
                # format transcript as text
                formatter  = TextFormatter()
                transcript = formatter.format_transcript(t).replace("\n", " ")

            except:
                transcript = "No transcript available"
            data = {
                'author': video.author, 
                'channel': video.channel_id,
                'description': video.description,
                'publish_date': int(video.publish_date.strftime('%Y%m%d')),
                'watch_url': video.watch_url,
                'keywords': '|'.join(video.keywords),
                'metadata': video.metadata,
                'stream_info': video.fmt_streams,
                'age_restricted': video.age_restricted,
                'length': video.length, 
                'rating': video.rating, 
                'title': video.title,
                'views': video.views, 
                'transcript': transcript,
                'thumbnail': video.thumbnail_url
            }
            
            save(data)

        except Exception as e:
            saveerr(e)
            


In [None]:
# --- Pulling the data for URLs

# ----- Get all URLS

def main():
    with open('/Users/kelleymonzella1/Desktop/Graduate_school_work/Big_Data/Final_project/Data/youtube_urls_combined_20211120.txt', mode='r') as f:
        urls = f.readlines()[3:]
        
    urls = [i.strip() for i in urls]
    print(urls)
    download(urls)
    
main()


tqdm() Progress Bar:   0%|          | 0/28172 [00:00<?, ?it/s]

['https://www.youtube.com/watch?v=O5lyArHqD9Q', 'https://www.youtube.com/watch?v=Lrj2JRdVsJA', 'https://www.youtube.com/watch?v=L1PaRE7av4A', 'https://www.youtube.com/watch?v=VsaDRo-FoMo', 'https://www.youtube.com/watch?v=sXN8UbrU3_Q', 'https://www.youtube.com/watch?v=R0BNFAZ53QU', 'https://www.youtube.com/watch?v=unhaP-81_Xs', 'https://www.youtube.com/watch?v=0actihzDbBU', 'https://www.youtube.com/watch?v=yOqKgco4tms', 'https://www.youtube.com/watch?v=W8282TH2rUs', 'https://www.youtube.com/watch?v=RPpZ4lvhY44', 'https://www.youtube.com/watch?v=pKWqUIW4obU', 'https://www.youtube.com/watch?v=YKQn1YCOxTo', 'https://www.youtube.com/watch?v=z4p1rDZMMRo', 'https://www.youtube.com/watch?v=aJGUKQbZQwM', 'https://www.youtube.com/watch?v=rILgD8wj-Ho', 'https://www.youtube.com/watch?v=RMtL24S9Gpc', 'https://www.youtube.com/watch?v=9_738JYwpqo', 'https://www.youtube.com/watch?v=MkTHKcX38Sk', 'https://www.youtube.com/watch?v=nOLLu3aUBPs', 'https://www.youtube.com/watch?v=c-Zlc6wWXwA', 'https://www

tqdm() Progress Bar:  89%|████████▊ | 24943/28172 [23:38:15<3:11:01,  3.55s/it] 

Note that that step above was run multiple times on a number of machines given long runtimes.

In [40]:
# --- Combine CSV files
os.chdir("/Users/kelleymonzella1/Desktop/Graduate_school_work/Big_Data/Final_project/data")

# --- name all CSVs
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

print(all_filenames)

# --- read in
df = pd.concat([pd.read_csv(f, \
                            lineterminator='\n', \
                            dtype=str, \
                            error_bad_lines=False, \
                            usecols = [*range(0,14)]) for f in all_filenames ])



['output_youtube_data_CB.csv', 'output_youtube_data_2.csv', 'output_youtube_data_3.csv', 'output_youtube_data_1.csv', 'output_youtube_data_4.csv', 'output_youtube_data_JB.csv']


In [41]:
# reset index 
print(df.shape)
df = df.reset_index(drop=True)

(411829, 14)


In [42]:
print(df.dtypes)
print(df.head())

author            object
channel           object
description       object
publish_date      object
watch_url         object
keywords          object
metadata          object
stream_info       object
age_restricted    object
length            object
rating            object
title             object
views             object
transcript        object
dtype: object
                    author                   channel  \
0     What Does That Mean?  UCS3RXL1ICt42KV-EeXSt--A   
1     Caacrinolaas - Topic  UCehqr_BtweZ9Al7aGJt5uGA   
2              TES - Randy  UCW9mpfvO5t3KY0vDn8c9yGg   
3  Triton & Jashin - Topic  UCyxoYF-8OUWnn4oZcBsPt6g   
4  The Heritage Foundation  UC5bEfSFTYQVfLCwkhBt8NtQ   

                                         description publish_date  \
0  What does proletarianism mean?\r\nA spoken def...     20150105   
1  Provided to YouTube by Believe SAS\n\nProletar...     20180609   
2  At least we still have Poland :)\n\nCheck out ...     20210105   
3  Provided to YouTube 

In [43]:
# identify video language - note that I was having trouble integrating spacy with pyspark, which is why this is implemented at this stage
languages_langdetect = []

desc = df["title"]

print(desc[0:5])

for line in desc:
    try:
        result = langdetect.detect_langs(line)
        result = str(result[0])[:2]
    except:
        result = 'unknown'
    
    finally:
        languages_langdetect.append(result)
        

0                  What does proletarianism mean?
1                                  Proletarianism
2        Provisional Problems with Proletarianism
3                                  Proletarianism
4    America's Biggest Issues: Election Integrity
Name: title, dtype: object


In [44]:
df['language_detected'] = languages_langdetect
df[["description", "language_detected"]].tail()

Unnamed: 0,description,language_detected
411824,Meer Mario-games bekijken: https://www.nintend...,nl
411825,Entdeckt mehr Mario-Spiele: https://www.ninten...,en
411826,Vedi altri giochi di Mario: https://www.ninten...,it
411827,Sono passati 35 anni dall'uscita del primo gio...,it
411828,「ニャニャニャ! ネコマリオタイム」は、ネコマリオとネコピーチがいろんなゲームのいろんな情報...,ja


In [45]:
df.to_csv('youtube_raw_20211120.csv')