# Connect Drive, Import Modules, and Load GitHub

In [1]:
# Mount Google Drive into the Colab runtime so later cells can build paths under it.
from google.colab import drive
# Mount point handed to drive.mount(); later cells derive base_dir/project_dir from it.
root_dir = '/content/drive'
# The recorded run prompted for an authorization code before reporting the mount.
drive.mount(root_dir)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
%load_ext autoreload
import sys
from os.path import join
base_dir = join(root_dir,'My Drive/GitHub')
project_dir = join(base_dir,'youtube_dataset_creator')
sys.path.append(base_dir)
sys.path.append(project_dir)
from manage_github_colab import ManageGitHub

In [3]:
# Instantiate the ManageGitHub helper from the manage_github_colab package;
# `m` is used again by the final cell of the notebook (m.save).
# NOTE(review): the two string arguments look like the repository name and the
# Drive folder that contains it -- confirm against ManageGitHub's signature.
m = ManageGitHub.ManageGitHub('youtube_dataset_creator','GitHub')

# Settings

In [5]:
# Minimum number of videos with a non-empty transcript that a transcript file
# must contain for its rows to be included in the combined dataset.
MIN_VIDEOS = 2
# Number of words per text segment (passed as N to getSegmentedByN).
SEGMENT_WORD_COUNT = 1400

# Combine Datasets

In [9]:
import pandas as pd
import glob

In [13]:
# Gather every '*_transcripts.pkl' file in the project's data folder.
files = glob.glob(join(project_dir,'data/*_transcripts.pkl'))

# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# files that were produced by a trusted source.
frames = []
for transcriptFile in files:
  tf = pd.read_pickle(transcriptFile)
  # Drop videos whose transcript is empty.
  tf = tf.loc[[len(transcript) > 0 for transcript in tf.transcript]]
  # Skip files left with fewer than MIN_VIDEOS usable videos.
  if len(tf) >= MIN_VIDEOS:
    frames.append(tf)

# Fail with an explicit message instead of an opaque KeyError on df['name']
# when nothing qualified (e.g. wrong project_dir or Drive not mounted).
if not frames:
  raise ValueError('No transcript file matching ' +
                   join(project_dir,'data/*_transcripts.pkl') +
                   ' has at least ' + str(MIN_VIDEOS) + ' non-empty transcripts')

# Concatenate once instead of calling DataFrame.append per file: append was
# removed in pandas 2.0 and re-copies the accumulated frame on every call.
df = pd.concat(frames, ignore_index=True)
# Normalise names to lowercase with underscores, e.g. 'Jack Nicholson' -> 'jack_nicholson'.
df['name'] = [name.lower().replace(' ','_') for name in df['name']]
# Remove columns whose label starts with 'Unnamed'.
df = df.loc[:, ~df.columns.str.match('Unnamed')]

In [14]:
# Preview the first rows of the combined dataset.
df.head()

Unnamed: 0,name,keyword,youtube_id,title,description,channel,has_name,is_conversation,transcript
0,jack_nicholson,interview,lTgcasf4X1k,Jack Nicholson Interviewed By Elliot Mintz,Elliot Mintz interviews Jack Nicholson and lea...,Elliot Mintz,True,True,"[{'text': 'you', 'start': 11.73, 'duration': 2..."
1,jack_nicholson,interview,2aR17O5Wr-o,Jack Nicholson In Conversation With Elliot Mintz,Jack Nicholson interviewed by Elliot Mintz in ...,Elliot Mintz,True,True,[{'text': 'so 10 minutes 12:15 okay I never ta...
2,jack_nicholson,interview,JwmgKgu3klE,Jack Nicholson | Dennis Hopper|Helen Mirren |U...,Episode 2 Jack Nicholson - Its hard to believe...,Biographies Around the World,True,True,"[{'text': '[Music]', 'start': 5.71, 'duration'..."
3,glenn_close,interview,W5FcMQwM8hI,2019 Golden Globe Best Actress Glenn Close tal...,Brian To: http://www.briantophoto.com/ FREE 90...,Brian To,True,True,[{'text': 'well hope Screen Actors Guild Award...
4,glenn_close,interview,SqzFwkJSpcA,Glenn Close at the 'It's OK to Not Be OK' foru...,Glenn Close and U.S. Sen. Debbie Stabenow talk...,Click On Detroit | Local 4 | WDIV,True,False,[{'text': 'welcome to our it's okay to not be ...


In [15]:
# Report how many distinct names the dataset contains, followed by the names themselves.
unique_names = df['name'].unique()
len(unique_names), unique_names

(43, array(['jack_nicholson', 'glenn_close', 'judi_dench', 'robert_de_niro',
        'leonardo_dicaprio', 'morgan_freeman', 'tom_hanks',
        'anthony_hopkins', 'samuel_l_jackson', 'frances_mcdormand',
        'helen_mirren', 'gary_oldman', 'al_pacino', 'sean_penn',
        'meryl_streep', 'denzel_washington', 'tom_cruise',
        'robert_downey_jr', 'scarlett_johansson', 'jennifer_lawrence',
        'will_smith', 'amy_adams', 'cate_blanchett', 'jessica_chastain',
        'bryan_cranston', 'matt_damon', 'ryan_gosling', 'jake_gyllenhaal',
        'nicole_kidman', 'brie_larson', 'julianne_moore', 'edward_norton',
        'natalie_portman', 'margot_robbie', 'saoirse_ronan',
        'kate_winslet', 'don_cheadle', 'viola_davis', 'laura_dern',
        'peter_dinklage', 'sarah_paulson', 'sam_rockwell', 'andy_serkis'],
       dtype=object))

# Clean and Segment Transcripts

In [16]:
import math
import re

def getSegmentedByN(transcript, N, obscure_capitals=True, delete_capitals=False):
  """Clean a transcript and cut it into consecutive segments of exactly N words.

  Args:
    transcript: iterable of dicts, each providing a 'text' string and a
      numeric 'start' value (floored to an int here).
    N: number of words per segment (expected to be a positive integer).
    obscure_capitals: when True, every word containing an uppercase letter,
      except 'I' and words containing "I'", is replaced by the placeholder
      'xxcap'.
    delete_capitals: only consulted when obscure_capitals is True; when True
      such words are dropped instead of being replaced.

  Returns:
    A list of (segment_text, (min_start, max_start)) tuples. segment_text is
    N words joined by single spaces; the two times are the smallest and
    largest floored 'start' values among the words of the segment. A trailing
    remainder of fewer than N words is discarded.
  """
  whitespace_list = ['\n']
  word_pattern = re.compile("[a-zA-Z']+")

  # Build a flat list of (word, start) pairs from the transcript entries.
  pairs_list = []
  for entry in transcript:
    start = math.floor(entry.get('start'))
    new_text = entry.get('text')
    # Chain the replacements on new_text so every entry of whitespace_list is
    # applied; replacing on the raw text each time would keep only the last.
    for space_string in whitespace_list:
      new_text = new_text.replace(space_string, ' ')
    # When lowercase runs number fewer than half the words (e.g. an ALL-CAPS
    # entry), lowercase the whole text so its words are not all treated as
    # capitalised further down.
    lower_word_count = len(re.findall("[a-z]+", new_text))
    all_word_count = len(word_pattern.findall(new_text))
    if lower_word_count < 0.5*all_word_count:
      new_text = new_text.casefold()
    # Keep only runs of letters/apostrophes. Using the matches directly
    # (rather than joining them and splitting on ' ') means an entry without
    # any word, e.g. '123', contributes nothing instead of an empty "word"
    # that would count toward N and leave stray spaces in the segment text.
    pairs_list.extend((word, start) for word in word_pattern.findall(new_text))

  # Remove or obscure capitalized words.
  if obscure_capitals:
    clean_pairs_list = []
    for word, start in pairs_list:
      has_capital = re.search("[A-Z]", word) and word != 'I' and ("I'" not in word)
      if not has_capital:
        clean_pairs_list.append((word, start))
      elif not delete_capitals:
        clean_pairs_list.append(('xxcap', start))
    pairs_list = clean_pairs_list

  # Create segments from the cleaned words and their times.
  segment_list = []
  for segment_start in range(0, len(pairs_list), N):
    window = pairs_list[segment_start:segment_start + N]
    # Only the final window can be short; it is dropped so that every
    # returned segment holds exactly N words.
    if len(window) < N:
      continue
    segment_text = ' '.join(word for word, _ in window)
    window_times = [time for _, time in window]
    segment_list.append((segment_text, (min(window_times), max(window_times))))

  return segment_list

In [17]:
# Per-video metadata: every column of df whose name does not start with 'transcript'.
dt = df.loc[:,~df.columns.str.match('transcript')].copy()
# youtube_id is the merge key below, so it is forced to str on this side too.
dt['youtube_id'] = [str(video_id) for video_id in dt.youtube_id]

# Collect one record per segment and build the frame once. DataFrame.append
# was removed in pandas 2.0, and recomputing the derived columns over the
# whole frame on every iteration made the loop quadratic.
segment_records = []
seen_ids = set()
for youtube_id, transcript in zip(df.youtube_id, df.transcript):
  youtube_id = str(youtube_id)
  # A video listed more than once would have its segments emitted once per
  # listing and then multiplied again by the merge below; segment it once.
  # NOTE(review): assumes rows sharing a youtube_id carry the same transcript.
  if youtube_id in seen_ids:
    continue
  seen_ids.add(youtube_id)
  for text, times in getSegmentedByN(transcript, SEGMENT_WORD_COUNT):
    start, end = times
    segment_records.append({
      'youtube_id': youtube_id,
      'text': text,
      'times': times,
      'start': start,
      'end': end,
      'link': 'https://youtu.be/'+youtube_id+'?t='+str(start),
      'embed': 'https://youtube.com/embed/'+youtube_id+'?start='+str(start)+'&end='+str(end),
    })

# Explicit columns keep the frame mergeable even when no segment was produced.
ds = pd.DataFrame(segment_records,
                  columns=['youtube_id','text','times','start','end','link','embed'])
# Inner merge: each segment row gets its video's metadata; videos without any
# full-length segment drop out.
dt = dt.merge(ds, on='youtube_id')

In [18]:
# Display the merged dataset: one row per text segment, with the video's
# metadata columns repeated on each row.
dt

Unnamed: 0,name,keyword,youtube_id,title,description,channel,has_name,is_conversation,text,times,start,end,link,embed
0,jack_nicholson,interview,lTgcasf4X1k,Jack Nicholson Interviewed By Elliot Mintz,Elliot Mintz interviews Jack Nicholson and lea...,Elliot Mintz,True,True,you you you you one of the nice things about x...,"(11, 540)",11,540,https://youtu.be/lTgcasf4X1k?t=11,https://youtube.com/embed/lTgcasf4X1k?start=11...
1,jack_nicholson,interview,lTgcasf4X1k,Jack Nicholson Interviewed By Elliot Mintz,Elliot Mintz interviews Jack Nicholson and lea...,Elliot Mintz,True,True,words if the writing has a basic quality all i...,"(540, 1124)",540,1124,https://youtu.be/lTgcasf4X1k?t=540,https://youtube.com/embed/lTgcasf4X1k?start=54...
2,jack_nicholson,interview,2aR17O5Wr-o,Jack Nicholson In Conversation With Elliot Mintz,Jack Nicholson interviewed by Elliot Mintz in ...,Elliot Mintz,True,True,so minutes okay I never taught you short no yo...,"(18, 500)",18,500,https://youtu.be/2aR17O5Wr-o?t=18,https://youtube.com/embed/2aR17O5Wr-o?start=18...
3,jack_nicholson,interview,2aR17O5Wr-o,Jack Nicholson In Conversation With Elliot Mintz,Jack Nicholson interviewed by Elliot Mintz in ...,Elliot Mintz,True,True,performer you know on that that made that make...,"(500, 967)",500,967,https://youtu.be/2aR17O5Wr-o?t=500,https://youtube.com/embed/2aR17O5Wr-o?start=50...
4,jack_nicholson,interview,2aR17O5Wr-o,Jack Nicholson In Conversation With Elliot Mintz,Jack Nicholson interviewed by Elliot Mintz in ...,Elliot Mintz,True,True,and once you're older the appeal of that image...,"(967, 1486)",967,1486,https://youtu.be/2aR17O5Wr-o?t=967,https://youtube.com/embed/2aR17O5Wr-o?start=96...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1719,andy_serkis,interview,qiU8_j32t40,Going Ape with Andy Serkis - BBC Click,Click meets performance capture legend Andy Se...,BBC Click,True,False,every single dent and ding that I've put in th...,"(583, 1192)",583,1192,https://youtu.be/qiU8_j32t40?t=583,https://youtube.com/embed/qiU8_j32t40?start=58...
1720,andy_serkis,interview,OlNQ9T_zOVs,Conversations with Andy Serkis of WAR FOR THE ...,Q&A with Andy Serkis. Moderated by Jenelle Ril...,SAG-AFTRA Foundation,True,True,hey good evening everyone my name is xxcap xxc...,"(3, 465)",3,465,https://youtu.be/OlNQ9T_zOVs?t=3,https://youtube.com/embed/OlNQ9T_zOVs?start=3&...
1721,andy_serkis,interview,OlNQ9T_zOVs,Conversations with Andy Serkis of WAR FOR THE ...,Q&A with Andy Serkis. Moderated by Jenelle Ril...,SAG-AFTRA Foundation,True,True,the notion of xxcap really was what was my sti...,"(465, 917)",465,917,https://youtu.be/OlNQ9T_zOVs?t=465,https://youtube.com/embed/OlNQ9T_zOVs?start=46...
1722,andy_serkis,interview,OlNQ9T_zOVs,Conversations with Andy Serkis of WAR FOR THE ...,Q&A with Andy Serkis. Moderated by Jenelle Ril...,SAG-AFTRA Foundation,True,True,you do not have a costume and makeup you don't...,"(917, 1340)",917,1340,https://youtu.be/OlNQ9T_zOVs?t=917,https://youtube.com/embed/OlNQ9T_zOVs?start=91...


# Export Dataset

In [None]:
# Base name of the exported file. The number of names is counted from df
# rather than hard-coded, so the label cannot go stale when the set of
# transcript files changes; for the recorded run (43 distinct names, 1400-word
# segments) this still yields '43celebs_1400len_dataset'.
export_name = '{}celebs_{}len_dataset'.format(df['name'].nunique(), SEGMENT_WORD_COUNT)

In [None]:
# Write the merged dataset as a pickle into the project's data folder.
export_path = join(project_dir, 'data', export_name + '.pkl')
dt.to_pickle(export_path)

# Update GitHub

In [None]:
# Call save() on the ManageGitHub helper created earlier, passing 'Exported dataset'.
# NOTE(review): presumably this commits and pushes the repository using that
# string as the commit message -- confirm against ManageGitHub.save.
m.save('Exported dataset')