# Connect Drive, Import Modules, and Load GitHub

In [1]:
# Mount Google Drive inside the Colab runtime. root_dir is reused by the next
# cell to build the paths to the GitHub folder and this project.
from google.colab import drive
root_dir = '/content/drive'
drive.mount(root_dir)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
%load_ext autoreload
import sys
from os.path import join
base_dir = join(root_dir,'My Drive/GitHub')
project_dir = join(base_dir,'youtube_dataset_creator')
sys.path.append(base_dir)
sys.path.append(project_dir)
from manage_github_colab import ManageGitHub

In [4]:
# Helper object from the project-local manage_github_colab package; its save()
# method is called in the last cell of this notebook. The two arguments are
# presumably the repository name and its parent folder on Drive -- TODO confirm.
m = ManageGitHub.ManageGitHub('youtube_dataset_creator','GitHub')

# Settings

In [153]:
# A transcript file is only merged into the combined dataset when it has at
# least this many rows with a non-empty transcript.
MIN_VIDEOS = 3
# Number of words per text segment (passed as N to getSegmentedByN).
SEGMENT_WORD_COUNT = 200

# Combine Datasets

In [8]:
import pandas as pd
import glob

In [119]:
# Combine every '*_transcripts.pkl' file under the project's data folder into
# one DataFrame, keeping only files with enough usable videos.
files = glob.glob(join(project_dir,'data/*_transcripts.pkl'))
kept_frames = []
for transcriptFile in files:
  # NOTE: unpickling can execute arbitrary code -- only load files you trust.
  tf = pd.read_pickle(transcriptFile)
  # Drop rows whose transcript is empty.
  tf = tf.loc[[len(transcript) > 0 for transcript in tf.transcript]]
  # Skip files left with fewer than MIN_VIDEOS usable rows.
  if len(tf) >= MIN_VIDEOS:
    kept_frames.append(tf)

# Fail with a clear message instead of a KeyError on the 'name' column below.
if not kept_frames:
  raise ValueError(
      'No transcript file with at least %d non-empty transcripts found in %s'
      % (MIN_VIDEOS, join(project_dir, 'data')))

# One concat at the end instead of DataFrame.append per file: append copies
# the whole frame on every call and no longer exists in pandas >= 2.0.
df = pd.concat(kept_frames, ignore_index=True)
# Normalise names to lower_snake_case, e.g. 'Ian Goodfellow' -> 'ian_goodfellow'.
df['name'] = [name.lower().replace(' ','_') for name in df['name']]
# Remove columns whose name starts with 'Unnamed'.
df = df.loc[:, ~df.columns.str.match('Unnamed')]

In [120]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,name,keyword,youtube_id,title,description,channel,has_name,is_conversation,transcript
0,ian_goodfellow,interview,Z6rxFNMGdn0,Ian Goodfellow: Generative Adversarial Network...,Ian Goodfellow is an author of the popular tex...,Lex Fridman,True,True,[{'text': 'the following is a conversation wit...
1,ian_goodfellow,interview,p7lx9-zRIME,Interview with Ian Mendes,I had a great time chatting with Ian about Man...,Josh Goodfellow,True,True,[{'text': 'have you been able to incorporate i...
2,ian_goodfellow,interview,sgHdUYHGvtA,Tutorial on Generative adversarial networks - ...,ICCV17 | Tutorials | Generative adversarial ne...,ComputerVisionFoundation Videos,True,False,[{'text': 'good morning everyone please go ahe...
3,ian_goodfellow,interview,YfJuujwKNOg,Adversarial Machine Learning Ian Goodfellow,Google's Ian Goodfellow joined us to share his...,South Park Commons,True,False,[{'text': 'I think this would be really inform...
4,ian_goodfellow,interview,RvgYvHyT15E,NIPS 2016 Workshop on Adversarial Training - I...,https://sites.google.com/site/nips2016adversar...,David Lopez-Paz,True,False,[{'text': 'thank you all for coming this is a'...


In [121]:
# Number of distinct names in the combined data, followed by the names themselves.
len(df.name.unique()), df.name.unique()

(43, array(['ian_goodfellow', 'jack_nicholson', 'glenn_close', 'judi_dench',
        'robert_de_niro', 'leonardo_dicaprio', 'morgan_freeman',
        'tom_hanks', 'anthony_hopkins', 'samuel_l_jackson', 'helen_mirren',
        'gary_oldman', 'al_pacino', 'sean_penn', 'meryl_streep',
        'denzel_washington', 'tom_cruise', 'robert_downey_jr',
        'scarlett_johansson', 'jennifer_lawrence', 'will_smith',
        'amy_adams', 'cate_blanchett', 'jessica_chastain',
        'bryan_cranston', 'matt_damon', 'ryan_gosling', 'jake_gyllenhaal',
        'nicole_kidman', 'brie_larson', 'julianne_moore', 'edward_norton',
        'natalie_portman', 'margot_robbie', 'saoirse_ronan',
        'kate_winslet', 'don_cheadle', 'viola_davis', 'laura_dern',
        'peter_dinklage', 'sarah_paulson', 'sam_rockwell', 'andy_serkis'],
       dtype=object))

# Clean and Segment Texts

In [122]:
import math
import re

def getSegmentedByN(transcript, N):
  """Split a transcript into consecutive segments of exactly N words.

  Args:
    transcript: iterable of dicts, each providing a 'text' string and a
      numeric 'start'; the start value is floored to a whole number.
    N: number of words per segment.

  Returns:
    A list of (segment_text, (min_start, max_start)) tuples. segment_text is
    the N words joined by single spaces; the two times are the smallest and
    largest floored 'start' among the entries that contributed those words.
    Trailing words that do not fill a complete segment are dropped.
  """
  whitespace_list = ['\n']
  word_pattern = re.compile("[a-zA-Z']+")

  # Build a flat list of (word, start) pairs from the cleaned entries.
  pairs_list = []
  for dictionary in transcript:
    start = math.floor(dictionary.get('start'))
    new_text = dictionary.get('text')
    # Accumulate on new_text so every listed whitespace string is replaced,
    # not only the last one.
    for space_string in whitespace_list:
      new_text = new_text.replace(space_string,' ')
    # Entries where fewer than half of the words contain a lowercase run
    # (mostly upper-case text) are casefolded before the words are extracted.
    lower_word_count = len(re.findall("[a-z]+",new_text))
    all_word_count = len(word_pattern.findall(new_text))
    if lower_word_count < 0.5*all_word_count:
      new_text = new_text.casefold()
    # Take the words straight from the regex matches: an entry with no
    # letters then contributes nothing, instead of an empty-string "word"
    # that would be counted toward N.
    pairs_list.extend((word,start) for word in word_pattern.findall(new_text))

  # Cut the word stream into full chunks of N; the range stops before a
  # partial final chunk.
  segment_list = []
  for segment_start in range(0,len(pairs_list) - N + 1,N):
    chunk = pairs_list[segment_start:segment_start + N]
    segment_text = ' '.join(word for word,_ in chunk)
    chunk_times = [time for _,time in chunk]
    segment_list.append((segment_text, (min(chunk_times),max(chunk_times))))

  return segment_list

In [170]:
# Per-video metadata: every column of df except those starting with 'transcript'.
dt = df.loc[:,~df.columns.str.match('transcript')].copy()
dt['youtube_id'] = [str(video_id) for video_id in dt.youtube_id]

# Segment each row's transcript, remembering which row every segment came from.
# Attaching segments by row position (rather than merging on youtube_id) keeps
# the result correct when the same video id occurs in more than one row: a
# merge would pair every copy of the segments with every such row.
row_positions = []
segment_records = []
for row_position, (youtube_id, transcript) in enumerate(zip(dt.youtube_id, df.transcript)):
  for segment_text, (segment_min, segment_max) in getSegmentedByN(transcript, SEGMENT_WORD_COUNT):
    row_positions.append(row_position)
    segment_records.append({
        'youtube_id': youtube_id,
        'text': segment_text,
        # stored as floats, the form in which this cell's output shows them
        'start': float(segment_min),
        'end': float(segment_max),
    })

# Built once from the collected records instead of DataFrame.append per video
# (quadratic, and no longer available in pandas >= 2.0).
ds = pd.DataFrame(segment_records, columns=['youtube_id','text','start','end'])

# Repeat each metadata row once per segment (rows without segments drop out),
# then put the segment columns alongside; both frames share a 0..n-1 index.
dt = dt.iloc[row_positions].reset_index(drop=True)
dt = pd.concat([dt, ds[['text','start','end']]], axis=1)

In [175]:
# Display the segmented dataset: video metadata plus text/start/end per segment.
dt

Unnamed: 0,name,keyword,youtube_id,title,description,channel,has_name,is_conversation,text,start,end
0,ian_goodfellow,interview,Z6rxFNMGdn0,Ian Goodfellow: Generative Adversarial Network...,Ian Goodfellow is an author of the popular tex...,Lex Fridman,True,True,the following is a conversation with Ian good ...,0.0,79.0
1,ian_goodfellow,interview,Z6rxFNMGdn0,Ian Goodfellow: Generative Adversarial Network...,Ian Goodfellow is an author of the popular tex...,Lex Fridman,True,True,of implies that there may be limits to deep le...,83.0,152.0
2,ian_goodfellow,interview,Z6rxFNMGdn0,Ian Goodfellow: Generative Adversarial Network...,Ian Goodfellow is an author of the popular tex...,Lex Fridman,True,True,most reinforcement learning algorithms have a ...,152.0,215.0
3,ian_goodfellow,interview,Z6rxFNMGdn0,Ian Goodfellow: Generative Adversarial Network...,Ian Goodfellow is an author of the popular tex...,Lex Fridman,True,True,involved in AI when we used machine learning w...,215.0,280.0
4,ian_goodfellow,interview,Z6rxFNMGdn0,Ian Goodfellow: Generative Adversarial Network...,Ian Goodfellow is an author of the popular tex...,Lex Fridman,True,True,get these kind of grandmother's cell units tha...,280.0,345.0
...,...,...,...,...,...,...,...,...,...,...,...
13864,andy_serkis,interview,OlNQ9T_zOVs,Conversations with Andy Serkis of WAR FOR THE ...,Q&A with Andy Serkis. Moderated by Jenelle Ril...,SAG-AFTRA Foundation,True,True,just this whole canvas is opening up performan...,1621.0,1679.0
13865,andy_serkis,interview,OlNQ9T_zOVs,Conversations with Andy Serkis of WAR FOR THE ...,Q&A with Andy Serkis. Moderated by Jenelle Ril...,SAG-AFTRA Foundation,True,True,not going to know I'm gonna step outside and y...,1679.0,1746.0
13866,andy_serkis,interview,OlNQ9T_zOVs,Conversations with Andy Serkis of WAR FOR THE ...,Q&A with Andy Serkis. Moderated by Jenelle Ril...,SAG-AFTRA Foundation,True,True,it's sort of licenses the talking animals aspe...,1746.0,1804.0
13867,andy_serkis,interview,OlNQ9T_zOVs,Conversations with Andy Serkis of WAR FOR THE ...,Q&A with Andy Serkis. Moderated by Jenelle Ril...,SAG-AFTRA Foundation,True,True,no it says I know this is supposed to be about...,1804.0,1872.0


# Update GitHub

In [97]:
# Save through the ManageGitHub helper created near the top of the notebook;
# the string is presumably used as the commit message -- TODO confirm.
m.save('Combined datasets')