In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from functions.preprocessing import preprocessing

import ast, pickle
from tqdm.auto import tqdm 

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Raw transcripts table.
transcripts = pd.read_csv('data/raw/transcripts.csv')

# From the main TED table, read only the columns used in this notebook.
main = pd.read_csv(
    'data/raw/ted_main.csv',
    usecols=['title', 'url', 'tags'],
)

# Display (rows, columns) of the transcripts table.
transcripts.shape

(2467, 2)

## Filtering of transcripts

><b>Rationale</b><br>
   &emsp;Firstly, we discovered that the number of records in ted_main.csv differs from that in transcripts.csv, indicating that there are talks without transcripts. Upon viewing some of those talks, we realized that TED Talks actually include non-speech talks, including musical performances and live events.<br>
    &emsp;Therefore, we purposely selected a few talks with performance-related tags, such as "performance art" and "music performance", and looked through their transcripts. We realized that, for some singing-performance talks, the transcripts are actually the lyrics of the songs; for others, the transcripts may be a short self-introduction with some comments by the performer. Moreover, these transcripts tend to contain special symbols, such as ♫ and ♪. <br>
    &emsp;As a result, we used the tag and the length of the transcript as filters. After some trial and error with manual validation, the filters are: <ul>
    <li> Tags to exclude: "performance art" and "live performance" </li>
    <li> Minimum transcript length: 350 characters </li></ul>
    &emsp;The reason why we did not exclude talks by just those symbols (i.e. remove all talks with those symbols) is that we found out that some talks may have background music as part of the talk. Those symbols will thus appear in the transcripts, but the talks are not performances. <br>
    &emsp;32 talks are removed in the end, with 2435 talks remaining.

In [4]:
# Filter thresholds, named here so they are easy to find and tune.
MIN_TRANSCRIPT_LENGTH = 350  # characters; shorter transcripts are dropped
EXCLUDED_TAGS = {'live performance', 'performance art'}

# Join transcripts with the talk metadata on the talk URL.
merged = transcripts.merge(main, on='url')

# `tags` is read from the CSV as a string; literal_eval parses it into a Python object
# so that the membership test below works on individual tags.
merged['tags'] = merged.tags.map(ast.literal_eval)

# Flag a talk for removal when its transcript is too short or it carries an excluded tag.
merged['to_remove'] = merged.apply(
    lambda row: len(row['transcript']) < MIN_TRANSCRIPT_LENGTH
    or not EXCLUDED_TAGS.isdisjoint(row['tags']),
    axis=1,
)

# Keep only the unflagged talks. The explicit .copy() makes `merged` an independent
# frame, so later column assignments do not trigger pandas' SettingWithCopyWarning.
# The index is deliberately left untouched and `to_remove` stays as the last column.
merged = merged[~merged['to_remove']].copy()

In [5]:
# Display (rows, columns) of the filtered frame.
merged.shape

(2435, 5)

## Preprocessing

In [9]:
# Replace the typographic right single quotation mark (U+2019, ’) with the plain
# ASCII apostrophe. regex=False makes it explicit that this is a literal replacement.
transcripts['transcript'] = transcripts['transcript'].str.replace('’', "'", regex=False)

# Normalise `merged` from its own column instead of copying the column over from
# `transcripts`: that copy relied on implicit index alignment between the merged,
# filtered frame and the raw frame, which silently mismatches rows if the merge
# ever drops or duplicates a record.
merged['transcript'] = merged['transcript'].str.replace('’', "'", regex=False)

In [32]:
# Register tqdm's `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas()

# Run the project's `preprocessing` function on every transcript.
transcript_tokens = merged['transcript'].progress_apply(preprocessing)

HBox(children=(IntProgress(value=0, max=2435), HTML(value='')))

In [33]:
# Run the same `preprocessing` function on every talk title
# (needs tqdm.pandas() to have been called so that `progress_apply` exists).
title_tokens = merged['title'].progress_apply(preprocessing)

HBox(children=(IntProgress(value=0, max=2435), HTML(value='')))

## Export to pickle

In [35]:
# Persist the preprocessed transcripts. The context manager guarantees the file
# is flushed and closed even if pickling raises.
with open('data/pickle/transcript_tokens.p', 'wb') as fh:
    pickle.dump(transcript_tokens, fh)

In [34]:
# Persist the preprocessed titles. The context manager guarantees the file
# is flushed and closed even if pickling raises.
with open('data/pickle/title_tokens.p', 'wb') as fh:
    pickle.dump(title_tokens, fh)

In [10]:
# Persist the filtered talks without the last column (the helper flag `to_remove`,
# which the filtering cell adds last). The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('data/pickle/filtered_talks.p', 'wb') as fh:
    pickle.dump(merged.iloc[:, :-1], fh)