In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pprint
import seaborn as sns
#import gensim

%matplotlib inline

In [2]:
DATA_DIR = "../../data/raw/"
INPUT_FILE_NAME = 'subset_raw.parquet'

# Basic visualizations

In [3]:
ted_talks = pd.read_parquet(DATA_DIR + INPUT_FILE_NAME)

In [4]:
ted_talks.head(10)

Unnamed: 0_level_0,speaker,headline,description,duration,tags,transcript,WC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0
2,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0
3,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0
4,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0
5,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0
6,Craig Venter,Sampling the ocean's DNA,Genomics pioneer Craig Venter takes a break fr...,0:16:51,"biotech,invention,oceans,genetics,DNA,biology,...","0:11\r\r\rAt the break, I was asked by several...",2548.0
7,David Pogue,Simplicity sells,New York Times columnist David Pogue takes aim...,0:21:26,"simplicity,computers,software,interface design...","0:12\r\r\r(Music: ""The Sound of Silence,""\rSim...",3584.0
8,David Rockwell,A memorial at Ground Zero,In this emotionally charged conversation with ...,0:24:37,"New York,memory,interview,death,culture,archit...","0:13\r\r\rKurt Andersen: Like many architects,...",4008.0
9,Dean Kamen,To invent is to give,Inventor Dean Kamen lays out his argument for ...,0:20:07,"robots,cars,industrial design,transportation,i...","0:11\r\r\rAs you pointed out, every time you c...",3278.0
10,Dean Ornish,The killer American diet that's sweeping the p...,Forget the latest disease in the news: Cardiov...,0:03:18,"obesity,disease,health,health care,culture,foo...",0:11\r\r\rWith all the legitimate concerns\rab...,623.0


In [5]:
ted_talks.iloc[:,:15].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2475 entries, 1 to 2804
Data columns (total 7 columns):
speaker        2475 non-null object
headline       2475 non-null object
description    2475 non-null object
duration       2475 non-null object
tags           2475 non-null object
transcript     2386 non-null object
WC             2386 non-null float64
dtypes: float64(1), object(6)
memory usage: 154.7+ KB


## Based on first parse of data, we can see that

1. id is unique and can be used as index
2. transcript contains some nulls, could possibly drop these rows
3. tags can be a list of strings rather than a whole string
4. date published can be datetime format

In [8]:
# Reload the raw subset from disk, then discard every talk whose transcript is missing.
ted_talks = (
    pd.read_parquet(DATA_DIR + INPUT_FILE_NAME)
      .dropna(subset=['transcript'])
)
ted_talks.head()

Unnamed: 0_level_0,speaker,headline,description,duration,tags,transcript,WC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0
2,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0
3,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0
4,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0
5,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0


In [9]:
ted_talks.iloc[:,:15].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2386 entries, 1 to 2804
Data columns (total 7 columns):
speaker        2386 non-null object
headline       2386 non-null object
description    2386 non-null object
duration       2386 non-null object
tags           2386 non-null object
transcript     2386 non-null object
WC             2386 non-null float64
dtypes: float64(1), object(6)
memory usage: 149.1+ KB


In [10]:
# Process tags
# Normalise each talk's comma-separated tag string: remove the space that follows
# a comma, lower-case everything and trim the two ends of the whole string.
tags = ted_talks['tags'].str.replace(', ',',').str.lower().str.strip()

# Per-talk view: the list of tags and how many tags each talk carries.
split_tags = tags.str.split(',')
tag_counts_per_talk = split_tags.map(len)

# Corpus-wide view: glue every talk's tags together and split again so that
# each tag occurrence becomes one element of a flat Series.
joined_tags = tags.str.cat(sep=',').split(',')
all_tags = pd.Series(joined_tags)

# Occurrences per distinct tag, most frequent first.
tag_counts = all_tags.value_counts()

# Running total over the ranked tags, re-indexed by rank (0 = most frequent),
# and the same curve expressed as a share of all tag occurrences.
cumulative_tag_counts = tag_counts.cumsum()
cumulative_tag_counts.index = range(len(cumulative_tag_counts))
tag_ratios = cumulative_tag_counts / tag_counts.sum()

In [11]:
# process duration
# Parse the duration strings into timedeltas, then express each talk's length in minutes.
ted_talks['duration'] = ted_talks['duration'].pipe(pd.to_timedelta)
talk_time_minutes = ted_talks['duration'].dt.total_seconds().div(60)

In [12]:
from bokeh.io import output_notebook, show
from bokeh.layouts import row, column
from bokeh.plotting import figure
from bokeh.models.tools import HoverTool
from bokeh.transform import cumsum
from bokeh.models import ColumnDataSource, CustomJS
from bokeh.models.widgets import Slider

output_notebook()

## Time for some tag analysis, our main target

In [13]:
# Rank/frequency curve: x is the position of a tag in tag_counts, y is its count.
counts = list(tag_counts.values)
index = list(range(len(tag_counts)))
source = ColumnDataSource(data={'index': index, 'counts': counts})

p = figure(title='Frequency of tags in documents',
           plot_height=400)
p.xaxis.axis_label = 'Tag index'
p.yaxis.axis_label = 'Number of times tag appeared'

p.line(source=source,
       x='index', y='counts',
       color='mediumvioletred',
       line_width=2)

# Faint markers drawn over the line; the hover tool is attached to these only.
renderer = p.circle(source=source,
                    x='index', y='counts', size=15,
                    fill_color="grey", fill_alpha=0.1, line_color=None,
                    hover_fill_color="firebrick", hover_alpha=0.3,
                    hover_line_color="white")

p.add_tools(
    HoverTool(renderers=[renderer],
              tooltips=[("tag no.", "@index"),("counts", "@counts")])
)

show(p)

In [14]:
# Cumulative share of all tag occurrences covered by the N most frequent tags.
index = list(tag_ratios.index)
# tag_ratios is (cumulative count / total count), i.e. a fraction. The title,
# axis label and tooltip all say "%", so scale to 0-100 to make the plotted
# numbers agree with their labels.
ratio = list(tag_ratios.values * 100)
source = ColumnDataSource(data=dict(index=index, ratio=ratio))

p = figure(plot_height=400,
           title='Total % of tags to number of tags included')

p.line(x='index', y='ratio',
           line_width=1.5, 
           alpha=0.7,
           color='mediumvioletred',
           source=source)
# Faint bars under the line; the hover tool is attached to these only.
renderer = p.vbar(x='index', top='ratio',
           width=1, 
           alpha=0.2,
           color='mediumvioletred',
           source=source)

p.xaxis.axis_label = 'Number of tags included'
p.yaxis.axis_label = '% of total tags'

# {0.0} rounds the hovered value to one decimal place before the literal % sign.
p.add_tools(HoverTool(tooltips=[("tag no.", "@index"),("% of total tags", "@ratio{0.0}%")],renderers=[renderer]))

show(p)

### It seems there are some rarely used tags that could be pruned when training our model; we could set a minimum-frequency threshold before including tags in our analysis

In [15]:
n_topics = 10
# Take the n_topics most frequent tags and reverse them, so the categorical
# y-range below is listed from least to most frequent.
top_tag_counts = tag_counts.head(n_topics).iloc[::-1]
tags = list(top_tag_counts.index)
counts = list(top_tag_counts.values)
# Bar opacity is the tag's count relative to the largest count shown.
transparency = list(top_tag_counts.values / top_tag_counts.values.max())
source = ColumnDataSource(data={'tags': tags,
                                'counts': counts,
                                'transparency': transparency})

p = figure(title=f'Top {n_topics} tags on TED',
           y_range=tags,
           plot_height=350)
p.xaxis.axis_label = 'Number of times tag appeared'
p.yaxis.axis_label = 'Tags'
p.ygrid.grid_line_color = None
p.x_range.start = 0

renderer = p.hbar(source=source,
                  y='tags',
                  right='counts',
                  height=0.8,
                  color='red',
                  alpha='transparency',
                  hover_color='lightgreen')

p.add_tools(
    HoverTool(renderers=[renderer],
              tooltips=[("counts", "@counts")])
)

show(p)

In [16]:
# Distribution of "how many tags does a talk have": x is the tag count,
# y is the number of talks with exactly that many tags.
# Compute the sorted distribution once and reuse it for both columns.
talk_tag_distribution = tag_counts_per_talk.value_counts().sort_index()
talks = list(talk_tag_distribution.index)
counts = list(talk_tag_distribution.values)
source = ColumnDataSource(data=dict(talks=talks, counts=counts))

# Bars are centred on their x value with width 0.9, so the axis has to extend
# past max(talks); ending the range exactly at max(talks) would cut the last
# bar in half.
p = figure(x_range = (0, max(talks) + 1),
           plot_height=400,
           plot_width=800,
           title = 'Number of tags in each talk')

renderer = p.vbar(x='talks',
                   top='counts',
                   width=0.9,
                   alpha=0.8, 
                   line_color='darkgrey',
                   fill_color='lightblue',
                   hover_color='orange',
                   source=source)

p.ygrid.grid_line_color=None

p.xaxis.axis_label = 'Number of tags in talk'
p.yaxis.axis_label = 'Number of talks'

p.add_tools(HoverTool(tooltips=[("n_tags", "@talks"),("counts", "@counts")], 
                      renderers=[renderer]))

show(p)

### If we plan to do multilabel classification, we might only need to predict <20 classes

## Moving on to talk duration

In [17]:
# Bin the talk lengths (in minutes) into 30 equal-width buckets.
duration_hist, edges = np.histogram(talk_time_minutes, bins=30)
duration_data = pd.DataFrame({
    'n_talks': duration_hist,
    'left': edges[:-1],
    'right': edges[1:],
})
# Human-readable bucket label for the hover tooltip; %d truncates the edges to whole minutes.
duration_data['time_interval'] = [
    '%d to %d minutes' % bounds
    for bounds in zip(duration_data['left'], duration_data['right'])
]
source = ColumnDataSource(data=duration_data)

p = figure(title='Histogram of talk durations in minutes',
           x_axis_label='Duration of talk in minutes',
           y_axis_label='Number of Talks')
p.ygrid.grid_line_color = None

# One quad per histogram bucket, spanning [left, right] and rising to n_talks.
renderer = p.quad(source=source,
                  left='left', right='right',
                  bottom=0, top='n_talks',
                  fill_color='violet', line_color='purple', alpha=0.6)

p.add_tools(
    HoverTool(renderers=[renderer],
              tooltips=[("num talks", "@n_talks"),("duration", "@time_interval")])
)

show(p)

## As expected, TED talks are notorious for being strict with speakers' time: there is a sharp cutoff at 20 minutes

## Let's see if talk duration is related to the number of tags

In [18]:
# Scatter of talk length against number of tags, one point per talk.
source = ColumnDataSource(data={'tag_counts': tag_counts_per_talk,
                                'duration': talk_time_minutes})

p = figure(title='Number of tags to talk duration', plot_width=800)
p.xaxis.axis_label = "Talk duration (in minutes)"
p.yaxis.axis_label = "Number of tags"

# Low alpha so that overlapping points show up as darker areas.
p.circle(source=source,
         x='duration',
         y='tag_counts',
         size=6,
         alpha=0.4)

show(p)

In [19]:
n_speakers = 10
# Speakers with the most talks, reversed so the categorical y-range below is
# listed from least to most frequent.
top_speaker_counts = ted_talks['speaker'].value_counts().head(n_speakers).iloc[::-1]
speakers = list(top_speaker_counts.index)
counts = list(top_speaker_counts.values)
# Bar opacity is the speaker's count relative to the largest count shown.
transparency = list(top_speaker_counts.values / top_speaker_counts.values.max())
source = ColumnDataSource(data={'speakers': speakers,
                                'counts': counts,
                                'transparency': transparency})

p = figure(title=f'Top {n_speakers} most frequent speakers',
           y_range=speakers,
           plot_height=350)
p.xaxis.axis_label = 'Number of ted talks given'
p.yaxis.axis_label = 'Speakers'
p.ygrid.grid_line_color = None
p.x_range.start = 0

renderer = p.hbar(source=source,
                  y='speakers',
                  right='counts',
                  height=0.8,
                  color='orange',
                  alpha='transparency',
                  hover_color='lightgreen')

p.add_tools(
    HoverTool(renderers=[renderer],
              tooltips=[("counts", "@counts")])
)

show(p)

# Start on text processing

In [21]:
nlp_cols = ['speaker', 'headline', 'description', 'duration', 'tags', 'transcript']

In [22]:
ted_transcripts = ted_talks.loc[:, nlp_cols]
display(ted_transcripts.head())

Unnamed: 0_level_0,speaker,headline,description,duration,tags,transcript
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,00:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ..."
2,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,00:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ..."
3,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,00:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r..."
4,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",00:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst..."
5,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,00:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac..."


### Clean transcript

In [23]:
# test_text = ted_transcripts['transcript'][1]

In [24]:
#Run and restart kernel

#!python -m spacy download en_core_web_sm

# or

#!python -m spacy download en_core_web_md

In [25]:
# import spacy
# nlp = spacy.load('en_core_web_sm')
# nlp = spacy.load('en_core_web_md')

In [26]:
# parsed_text = nlp(test_text)

In [34]:
input_text = """
I want to start off by saying, Houston, we have a problem.
We're entering a second generation of no progress
in terms of human flight in space. In fact, we've regressed.
"""

In [35]:
from joblib import dump, load
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [43]:
from joblib import dump, load

# Load the previously saved estimator from the working directory.
# NOTE(review): joblib files are pickle-based; only load files you trust.
clf = load('gs_clf_svm.joblib')

# Wrap the single sample text in a one-element array and predict its tag.
sample_ls = np.array([input_text])
predicted_new = clf.predict(sample_ls)
print(predicted_new)

['architecture']


In [45]:
# NOTE(review): this rebinds DATA_DIR, which the earlier parquet-loading cells
# also read; re-running those cells after this one will look in this directory.
DATA_DIR = "../owentemple-ted-talks-complete-list"

# Full path of the CSV that load_data() reads.
data = f'{DATA_DIR}/data/ted_talks_by_id_plus_transcripts_and_liwc_and_mft_plus_views.csv'

In [97]:
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pprint
import seaborn as sns
from bokeh.io import output_notebook, show
from bokeh.layouts import row, column
from bokeh.plotting import figure
from bokeh.models.tools import HoverTool
from bokeh.models import ColumnDataSource, CustomJS
import string
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from sklearn.feature_extraction import stop_words
from nltk.stem.wordnet import WordNetLemmatizer

# scikit-learn's English stop words, materialised as a list.
sets = [stop_words.ENGLISH_STOP_WORDS]
sklearnStopWords = list(sets[0])

# Shared tokenizer / lemmatizer instances used by the helpers below.
token = ToktokTokenizer()
lemma = WordNetLemmatizer()

# Combined stop-word list: NLTK's entries first, then scikit-learn's.
# dict.fromkeys removes duplicates while keeping first-seen order.
stopWordList = stopwords.words('english')
stopWords = list(dict.fromkeys(stopWordList + sklearnStopWords))
# import gensim
def stopWordsRemove(text):
    """Tokenize ``text``, lower-case the tokens and drop the stop words.

    Tokens come from the module-level ``token`` tokenizer; a token is dropped
    when its lower-cased, stripped form is in the module-level ``stopWords``.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens, each followed by a single space (so a non-empty
        result ends with a trailing space), or ``''`` if nothing survives.
    """
    # Snapshot the stop words into a set once per call: membership checks are
    # then O(1) per token instead of a linear scan of the list for every token.
    blocked = frozenset(stopWords)
    normalised = (raw.lower().strip() for raw in token.tokenize(text))
    return ''.join(word + ' ' for word in normalised if word not in blocked)


def lemitizeWords(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    The text is run through the module-level spaCy pipeline ``nlp``, which
    must already be loaded when this function is called. A token whose lemma
    is the placeholder ``"-PRON-"`` is replaced by its lower-cased surface
    form instead. The resulting pieces are joined with single spaces.

    Args:
        text: The string to lemmatize.

    Returns:
        The space-joined lemmas as one string.
    """
    pieces = []
    for tok in nlp(text):
        if tok.lemma_ == "-PRON-":
            # Keep the actual pronoun rather than the generic placeholder.
            pieces.append(tok.lower_)
        else:
            pieces.append(tok.lemma_)

    return " ".join(pieces)


def _clean_transcript_text(transcripts):
    """Normalise raw transcript strings ahead of stop-word removal.

    Steps, in order: carriage returns become spaces; contractions are
    expanded; parenthesised asides are removed; "Mr."/"Mrs."/"Dr." lose their
    period; digits and ``<...>`` tags are removed; apostrophes are deleted and
    all other punctuation becomes a space; runs of whitespace collapse to one
    space.

    Every ``str.replace`` call states ``regex=`` explicitly. The default for
    that argument differs between pandas releases, so relying on it either
    leaves the regex patterns unapplied or turns the "." in the title
    patterns into a wildcard.

    Args:
        transcripts: pandas Series of transcript strings.

    Returns:
        A new Series of cleaned strings with the same index.
    """
    import re  # only needed here, to build the punctuation character class

    cleaned = transcripts.str.replace('\r', ' ', regex=False)

    # Irregular negations are handled before the generic "n't" rule, and
    # case-insensitively; otherwise "can't" / "won't" would come out as
    # "ca not" / "wo not".
    irregular_negations = [
        (r"(?i)\bcan't\b", "cannot"),
        (r"(?i)\bshan't\b", "shall not"),
        (r"(?i)\bwon't\b", "will not"),
    ]
    for pattern, expansion in irregular_negations:
        cleaned = cleaned.str.replace(pattern, expansion, regex=True)

    # Regular contractions, matched literally.
    # Known limitation (kept from the original): "'s" -> " is" also rewrites
    # possessives, and "'d" is always read as "would".
    contractions = [
        ("'s", " is"),
        ("'m", " am"),
        ("'ll", " will"),
        ("n't", " not"),
        ("'ve", " have"),
        ("'re", " are"),
        ("'d", " would"),
    ]
    for suffix, expansion in contractions:
        cleaned = cleaned.str.replace(suffix, expansion, regex=False)

    # Drop parenthesised asides such as "(Laughter)".
    cleaned = cleaned.str.replace(r"\(([^)]+)\)", "", regex=True)

    # Deal with Mr., Mrs. and Dr.: drop the period so it is not later turned
    # into a stray space. Matched literally, so "mr. " cannot swallow "mrs ".
    titles = [
        ("mr. ", "mr "),
        ("Mr. ", "mr "),
        ("mrs. ", "mrs "),
        ("Mrs. ", "mrs "),
        ("Dr. ", "dr "),
        ("dr. ", "dr "),
    ]
    for title, bare in titles:
        cleaned = cleaned.str.replace(title, bare, regex=False)

    # Remove digits and anything that looks like an <...> tag.
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    cleaned = cleaned.str.replace(r'<.*?>', '', regex=True)

    # Apostrophes are deleted outright; every other punctuation character
    # becomes a space. One character-class replacement covers what would
    # otherwise be a separate pass per punctuation character.
    cleaned = cleaned.str.replace("'", '', regex=False)
    other_punctuation = string.punctuation.replace("'", '')
    cleaned = cleaned.str.replace('[' + re.escape(other_punctuation) + ']', ' ', regex=True)

    # Collapse whitespace runs left behind by the removals above.
    return cleaned.str.replace(r'\s+', ' ', regex=True)


def load_data(filename):
    """Load the TED talks CSV and add cleaned and lemmatized transcript columns.

    Rows without a transcript are dropped, ``date_published`` is parsed as a
    date and ``duration`` is converted to a timedelta. Two columns are added:

    * ``cleaned_transcript``: the transcript after text normalisation
      (contractions expanded, asides/digits/tags/punctuation removed).
    * ``lemmatized_transcript``: the cleaned text passed through
      ``stopWordsRemove`` and then ``lemitizeWords``. The latter needs the
      spaCy pipeline ``nlp`` to be loaded before this function is called.

    Args:
        filename: Path of the CSV file. It must contain the columns ``id``,
            ``date_published``, ``duration`` and ``transcript``.

    Returns:
        pandas DataFrame indexed by ``id`` with the two extra columns.
    """
    df = pd.read_csv(filename, index_col="id", parse_dates=['date_published'])
    df = df.dropna(subset=['transcript'])
    df['duration'] = pd.to_timedelta(df['duration'])

    df['cleaned_transcript'] = _clean_transcript_text(df['transcript'])

    df['lemmatized_transcript'] = (
        df['cleaned_transcript']
        .map(stopWordsRemove)
        .map(lemitizeWords)
        .str.replace(r'\s+', ' ', regex=True)
    )

    return df


ted_talks = load_data(data)

In [98]:
ted_talks[['cleaned_transcript', 'lemmatized_transcript']]

Unnamed: 0_level_0,cleaned_transcript,lemmatized_transcript
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Thank you so much Chris And it is truly a gre...,thank chris truly great honor opportunity come...
2,In terms of invention I would like to tell yo...,term invention like tell tale favorite project...
3,A public Dewey long ago observed is constitut...,public dewey long ago observe constitute discu...
4,I want to start off by saying Houston we have...,want start say houston problem enter second ge...
5,What I want to talk about is as background is...,want talk background idea car art actually qui...
...,...,...
2797,Imagine that when you walked in here this eve...,imagine walk evening discover everybody room l...
2798,Paying close attention to something Not that ...,pay close attention easy attention pull differ...
2799,So this happy pic of me was taken in I was a ...,happy pic take senior college right dance prac...
2801,My seven year old grandson sleeps just down t...,seven year old grandson sleeps hall wake lot m...


In [99]:
ted_talks.to_csv('st_input.csv')

In [78]:
import spacy
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])


Apples 8566208034543834098 apple
and 2283656566040971221 and
oranges 2208928596161743350 orange
are 10382539506755952630 be
similar 18166476740537071113 similar
. 12646065887601541794 .
Boots 9918665227421442029 boot
and 2283656566040971221 and
hippos 6542994350242320795 hippo
are 10382539506755952630 be
n't 447765159362469301 not
. 12646065887601541794 .


In [95]:
input_text = ted_talks['cleaned_transcript'][22]
