In [12]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc, precision_score, recall_score, f1_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# pip install spacy-universal-sentence-encoder
import spacy_universal_sentence_encoder
nlp = spacy_universal_sentence_encoder.load_model('en_use_lg')

from functools import reduce
from tqdm import tqdm
import gensim
import gensim.corpora as corpora

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from Preprocessing import Preprocessor
preprocessor = Preprocessor(0)
tqdm.pandas()

# import necessary libraries
import tensorflow_hub as hub
  
# Load pre-trained universal sentence encoder model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")



Downloaded https://tfhub.dev/google/universal-sentence-encoder-large/5, Total size: 577.10MB

Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB



In [40]:
ted = pd.read_csv("ted_joined.csv")
ted.head(5)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,...,speaker_occupation,tags,title,urlurl,views,transcript,urlurl.1,transcript_cleaned,tokens,transcript_n_entries
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,...,Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,"good morning. you? it great, it? blown away wh...","['good', 'morning.', 'you?', 'it', 'great,', '...",1484
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,...,Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,"thank much, chris. truly great honor opportuni...","['thank', 'much,', 'chris.', 'truly', 'great',...",1012
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,...,Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,"(music: the sound silence, simon & garfunkel)h...","['(music:', 'the', 'sound', 'silence,', 'simon...",1673
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,...,Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,todayhappyheard sustainable development save u...,"['todayhappyheard', 'sustainable', 'developmen...",1624
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,...,Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,"10 years ago, took task teach global developme...","['10', 'years', 'ago,', 'took', 'task', 'teach...",1535


## Attempt to load and train a doc2vec model from gensim
This model is going to be used to encode each of the documents from the ted talks dataset. The below code is from [TutorialsPoint gensim doc2vec](https://www.tutorialspoint.com/gensim/gensim_doc2vec_model.htm). We can directly see that it creates embeddings through the infer_vector method. This method takes in a list of tokens. This works well for us since we already tokenized each of the Ted Talk transcripts.

We can also see that during the training of the Doc2vec model we can specify the size of the encoding vector. This will allow us to fine tune things as needed.

In [None]:
import gensim
import gensim.downloader as api
dataset = api.load("text8")
data = [d for d in dataset]
def tagged_document(list_of_list_of_words):
   for i, list_of_words in enumerate(list_of_list_of_words):
      yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])
data_for_training = list(tagged_document(data))
print(data_for_training[:1])
model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)
model.build_vocab(data_training)
model.train(data_training, total_examples=model.corpus_count, epochs=model.epochs)
print(model.infer_vector(['violent', 'means', 'to', 'destroy', 'the','organization']))

### Test on a single Ted Talk transcript before running all of them.

In [16]:
model.infer_vector(ted['tokens'][0].split())

array([-0.00631284,  0.00932574,  0.00080046,  0.00132854, -0.00314726,
        0.01177171,  0.00501487,  0.00134591,  0.0050741 ,  0.00663095,
       -0.01054779, -0.00255214, -0.00689085, -0.01214463, -0.00210388,
        0.00383411,  0.00503022, -0.00729396, -0.01173384, -0.00319082,
       -0.01201588,  0.00270767, -0.01157962, -0.00540551,  0.00650618,
       -0.01193785, -0.00168675,  0.00965464, -0.00031757,  0.00912039,
        0.00183086, -0.00168201, -0.001378  , -0.00429492, -0.00134862,
        0.00538502,  0.01003118,  0.00506399,  0.00149683,  0.01077986],
      dtype=float32)

## Embed all of the transcripts

In [55]:
embeddings = []
for tokens in tqdm(ted['tokens']):
    embedding = model.infer_vector(tokens.split())
    embeddings.append(embedding)
len(embeddings)

100%|██████████| 2461/2461 [00:07<00:00, 347.67it/s]


2461

In [57]:
ted['embeddings'] = embeddings
ted.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,...,title,urlurl,views,transcript,urlurl.1,transcript_cleaned,tokens,transcript_n_entries,token_encoding,embeddings
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,...,Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,"good morning. you? it great, it? blown away wh...","['good', 'morning.', 'you?', 'it', 'great,', '...",1484,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[-0.0063128383, 0.009325745, 0.00080046355, 0...."
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,...,Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,"thank much, chris. truly great honor opportuni...","['thank', 'much,', 'chris.', 'truly', 'great',...",1012,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0005233243, 0.004493335, -0.0035742647, 0.0..."
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,...,Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,"(music: the sound silence, simon & garfunkel)h...","['(music:', 'the', 'sound', 'silence,', 'simon...",1673,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[-0.003940201, 0.004838261, -0.008605291, 0.01..."
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,...,Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,todayhappyheard sustainable development save u...,"['todayhappyheard', 'sustainable', 'developmen...",1624,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.00033427178, 0.0035245598, -0.0059836516, 0..."
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,...,The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,"10 years ago, took task teach global developme...","['10', 'years', 'ago,', 'took', 'task', 'teach...",1535,"[[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,...","[-0.0021584958, 0.0010347426, 0.0119885225, 0...."


## Get a list of all possible tags

In [41]:
tags = []
topics = []
for video in tqdm(ted['tags']):
    video = video.replace('[', '').replace(']', '').replace('\'', '').replace('\"', '').split(', ')
    topics += video
    tags.append(video)
ted['tags'] = tags
topics = list(set(topics))
print(topics[0:10])
print(len(topics))

100%|██████████| 2461/2461 [00:00<00:00, 615257.92it/s]

['sex', 'suicide', 'software', 'empathy', 'machine learning', 'asteroid', 'violin', 'compassion', 'depression', 'interface design']
415





## Setup a MultiLabelBinarizer
from [Scikit Learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html)

In [42]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb.fit([topics])
print(len(mlb.classes_))
mlb.classes_

415


array(['"Alzheimers"', '3d printing', 'AI', 'AIDS', 'Addiction', 'Africa',
       'Anthropocene', 'Asia', 'Autism spectrum disorder', 'Bioethics',
       'Blindness', 'Brand', 'Brazil', 'Buddhism', 'CRISPR',
       'Christianity', 'Criminal Justice', 'DNA', 'Debate', 'Egypt',
       'Europe', 'Foreign Policy', 'Gender equality', 'Gender spectrum',
       'God', 'Google', 'Guns', 'HIV', 'Human body', 'Internet', 'Iran',
       'Islam', 'LGBT', 'MacArthur grant', 'Mars', 'Middle East', 'Moon',
       'NASA', 'Natural resources', 'New York', 'Nobel prize', 'PTSD',
       'Planets', 'Senses', 'Slavery', 'South America', 'String theory',
       'Surgery', 'Surveillance', 'Syria', 'TED Books', 'TED Brain Trust',
       'TED Fellows', 'TED Prize', 'TED Residency', 'TED en Español',
       'TED-Ed', 'TEDMED', 'TEDNYC', 'TEDYouth', 'TEDx', 'Transgender',
       'United States', 'Vaccines', 'activism', 'adventure',
       'advertising', 'aging', 'agriculture', 'aircraft', 'algorithm',
       'al

## Transform the Ted Talk Tags using the MLB

In [60]:
# first a test
print([ted['tags'][0]])
print(mlb.transform([ted['tags'][0]])[0])

[['children', 'creativity', 'culture', 'dance', 'education', 'parenting', 'teaching']]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0]


In [63]:
tag_encoding = []
for tags in tqdm(ted['tags']):
    tag_encoding.append(mlb.transform([tags])[0])
len(tag_encoding)

100%|██████████| 2461/2461 [00:00<00:00, 27037.63it/s]


2461

In [64]:
ted['tag_encoding'] = tag_encoding
ted.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,...,urlurl,views,transcript,urlurl.1,transcript_cleaned,tokens,transcript_n_entries,token_encoding,embeddings,tag_encoding
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,...,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,"good morning. you? it great, it? blown away wh...","['good', 'morning.', 'you?', 'it', 'great,', '...",1484,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.0063128383, 0.009325745, 0.00080046355, 0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,...,https://www.ted.com/talks/al_gore_on_averting_...,3200520,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,"thank much, chris. truly great honor opportuni...","['thank', 'much,', 'chris.', 'truly', 'great',...",1012,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0005233243, 0.004493335, -0.0035742647, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,...,https://www.ted.com/talks/david_pogue_says_sim...,1636292,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,"(music: the sound silence, simon & garfunkel)h...","['(music:', 'the', 'sound', 'silence,', 'simon...",1673,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.003940201, 0.004838261, -0.008605291, 0.01...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,...,https://www.ted.com/talks/majora_carter_s_tale...,1697550,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,todayhappyheard sustainable development save u...,"['todayhappyheard', 'sustainable', 'developmen...",1624,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.00033427178, 0.0035245598, -0.0059836516, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,...,https://www.ted.com/talks/hans_rosling_shows_t...,12005869,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,"10 years ago, took task teach global developme...","['10', 'years', 'ago,', 'took', 'task', 'teach...",1535,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[-0.0021584958, 0.0010347426, 0.0119885225, 0....","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


## Train a simple Model
Note that we do not want to use a tree based model with our encoded targets because that can cause the tree to be very sparse.

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [86]:
X = ted['embeddings']
# y = ted['tag_encoding']
y = mlb.transform(ted['tags'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [91]:
X_train

226     [-0.0076125725, 0.008616281, -0.0018003412, -0...
1313    [0.004246713, -0.011347116, -0.0072755693, 0.0...
2428    [-0.006065508, 0.005633655, -0.0007973455, -0....
715     [-0.0035077282, -0.0021022998, 0.01033441, -0....
1279    [0.011278771, -0.0070805876, -0.003374701, -0....
Name: embeddings, dtype: object

In [92]:
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [93]:
# X_train = X_train.iloc[0:5]
# y_train = y_train.iloc[0:5]
# X_train

In [94]:
# y_train

In [95]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
# clf = MultiOutputClassifier(LogisticRegression()).fit(X_train, y_train)
# clf = OneVsRestClassifier(LinearSVC()).fit(X_train, y_train)
clf = MultiOutputClassifier(LinearSVC()).fit(X_train, y_train)

ValueError: setting an array element with a sequence.

In [66]:
# clf = RandomForestClassifier(max_depth=2, random_state=0)
# clf.fit(X_train, y_train)

ValueError: setting an array element with a sequence.