Useful links

1. For the architecture: https://towardsdatascience.com/deep-learning-for-specific-information-extraction-from-unstructured-texts-12c5b9dceada
2. https://androidkt.com/multi-label-text-classification-in-tensorflow-keras/
3. https://keras.io/preprocessing/sequence/
4. https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/ (not really useful)
5. For deep learning using word embeddings: https://stackabuse.com/python-for-nlp-multi-label-text-classification-with-keras/



In [1]:
import spacy
import pandas as pd
from tqdm import tqdm

In [2]:
# Directory holding the processed data, relative to this notebook.
# The trailing slash matters: the path is built by plain string concatenation.
DATA_DIR = "../../data/processed/"
# Parquet file that is read into `df`.
INPUT_FILE_NAME = 'cleaned_squash3_with_pos_ner.parquet'

In [3]:
# Read the processed talks table; DATA_DIR ends with '/', so concatenating
# the file name yields the full path.
df = pd.read_parquet(DATA_DIR + INPUT_FILE_NAME)
# Preview the first rows.
df.head()

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,squash_tags,squash2_tags,squash3_tags,pos_sequence,ner_sequence
0,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0,"[thank, chris, truly, great, honor, opportunit...",thank chris truly great honor opportunity come...,"culture,politics,science,climate change,enviro...","culture,politics,science,global issues,environ...","culture,politics,science,global issues,environ...",VERB PROPN ADV ADJ NOUN NOUN VERB NOUN ADV ADV...,PERSON ORG ORG GPE LOC ORG PRODUCT GPE GPE PER...
1,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0,"[term, invention, like, tell, tale, favorite, ...",term invention like tell tale favorite project...,"invention,engineering,design,global issues","invention,engineering,design,global issues","invention,design,global issues",NOUN NOUN SCONJ VERB PROPN ADJ NOUN VERB NOUN ...,GPE DATE CARDINAL DATE ORG PERSON LOC ORG GPE ...
2,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0,"[public, dewey, long, ago, observe, constitute...",public dewey long ago observe constitute discu...,"poverty,economics,culture,politics,policy,glob...","inequality,economics,culture,politics,governme...","inequality,economics,culture,politics,global i...",ADJ PROPN ADV ADV VERB ADJ NOUN NOUN PROPN PRO...,DATE NORP ORDINAL DATE MONEY DATE DATE DATE EV...
3,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0,"[want, start, say, houston, problem, enter, se...",want start say houston problem enter second ge...,"invention,engineering,entrepreneur,design,busi...","invention,engineering,entrepreneur,design,busi...","invention,design,business",VERB NOUN VERB PROPN NOUN VERB ADJ NOUN NOUN N...,GPE ORDINAL ORG PERSON DATE DATE DATE TIME PER...
4,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0,"[want, talk, background, idea, car, art, actua...",want talk background idea car art actually mea...,"invention,design,technology,business,art","invention,design,technology,business,art","invention,design,technology,business,art",VERB NOUN NOUN NOUN NOUN NOUN ADV ADJ NOUN NOU...,PERSON PRODUCT ORG ORG PERSON PERSON PERSON OR...


In [4]:
# Column dtypes and non-null counts for the first 14 columns.
df.iloc[:,:14].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2328 entries, 0 to 2327
Data columns (total 14 columns):
speaker                    2328 non-null object
headline                   2328 non-null object
description                2328 non-null object
duration                   2328 non-null object
tags                       2328 non-null object
transcript                 2328 non-null object
WC                         2328 non-null float64
clean_transcript           2328 non-null object
clean_transcript_string    2328 non-null object
squash_tags                2328 non-null object
squash2_tags               2328 non-null object
squash3_tags               2328 non-null object
pos_sequence               2328 non-null object
ner_sequence               2328 non-null object
dtypes: float64(1), object(13)
memory usage: 254.8+ KB


In [5]:
def print_full_dataframe(x):
    """Print every row of ``x`` without pandas' row truncation.

    ``display.max_rows`` is raised to ``len(x)`` only while printing. The
    previous setting is restored afterwards (rather than being reset to the
    pandas default), even if printing raises.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Object to print in full.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)
    
def compute_tag_ratio(target_column, df=df):
    """Count how often each tag occurs in a comma-separated tag column.

    Parameters
    ----------
    target_column : str
        Name of the column in ``df`` holding comma-separated tags.
    df : pandas.DataFrame
        Table with one row per talk.

    Returns
    -------
    pandas.DataFrame
        One row per distinct tag, most frequent first, with columns:
        ``target_column`` (the tag), ``counts`` (occurrences),
        ``no_count`` (``len(df) - counts``), ``ratio``
        (``counts / no_count``) and ``overall_ratio`` (``counts / len(df)``).
    """
    # Flatten every row's tag string into one long list of tags.
    joined_tags = df[target_column].str.cat(sep=',').split(',')
    # Normalise each tag on its own so whitespace on either side of a comma
    # cannot create a spurious distinct tag.
    all_tags = pd.Series(joined_tags).str.strip().str.lower()
    # Blank tag strings and stray commas leave empty entries; they are not tags.
    all_tags = all_tags[all_tags != '']

    total_talks = len(df)
    tag_counts = all_tags.value_counts().rename_axis(target_column).reset_index(name='counts')
    tag_counts['no_count'] = total_talks - tag_counts['counts']
    tag_counts['ratio'] = tag_counts['counts'] / tag_counts['no_count']
    # counts + no_count is always len(df), so divide by it directly.
    tag_counts['overall_ratio'] = tag_counts['counts'] / total_talks
    return tag_counts

# Tag frequencies for the 'squash3_tags' column, printed without row truncation.
squashed_tag_counts = compute_tag_ratio('squash3_tags', df)
print_full_dataframe(squashed_tag_counts)

     squash3_tags  counts  no_count     ratio  overall_ratio
0         culture    1106      1222  0.905074       0.475086
1         science     868      1460  0.594521       0.372852
2      technology     787      1541  0.510707       0.338058
3   global issues     679      1649  0.411765       0.291667
4          design     400      1928  0.207469       0.171821
5        business     329      1999  0.164582       0.141323
6   entertainment     285      2043  0.139501       0.122423
7             art     261      2067  0.126270       0.112113
8          future     218      2110  0.103318       0.093643
9    biodiversity     215      2113  0.101751       0.092354
10      education     206      2122  0.097078       0.088488
11  communication     185      2143  0.086328       0.079467
12       politics     183      2145  0.085315       0.078608
13       humanity     164      2164  0.075786       0.070447
14  collaboration     163      2165  0.075289       0.070017
15           life     15

# 3. Feature Extraction via Deep learning

## 3.1 Create one hot encoding

In [6]:
# from sklearn.preprocessing import MultiLabelBinarizer

# y = []
# for index, row in df.iterrows():
#     y.append(set(row['squash3_tags'].split(',')))
    
# mlb = MultiLabelBinarizer()
# encoded_y = mlb.fit_transform(y)

In [7]:
# print(encoded_y[0])
# print(len(encoded_y[0]))

In [8]:
# Flatten every talk's comma-separated 'squash3_tags' string into one list.
joined_tags = df['squash3_tags'].str.cat(sep=',').split(',')
normalized_tags = pd.Series(joined_tags).str.strip().str.lower()
# dict.fromkeys de-duplicates while keeping first-seen order, which fixes the
# column order of the one-hot encoding. Empty entries (from blank tag strings
# or stray commas) are filtered out explicitly instead of via a bare except.
all_tags = [tag for tag in dict.fromkeys(normalized_tags) if tag != '']
print(all_tags)
print(len(all_tags))

['culture', 'politics', 'science', 'global issues', 'environment', 'technology', 'invention', 'design', 'inequality', 'economics', 'business', 'art', 'biodiversity', 'music', 'entertainment', 'collaboration', 'education', 'history', 'future', 'communication', 'community', 'activism', 'children', 'brain', 'humanity', 'life']
26


In [9]:
def create_one_hot_encode(df=df):
    """Build a frame of text features plus one 0/1 column per tag.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'headline', 'clean_transcript_string',
        'pos_sequence', 'ner_sequence' and 'squash3_tags' (comma-separated).

    Returns
    -------
    pandas.DataFrame
        Columns 'headline', 'transcript', 'pos_sequence', 'ner_sequence'
        followed by one column per entry of the module-level ``all_tags``,
        set to 1 when the talk carries that tag and 0 otherwise.

    Raises
    ------
    ValueError
        If a talk has a tag that is not present in ``all_tags``.
    """
    feature_columns = ['headline', 'transcript', 'pos_sequence', 'ner_sequence']
    complete_transcripts_tags = []
    for _, talk in df.iterrows():
        # Size the vector from the vocabulary instead of a hard-coded 26 so it
        # always matches the tag columns appended below.
        one_hot_encoding = [0] * len(all_tags)
        for raw_tag in talk['squash3_tags'].split(','):
            # Normalise the same way all_tags was built (strip both sides,
            # lower-case) before testing for emptiness or looking it up.
            tag = raw_tag.strip().lower()
            if not tag:
                continue
            one_hot_encoding[all_tags.index(tag)] = 1
        features = [
            talk['headline'],
            talk['clean_transcript_string'],
            talk['pos_sequence'],
            talk['ner_sequence'],
        ]
        complete_transcripts_tags.append(features + one_hot_encoding)
    return pd.DataFrame(complete_transcripts_tags, columns=feature_columns + all_tags)

In [10]:
# NOTE: this rebinds `df` from the raw talks table to the one-hot encoded
# frame. Called without arguments, create_one_hot_encode uses the `df` object
# that was bound as its default when the function was defined.
df = create_one_hot_encode()
df

Unnamed: 0,headline,transcript,pos_sequence,ner_sequence,culture,politics,science,global issues,environment,technology,...,education,history,future,communication,community,activism,children,brain,humanity,life
0,Averting the climate crisis,thank chris truly great honor opportunity come...,VERB PROPN ADV ADJ NOUN NOUN VERB NOUN ADV ADV...,PERSON ORG ORG GPE LOC ORG PRODUCT GPE GPE PER...,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Simple designs to save a life,term invention like tell tale favorite project...,NOUN NOUN SCONJ VERB PROPN ADJ NOUN VERB NOUN ...,GPE DATE CARDINAL DATE ORG PERSON LOC ORG GPE ...,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,How to rebuild a broken state,public dewey long ago observe constitute discu...,ADJ PROPN ADV ADV VERB ADJ NOUN NOUN PROPN PRO...,DATE NORP ORDINAL DATE MONEY DATE DATE DATE EV...,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The real future of space exploration,want start say houston problem enter second ge...,VERB NOUN VERB PROPN NOUN VERB ADJ NOUN NOUN N...,GPE ORDINAL ORG PERSON DATE DATE DATE TIME PER...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Great cars are great art,want talk background idea car art actually mea...,VERB NOUN NOUN NOUN NOUN NOUN ADV ADJ NOUN NOU...,PERSON PRODUCT ORG ORG PERSON PERSON PERSON OR...,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2323,Why glass towers are bad for city life -- and ...,imagine walk even discover everybody room look...,VERB NOUN ADV VERB PRON NOUN VERB ADV NOUN NOU...,ORG GPE ORG GPE GPE GPE GPE GPE GPE GPE GPE PE...,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2324,What happens in your brain when you pay attent...,pay close attention easy attention pull differ...,VERB ADJ NOUN ADJ NOUN VERB ADJ NOUN NOUN NOUN...,ORDINAL PERSON PRODUCT DATE EVENT,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2325,Why you should define your fears instead of yo...,happy pic take senior college right dance prac...,ADJ PROPN VERB ADJ NOUN ADJ NOUN NOUN ADJ VERB...,DATE PERSON ORG PERSON PERSON GPE PERSON GPE O...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2326,12 truths I learned from life and writing,sevenyearold grandson sleep hall wake lot morn...,PROPN PROPN PROPN PROPN VERB NOUN NOUN VERB VE...,PERSON PERSON PERSON PERSON PERSON DATE CARDIN...,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [11]:
def get_target_column(target_tag, df=df):
    """Return the four text feature columns of ``df`` plus the ``target_tag`` column."""
    feature_columns = ['headline', 'transcript', 'pos_sequence', 'ner_sequence']
    return df[feature_columns + [target_tag]]

# Tag being modelled; pass e.g. 'technology' instead to target another tag.
single_class = get_target_column('culture', df)

In [12]:
# df_x = single_class[['transcript']]
# df_y = df[['technology']]

In [13]:
# Inputs are the transcript strings; labels are the 0/1 'culture' column.
# Use another tag column (e.g. 'technology') here to train for a different tag.
df_x = single_class['transcript'].tolist()
df_y = single_class['culture'].tolist()
print(df_x[0])

thank chris truly great honor opportunity come stage twice extremely grateful blow away conference want thank nice comment night sincerely partly mock sob need position fly air force year shoe boot airplane tell quick story illustrate like true story bite true soon tipper leave mock sob white house drive home nashville little farm mile east nashville drive know sound like little thing look rearview mirror sudden hit motorcade hear phantom limb pain rent ford taurus dinnertime start look place eat get exit lebanon tennessee get exit shoneys restaurant lowcost family restaurant chain know go sit booth waitress come big commotion tipper take order go couple booth low voice strain hear say say yes vice president al gore wife tipper man say come long way kind series epiphany day continue totally true story get gv fly africa speech nigeria city lagos topic energy begin speech tell story happen day nashville tell pretty way share tipper drive shoneys lowcost family restaurant chain man say la

## 3.2 Perform train test split

In [14]:
from sklearn.model_selection import train_test_split
# Split transcripts and labels with a fixed seed for reproducibility.
# Note the naming: `valid_y` holds the labels that pair with `X_test`.
X_train, X_test, train_y, valid_y = train_test_split(df_x, df_y, random_state=1000)

In [15]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, Conv1D, MaxPooling1D
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import numpy as np

Using TensorFlow backend.


## 3.3 Use word embeddings for the main transcript

In [16]:
# Fixed sequence length fed to the model: too many and the model cant tell the difference.
maxlen = 800

# Fit the vocabulary on the training transcripts only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# +1 because index 0 is used for padding below and never assigned to a word.
vocab_size = len(tokenizer.word_index) + 1

# Text -> integer id sequences, using the vocabulary fitted above.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Zero-pad at the end up to `maxlen`.
# NOTE(review): the truncation side is left at the Keras default - confirm
# which end of an over-long transcript is meant to be kept.
X_train = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
X_test = pad_sequences(test_sequences, padding='post', maxlen=maxlen)

In [17]:
from numpy import array
from numpy import asarray
from numpy import zeros

# word -> pretrained vector, parsed from the GloVe text file
# (each line: the word followed by its vector components).
embeddings_dictionary = dict()

# `with` guarantees the file is closed even if parsing a line fails.
with open('./glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i holds the vector of the word whose tokenizer index is i; words that
# are missing from the GloVe file (and row 0) stay all-zero.
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [18]:
# Sanity check: one padded id sequence and the overall array shape.
print(X_train[1])
print(X_train.shape)

[ 154  591   35 2075   35  260 1407 4414  597   31 1734 1346  464   73
  154    2 3195  395 1382 3195  215  587 1451  531 1704 2316  205  101
  350  587 4256  504   21 3271  330  233  272  315 4505   11  632  829
   53  909    2  487  512 2229 1201  279  450  106   69 1825 2145  902
  152  178 1017 1601    2 1734 1346 4415 1346  122 1734  196 1123 2944
 1991  279  450   87 1734 1346 1066   35 1916 2903  279  142  183  673
  198 2184  408  301  279  450  151  198  193  305  420    1 4865 1124
  488  125 3049   21  594   73    1 1445 2719 4506  122   31  122   15
   10  378 2833  165 2567 4506  165  179   58   48   98  599   16  131
   36  254 4506 1337  177 2834 2591   83  271  356  533   91 1293   35
  279  614  614  224  352  533    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [19]:
# Sanity check: label of the first training example.
print(train_y[0])

1


# Model

In [20]:
# Binary classifier: frozen pretrained embeddings -> LSTM -> sigmoid unit.
deep_inputs = Input(shape=(maxlen,))
# trainable=False keeps the pretrained vectors fixed during training.
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(1, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# train_y is a plain Python list; pass it as an array, because newer Keras
# versions reject list targets paired with array inputs.
history = model.fit(X_train, np.asarray(train_y), batch_size=128, epochs=4, verbose=1, validation_split=0.2)












Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 1396 samples, validate on 350 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [21]:
# model_glove = Sequential()
# model_glove.add(Embedding(vocab_size, 100, input_length=3000, weights=[embedding_matrix], trainable=False))(Input(shape=(maxlen,)))
# model_glove.add(Dropout(0.2))
# model_glove.add(Conv1D(64, 5, activation='relu'))
# model_glove.add(MaxPooling1D(pool_size=4))
# model_glove.add(LSTM(100))
# model_glove.add(Dense(1, activation='sigmoid'))
# model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# ## Fit train data
# model_glove.fit(X_train, np.array(train_y), validation_split=0.2, epochs = 3)

In [22]:
# Model outputs for the held-out transcripts (thresholded in a later cell).
predictions = model.predict(X_test)


In [58]:
def get_tag(threshold, predictions=predictions):
    """Binarise scores: 1 where a score is strictly above ``threshold``, else 0.

    Returns a list with one list of 0/1 ints per row of ``predictions``.
    """
    flags_per_row = []
    for row in predictions:
        flags_per_row.append([int(score > threshold) for score in row.tolist()])
    return flags_per_row

# Scores above 0.45 count as a positive prediction.
predictions_flushed = get_tag(0.45, predictions)

In [59]:
def compute_tp_tn_fp_fn(y_test, y_pred, classes):
    '''
    Tally true/false positives and negatives for each tag.

    Parameters
    ----------
    y_test : sequence
        Ground truth, one entry per transcript. An entry is either a single
        0/1 label (single-tag case, scored against ``classes[0]``) or a
        sequence of 0/1 labels aligned position-by-position with ``classes``.
    y_pred : sequence of sequences
        0/1 predictions, one row per transcript, aligned with ``classes``.
    classes : list
        Tag names; position ``i`` names column ``i`` of the label rows.

    Return:
    pre_score = {
        'tag_1': {
            'index': ,
            'tp': ,
            'tn': ,
            'fp': ,
            'fn':
        }
    }

    Labels other than 0 and 1 are ignored.
    '''
    # Create dictionary of tags
    pre_score = {
        tag: {'index': index_tag, 'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0}
        for index_tag, tag in enumerate(classes)
    }
    for transcript_index, truth_row in enumerate(y_test):
        predicted_row = y_pred[transcript_index]
        # A scalar label is treated as a one-element row, so the single-tag
        # call style keeps working; real label rows are scored per tag.
        try:
            truths = list(truth_row)
        except TypeError:
            truths = [truth_row]
        for tag, truth, predicted in zip(classes, truths, predicted_row):
            if truth == 1:
                outcome = 'tp' if predicted == truth else 'fn'
            elif truth == 0:
                outcome = 'tn' if predicted == truth else 'fp'
            else:
                continue
            pre_score[tag][outcome] += 1
    return pre_score
# Confusion counts for the 'culture' tag; `valid_y` holds the held-out labels.
# Pass ['technology'] instead when that tag is the one being modelled.
scores_preprocess = compute_tp_tn_fp_fn(valid_y, predictions_flushed, ['culture'])

In [60]:
def compute_precision_recall_f1(preprocessed_scores):
    """Add rounded precision, recall and F1 to each tag's confusion counts.

    Parameters
    ----------
    preprocessed_scores : dict
        Maps tag -> dict with at least the keys 'tp', 'fp' and 'fn'.

    Returns
    -------
    dict
        The same dict object, updated in place: each tag's dict gains
        'precision', 'recall' and 'f1', each rounded to 2 decimals. A metric
        whose denominator is zero is reported as 0.0 and a message is printed.

    Raises
    ------
    KeyError
        If a tag's dict lacks one of the required counts (no longer swallowed).
    """
    for key, value in preprocessed_scores.items():
        # Only a zero denominator is an expected failure; anything else
        # (e.g. a missing count) is a real error and must surface.
        try:
            precision = value['tp'] / (value['tp'] + value['fp'])
        except ZeroDivisionError:
            print('precision issue: {}'.format(key))
            precision = 0.0
        try:
            recall = value['tp'] / (value['tp'] + value['fn'])
        except ZeroDivisionError:
            print('recall issue: {}'.format(key))
            recall = 0.0
        try:
            f1 = (2 * precision * recall) / (precision + recall)
        except ZeroDivisionError:
            print('f1 issue: {}'.format(key))
            f1 = 0.0
        value['precision'] = round(precision, 2)
        value['recall'] = round(recall, 2)
        value['f1'] = round(f1, 2)
    return preprocessed_scores
# Adds precision/recall/f1 to the per-tag counts; the returned dict is the
# same object as `scores_preprocess`.
final_scores = compute_precision_recall_f1(scores_preprocess)
print(final_scores)

{'culture': {'index': 0, 'tp': 273, 'tn': 13, 'fp': 288, 'fn': 8, 'precision': 0.49, 'recall': 0.97, 'f1': 0.65}}


In [61]:
def print_full_dataframe(x):
    """Print every row of ``x`` without pandas' row truncation.

    ``display.max_rows`` is raised to ``len(x)`` only while printing. The
    previous setting is restored afterwards (rather than being reset to the
    pandas default), even if printing raises.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Object to print in full.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)

In [62]:
def format_scores_df(tag_classes, final_scores=final_scores):
    """Tabulate precision, recall, F1 and accuracy for the given tags.

    Returns a DataFrame with columns 'class', 'precision', 'recall', 'f1' and
    'accuracy', one row per entry of ``tag_classes``. Accuracy is
    ``(tp + tn) / (tp + tn + fp + fn)`` from that tag's counts in
    ``final_scores``.
    """
    rows = []
    for tag in tag_classes:
        tag_scores = final_scores[tag]
        precision = tag_scores['precision']
        recall = tag_scores['recall']
        f1 = tag_scores['f1']
        correct = tag_scores['tp'] + tag_scores['tn']
        total = tag_scores['tp'] + tag_scores['tn'] + tag_scores['fp'] + tag_scores['fn']
        rows.append((tag, precision, recall, f1, correct / total))
    return pd.DataFrame(rows, columns=['class', 'precision', 'recall', 'f1', 'accuracy'])

# Results for the 'culture' tag; pass ['technology'] when modelling that tag instead.
df_results = format_scores_df(['culture'], final_scores)
print_full_dataframe(df_results)

     class  precision  recall    f1  accuracy
0  culture       0.49    0.97  0.65  0.491409
