In [17]:
# Hyper-parameters used by the cells below.
LSTM_Units=256  # unit count handed to each LSTM layer in Build_Model
Max_Allowed_Sequence_Length=512  # transcripts are split into chunks of at most this many tokens
Target_Label_Weights=[0,.5/7.8,.5/1] #first element is for padding, 2nd for normal token, 3rd for sentence ending token

In [2]:
#The 7.8 in Target_Label_Weights above is the estimated number of normal tokens per sentence-ending token; it comes from the following SQL.
# However, the SQL counts words, not tokens,
# and it doesn't handle abbreviations with periods or repeated sentence enders like !!! or ??
#
#Select Avg(Words/Avg_Sentences) As Average_Words_Per_Sentence 
#	From (select Document_Id,Sum(Occurrence_Count) As Words 
#			From sys.dm_fts_index_keywords_by_document(db_id('TrainingCorpus'),object_id('TedTalk')) Word_Stat
#			Where Word_Stat.column_id=(Select Columns.Ordinal_Position 
#										From Information_Schema.COLUMNS 
#										Where TABLE_NAME='TedTalk' And COLUMN_NAME='Transcript')
#			Group By Document_Id) Foo,
#		(Select Avg(1.0*Len(Transcript)-Len(Replace(Replace(Replace(Transcript,'!',''),'?',''),'.',''))) As Avg_Sentences 
#			From TedTalk) Bar

In [3]:
def Get_Corpus_Transcripts():
    """Read every non-null transcript from the TedTalk table of the local TrainingCorpus database.

    Returns:
        dict mapping TextIdxKey -> Transcript_lower_punc_only_sentence_boundary.

    The cursor and the connection are closed even when the query fails, so a
    failed cell run does not leave an open connection behind in the kernel.
    """
    import pyodbc
    db = pyodbc.connect(r'Driver={SQL Server};Server=(local);Database=TrainingCorpus;Trusted_Connection=yes;',autocommit=True)
    try:
        dblink=db.cursor()
        try:
            Transcripts_Rows=dblink.execute("Select Transcript_lower_punc_only_sentence_boundary,TextIdxKey \
                                        From TedTalk \
                                        Where Transcript_lower_punc_only_sentence_boundary is not null").fetchall()
        finally:
            dblink.close()
    finally:
        db.close()
    # Column 0 is the transcript text, column 1 is its key.
    return {row[1]:row[0] for row in Transcripts_Rows}

In [4]:
# Load the transcripts once; the result is the dict returned by Get_Corpus_Transcripts.
Transcripts=Get_Corpus_Transcripts()

In [5]:
def Build_Corpus_Dictionary(Transcripts):
    """Build, or load from cache, the token -> integer index mapping for the corpus.

    Args:
        Transcripts: dict whose values are transcript strings. The strings are
            run through the NLTK Treebank tokenizer and lower-cased.

    Returns:
        dict mapping token -> index. '<PAD>' is 0 and '<OOV>' is 1; corpus
        tokens are numbered from 2 upward in descending order of frequency.

    The mapping is cached in 'Corpus_Token.pickle' in the working directory.
    When that file exists it is loaded and Transcripts is ignored, so delete
    the file to force a rebuild.
    """
    import pickle, os
    if not os.path.exists('Corpus_Token.pickle'):
        import collections,nltk
        Corpus_Tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
        Corpus_Token_Frequency = collections.Counter()
        for Transcript in Transcripts.values():
            Corpus_Token_Frequency.update(Token.lower() for Token in Corpus_Tokenizer.tokenize(Transcript))

        # Reserved entries are keyed by token like every other entry, because the
        # rest of the notebook looks tokens up by string and reads the index back.
        Corpus_Token_Index={'<PAD>':0,'<OOV>':1}
        # Index 2 is the most frequent token, 3 the next most frequent, and so on.
        for Token_Number,(Token,_) in enumerate(Corpus_Token_Frequency.most_common(),start=2):
            Corpus_Token_Index[Token]=Token_Number
        with open('Corpus_Token.pickle','wb') as Save_Corpus_Token:
            pickle.dump(Corpus_Token_Index,Save_Corpus_Token)
    else:
        # NOTE: pickle can execute arbitrary code; only load a cache file this notebook wrote.
        with open('Corpus_Token.pickle','rb') as Saved_Corpus_Token:
            Corpus_Token_Index=pickle.load(Saved_Corpus_Token)
    return(Corpus_Token_Index)

In [6]:
# Build the corpus vocabulary, or load it from 'Corpus_Token.pickle' if that cache file exists.
Corpus_Token_Index=Build_Corpus_Dictionary(Transcripts)

In [7]:
def Build_GloVe_Encoded_Corpus(Corpus_Token_Index,Transcripts):
    """Build, or load from cache, the 300-d GloVe embedding matrix for the corpus vocabulary.

    Args:
        Corpus_Token_Index: dict mapping token -> row index. Indices 0 and 1 are
            treated as the reserved padding / out-of-vocabulary rows.
            The dict is updated in place (see Returns).
        Transcripts: not used; kept so existing callers keep working.

    Returns:
        (Corpus_Token_Index, GloVe_Corpus_Index) where
        - GloVe_Corpus_Index is a float32 array of shape (len(Corpus_Token_Index)+2, 300).
          Row 0 stays zero, row 1 holds the mean of the GloVe vectors found for
          corpus tokens, and every other row is the token's GloVe vector
          (zero when the token is not in GloVe).
        - Corpus_Token_Index has every token without a GloVe vector re-pointed
          to index 1 so it shares the out-of-vocabulary embedding.

    The matrix is cached in 'glove300.pickle'; delete that file to rebuild it from
    'glove.42B.300d.txt'.
    """
    import numpy,os,pickle
    #Associate Corpus token with GloVe vector
    if not os.path.exists('glove300.pickle'):
        GloVe_Corpus_Index = numpy.zeros(shape=(len(Corpus_Token_Index)+2,300),dtype=numpy.float32)
        Row_Has_Vector = numpy.zeros(GloVe_Corpus_Index.shape[0],dtype=bool)
        with open('glove.42B.300d.txt', encoding="utf8") as GloVe:  #uncased
            for Token_Vector in GloVe:
                Fields = Token_Vector.split()
                if len(Fields)<301:
                    continue  # blank or truncated line
                Token = Fields[0]
                Token_Number = Corpus_Token_Index.get(Token)
                # Never overwrite the reserved padding / OOV rows.
                if isinstance(Token_Number,(int,numpy.integer)) and Token_Number>=2:
                    GloVe_Corpus_Index[Token_Number]=numpy.array(Fields[-300:], dtype=numpy.float32)
                    Row_Has_Vector[Token_Number]=True
        # OOV gets the average GloVe vector. Only rows that actually received a
        # vector are averaged, so all-zero rows do not pull the mean toward zero.
        if Row_Has_Vector.any():
            GloVe_Corpus_Index[1]=GloVe_Corpus_Index[Row_Has_Vector].mean(axis=0)
        with open('glove300.pickle','wb') as Save_Glove:
            pickle.dump(GloVe_Corpus_Index,Save_Glove)
    else:
        # NOTE: pickle can execute arbitrary code; only load a cache file this notebook wrote.
        with open('glove300.pickle','rb') as Saved_Glove:
            GloVe_Corpus_Index=pickle.load(Saved_Glove)

    #Set all words in Corpus_Token_Index that aren't in GloVe to index 1 (OOV) so they can be given the average GloVe Vector
    for Token,Token_Number in Corpus_Token_Index.items():
        if not isinstance(Token_Number,(int,numpy.integer)) or Token_Number<2:
            continue  # reserved entries keep their own index
        if not GloVe_Corpus_Index[Token_Number].any():  ##Corpus_Token_Not_In_GloVe
            Corpus_Token_Index[Token]=1
    return(Corpus_Token_Index,GloVe_Corpus_Index)

In [8]:
# Build (or load) the embedding matrix; Corpus_Token_Index is rebound to the dict the function returns.
Corpus_Token_Index,GloVe_Corpus_Index=Build_GloVe_Encoded_Corpus(Corpus_Token_Index,Transcripts)

In [9]:
def Training_Text_To_Sequences(Transcripts,Corpus_Token_Index,Max_Allowed_Sequence_Length):
    """Turn transcripts into padded integer token sequences and per-token labels.

    Each transcript is tokenized with the NLTK Treebank tokenizer. Any token
    containing '.', '?' or '!' is treated as a sentence ender: it is not emitted
    as an input token; instead the label of the preceding token becomes 2.
    Every other token is emitted with label 1. Transcripts longer than
    Max_Allowed_Sequence_Length tokens are split into consecutive chunks.

    Args:
        Transcripts: dict mapping transcript id -> transcript string.
        Corpus_Token_Index: dict mapping lower-cased token -> integer index;
            tokens missing from it are encoded as 1 (out of vocabulary).
        Max_Allowed_Sequence_Length: maximum number of tokens per chunk.

    Returns:
        (Padded_Transcripts_Labels, Padded_Transcripts_Integers, Longest_Sequence,
         Transcripts_Labels_Tokens)
        - the two arrays hold one row per chunk, right-padded with 0 to Longest_Sequence
        - Longest_Sequence is the length of the longest chunk produced
        - Transcripts_Labels_Tokens maps (transcript id, chunk number) -> (Labels, Tokens)
    """
    import nltk,numpy
    Tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    Transcripts_Labels_Tokens=dict()
    for Transcript_Id,Transcript in Transcripts.items():
        Labels=[];Tokens=[]
        Transcript_Subset_Id=0
        for Token in Tokenizer.tokenize(Transcript):
            if any(Character in Token for Character in ('.','?','!')):
                # Mark the previous token as sentence ending. Repeated enders
                # (e.g. !?!?) simply set the same label again. An ender with no
                # preceding token (start of transcript) has nothing to mark and
                # is dropped rather than fed to the model as an input token.
                if Labels:
                    Labels[-1]=2
                continue
            if len(Tokens)==Max_Allowed_Sequence_Length:
                # Chunk is full: store it and start the next chunk. This is done
                # lazily, on the next word token, so a sentence ender that follows
                # a full chunk still labels that chunk's last token.
                Transcripts_Labels_Tokens[Transcript_Id,Transcript_Subset_Id]=(Labels,Tokens)
                Labels=[];Tokens=[]
                Transcript_Subset_Id+=1
            Tokens.append(Corpus_Token_Index.get(Token.lower(),1)) #Handle OOV token
            Labels.append(1)
        Transcripts_Labels_Tokens[Transcript_Id,Transcript_Subset_Id]=(Labels,Tokens)

    # Pad to the longest chunk over ALL transcripts, not just the last one processed.
    Longest_Sequence=max((len(Labels) for Labels,_ in Transcripts_Labels_Tokens.values()),default=0)

    Padded_Transcripts_Labels=numpy.array([Labels+[0]*(Longest_Sequence-len(Labels))
                                           for Labels,_ in Transcripts_Labels_Tokens.values()])
    Padded_Transcripts_Integers=numpy.array([Tokens+[0]*(Longest_Sequence-len(Tokens))
                                             for _,Tokens in Transcripts_Labels_Tokens.values()])

    return(Padded_Transcripts_Labels,Padded_Transcripts_Integers,Longest_Sequence,Transcripts_Labels_Tokens)

In [10]:
import pickle

# Encode the corpus; the fourth return value (the per-chunk dict) is not needed here.
(Transcripts_Labels_Array,
 Transcripts_Integers_Array,
 Longest_Sequence,
 _) = Training_Text_To_Sequences(Transcripts,Corpus_Token_Index,Max_Allowed_Sequence_Length)

# Persist the settings that the prediction program needs.
with open('Sentence_Restoration_Variables.pickle','wb') as Sentence_Restoration_Variables:
    pickle.dump(
        [LSTM_Units,Longest_Sequence,Target_Label_Weights],
        Sentence_Restoration_Variables,
    )

In [11]:
#almost entirely (but rewritten for clarity) from https://stackoverflow.com/a/71265729/6147425    
def Weighted_Loss(Target_Label_Weights):
    """Create a per-token weighted sparse categorical cross-entropy loss.

    Args:
        Target_Label_Weights: sequence indexed by label id; entry i is the
            weight applied to the loss of every token whose true label is i.

    Returns:
        A function (Actual, Predicted) -> tensor holding the unreduced
        cross-entropy of each token multiplied by the weight of its true label.
        The function reads nothing from the notebook's global state, so it
        works for any sequence length.
    """
    def innerLoss(Actual, Predicted):
        import tensorflow
        Per_Token_Loss = tensorflow.keras.losses.SparseCategoricalCrossentropy(
            reduction=tensorflow.keras.losses.Reduction.NONE)(Actual,Predicted)
        # gather needs integer indices; the cast is a no-op when labels are already ints.
        Per_Token_Weights = tensorflow.gather(Target_Label_Weights, tensorflow.cast(Actual, tensorflow.int32))
        # Match the weights to the shape of the unreduced loss instead of relying
        # on a global Longest_Sequence being defined in the kernel.
        return tensorflow.reshape(Per_Token_Weights, tensorflow.shape(Per_Token_Loss)) \
               * \
               Per_Token_Loss
    return innerLoss

In [12]:
def Build_Model(LSTM_Units,Longest_Sequence,GloVe_Corpus_Index,Corpus_Token_Index,
                Target_Label_Weights=(0,.5/7.8,.5/1)):
    """Build and compile the 3-layer bidirectional LSTM sentence-boundary model.

    Architecture: frozen embedding initialised from GloVe_Corpus_Index (index 0
    masked as padding) -> three stacked bidirectional LSTMs returning full
    sequences -> dropout 0.1 -> 3-unit dense layer, one output per label.

    Args:
        LSTM_Units: number of units in each LSTM direction.
        Longest_Sequence: fixed input length in tokens.
        GloVe_Corpus_Index: embedding matrix used as the constant initializer.
        Corpus_Token_Index: vocabulary dict; its length + 2 is the embedding input_dim.
        Target_Label_Weights: loss weight per label id (padding, normal token,
            sentence-ending token). The default equals the values previously
            hard-coded here.

    Returns:
        The compiled Keras model.

    Side effects:
        Saves the freshly initialised weights to 'Temp_Save_Weights.keras' and
        prints the model summary.
    """
    import tensorflow
    Model=tensorflow.keras.models.Sequential(name='BiLSTM_GloVe_Model')
    Model.add(tensorflow.keras.Input(shape=(Longest_Sequence,), dtype='int32',name='Input'))
    Model.add(tensorflow.keras.layers.Embedding(input_dim=len(Corpus_Token_Index) + 2,
                                      output_dim=300,
                                      embeddings_initializer=tensorflow.keras.initializers.Constant(GloVe_Corpus_Index),
                                      input_length=Longest_Sequence,
                                      mask_zero=True,
                                      name='GloVe_300_Dim',
                                      trainable=False))
    # Three identical stacked BiLSTM layers; only the layer name differs.
    for Layer_Number in (1,2,3):
        Model.add(tensorflow.keras.layers.Bidirectional(
            layer=tensorflow.keras.layers.LSTM(units=LSTM_Units,
                                               return_sequences=True,
                                               activation="tanh",
                                               recurrent_activation="sigmoid",
                                               recurrent_dropout=0.0,
                                               unroll=False,
                                               use_bias=True),
            name='LSTM_'+str(LSTM_Units)+'_Seq_'+str(Layer_Number)))
    Model.add(tensorflow.keras.layers.Dropout(rate=.1,name='Dropout_.1'))
    # NOTE(review): sigmoid (not softmax) on a 3-class output feeding a sparse
    # categorical cross-entropy — left unchanged; confirm this is intended.
    Model.add(tensorflow.keras.layers.Dense(units=3,
                                            kernel_initializer='normal',
                                            activation='sigmoid',
                                            name='Dense'))
    Model.compile(loss=Weighted_Loss(Target_Label_Weights=list(Target_Label_Weights)),optimizer='adam')
    Model.save_weights('Temp_Save_Weights.keras')
    Model.summary()  # summary() prints by itself; wrapping it in print() adds a stray "None"
    return Model


In [13]:
# Build and compile the model; this also writes 'Temp_Save_Weights.keras' and prints the summary shown below.
Model=Build_Model(LSTM_Units,Longest_Sequence,GloVe_Corpus_Index,Corpus_Token_Index)

Model: "BiLSTM_GloVe_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 GloVe_300_Dim (Embedding)   (None, 512, 300)          21021600  
                                                                 
 LSTM_256_Seq_1 (Bidirection  (None, 512, 512)         1140736   
 al)                                                             
                                                                 
 LSTM_256_Seq_2 (Bidirection  (None, 512, 512)         1574912   
 al)                                                             
                                                                 
 LSTM_256_Seq_3 (Bidirection  (None, 512, 512)         1574912   
 al)                                                             
                                                                 
 Dropout_.1 (Dropout)        (None, 512, 512)          0         
                                                

In [14]:
def Train_Model(Model,LSTM_Units,Transcripts_Integers_Array,Transcripts_Labels_Array,Longest_Sequence,
                Fold_Count=3,Batch_Size=8,Max_Epochs=40):
    """Cross-validate the model, then refit it on all data and save it.

    For each K-fold split the model is reset to the weights stored in
    'Temp_Save_Weights.keras', trained with early stopping on val_loss (the
    best weights of each fold are checkpointed to disk), and scored with F1 on
    the held-out fold. Afterwards the model is reset again and trained on the
    whole data set for ceil(mean of the best epoch numbers) epochs.

    Args:
        Model: compiled Keras model.
        LSTM_Units, Longest_Sequence: only used to build output file names.
        Transcripts_Integers_Array: padded token ids, one row per sequence; 0 is padding.
        Transcripts_Labels_Array: padded labels aligned with the token ids.
        Fold_Count: number of cross-validation folds.
        Batch_Size: batch size for every fit call.
        Max_Epochs: upper bound on epochs per fold (early stopping may end sooner).

    Side effects:
        Prints progress and F1 scores, writes one checkpoint file per fold and
        the final model file in the working directory. Returns None.
    """
    import statistics,numpy,math,tensorflow
    # Import the sub-packages explicitly: a bare `import sklearn` is not
    # guaranteed to make sklearn.model_selection / sklearn.metrics available.
    import sklearn.model_selection,sklearn.metrics
    Best_Epochs_for_each_split = list()
    F1_for_each_split = list()
    Folds=sklearn.model_selection.KFold(n_splits=Fold_Count,shuffle = True,random_state=42)
    for Cross_Validation_Iteration,(train_index, test_index) in enumerate(
                Folds.split(Transcripts_Integers_Array,Transcripts_Labels_Array)):
        print('Iteration',Cross_Validation_Iteration+1,'of',Fold_Count)
        Model.load_weights('Temp_Save_Weights.keras')  # start every fold from the same initial weights
        Training_History=Model.fit(x=Transcripts_Integers_Array[train_index],
                           y=Transcripts_Labels_Array[train_index],
                           validation_data=(Transcripts_Integers_Array[test_index], Transcripts_Labels_Array[test_index]),
                           verbose=2,
                           epochs=Max_Epochs, #actual epochs may be reduced by EarlyStopping
                           steps_per_epoch = len(Transcripts_Labels_Array[train_index]) // Batch_Size,
                           validation_steps = len(Transcripts_Labels_Array[test_index]) // Batch_Size,
                           batch_size=Batch_Size,
                           callbacks=[tensorflow.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                                              min_delta=0.0001,
                                                                              patience=2,
                                                                              verbose=1,
                                                                              mode="min",
                                                                              restore_best_weights=False),
                                     tensorflow.keras.callbacks.ModelCheckpoint(
                                         filepath="Restore_Sentence_"+str(LSTM_Units)+"unit_Triple_BiLSTM_"\
                                             +str(Longest_Sequence)+"MaxToken_KFold_"+str(Cross_Validation_Iteration+1)+".keras",
                                         monitor='val_loss',
                                         save_weights_only=True,
                                         verbose=1,
                                         options = tensorflow.train.CheckpointOptions(experimental_enable_async_checkpoint=True),
                                         save_best_only=True,
                                         mode='min')])
        print('Model Fit Done')

        # 1-based number of the epoch with the lowest validation loss.
        Best_Epochs_for_each_split.append(1+float(numpy.argmin(Training_History.history['val_loss'])))

        # NOTE(review): restore_best_weights is False, so these predictions come from
        # the last epoch's weights, not the checkpointed best ones — confirm intended.
        Test_Tokens=Transcripts_Integers_Array[test_index]
        Not_Padding=Test_Tokens!=0
        Predicted_Classifications = numpy.argmax(Model.predict(x=Test_Tokens), axis=-1)
        #Model may predict a non-pad is a pad, rare, but it happens so manually correct that until a better way is found
        Predicted_Classifications = numpy.where(Predicted_Classifications==0,1,Predicted_Classifications)

        # Score only real tokens. pos_label=2 makes this the F1 of the
        # sentence-ending class; the default (pos_label=1) would score the
        # majority "normal token" class instead.
        F1=sklearn.metrics.f1_score(y_true=Transcripts_Labels_Array[test_index][Not_Padding],
                                    y_pred=Predicted_Classifications[Not_Padding],
                                    pos_label=2)
        print(F1)
        F1_for_each_split.append(F1)


    print(F1_for_each_split)
    #Assuming F1 for each kfold split is similar, take the mean of the best epoch numbers (rounded up)
    # and compute final fit model using all data
    Model.load_weights('Temp_Save_Weights.keras')
    Model.fit(x=Transcripts_Integers_Array,
              y=Transcripts_Labels_Array,
              epochs=math.ceil(statistics.mean(Best_Epochs_for_each_split)),
              batch_size=Batch_Size,
              verbose=2,
              steps_per_epoch = len(Transcripts_Labels_Array) // Batch_Size
             )
    Model.save('Restore_Sentence_'+str(LSTM_Units)+'_unit_Triple_BiLSTM_'+str(Longest_Sequence)+'MaxToken.keras')

In [15]:
# Run cross-validation and the final fit; the training log below is this call's output.
Train_Model(Model,LSTM_Units,Transcripts_Integers_Array,Transcripts_Labels_Array,Longest_Sequence)

Iteration 1 of 3
Epoch 1/40

Epoch 1: val_loss improved from inf to 0.01601, saving model to Restore_Sentence_256unit_Triple_BiLSTM_512MaxToken_KFold_1.keras
1349/1349 - 307s - loss: 0.0212 - val_loss: 0.0160 - 307s/epoch - 228ms/step
Epoch 2/40

Epoch 2: val_loss improved from 0.01601 to 0.01414, saving model to Restore_Sentence_256unit_Triple_BiLSTM_512MaxToken_KFold_1.keras
1349/1349 - 325s - loss: 0.0143 - val_loss: 0.0141 - 325s/epoch - 241ms/step
Epoch 3/40

Epoch 3: val_loss improved from 0.01414 to 0.01353, saving model to Restore_Sentence_256unit_Triple_BiLSTM_512MaxToken_KFold_1.keras
1349/1349 - 1591s - loss: 0.0120 - val_loss: 0.0135 - 1591s/epoch - 1s/step
Epoch 4/40

Epoch 4: val_loss improved from 0.01353 to 0.01329, saving model to Restore_Sentence_256unit_Triple_BiLSTM_512MaxToken_KFold_1.keras
1349/1349 - 9674s - loss: 0.0101 - val_loss: 0.0133 - 9674s/epoch - 7s/step
Epoch 5/40

Epoch 5: val_loss did not improve from 0.01329
1349/1349 - 449s - loss: 0.0085 - val_loss