In [None]:
import os
from itertools import chain

import numpy as np
import pandas as pd
import seaborn as sns
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tqdm import tqdm

In [None]:
from tensorflow.keras import Input
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer,one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Register tqdm's pandas integration, which provides the progress_apply
# method used on the sentence column further down.
tqdm.pandas()

In [None]:
# Location of the pickled sentence-level DataFrame. Set the environment
# variable EXTRACTIVE_DATA_PATH to point at your own copy; the fallback is the
# original author's local path.
DATA_PATH = os.environ.get(
    "EXTRACTIVE_DATA_PATH",
    "/Users/s0c02nj/Downloads/cnn/dataframe_extractive.pkl",
)

# SECURITY: unpickling can execute arbitrary code - only load files you trust.
data = pd.read_pickle(DATA_PATH)

In [None]:
# Number of distinct story ids in the loaded frame.
data['story_id'].nunique()

In [None]:
# Draw the working subset from the *unique* story ids. Sampling the raw
# column picks rows, and a story id occurs on several rows, so the same story
# could be drawn more than once and then be duplicated when the per-story
# lists are built from this subset.
unique_story_ids = data['story_id'].drop_duplicates()

# Cap the sample size so small datasets do not raise, and fix the seed so the
# subset is the same on every run.
list_stories_subset = list(
    unique_story_ids.sample(n=min(5000, len(unique_story_ids)), random_state=2018)
)

In [None]:
# Keep only the rows that belong to the sampled stories. The explicit copy
# makes data_sub an independent frame, so adding columns to it does not
# trigger pandas' SettingWithCopyWarning or write into a view of `data`.
data_sub = data[data['story_id'].isin(list_stories_subset)].copy()

In [None]:
# Preview the first 20 rows of the sampled subset.
data_sub.head(20)

In [None]:
#Basic Analysis

#### Sentences per story and maximum story length

In [None]:
# One row per story with the number of rows (sentences) it contributes.
d1_sent_count = (
    data_sub
    .groupby('story_id')
    .size()
    .reset_index(name='count_sentences')
)

In [None]:
# Distribution of sentences per story. sns.distplot is deprecated; histplot
# with a KDE overlay and density normalisation is its documented replacement.
sns.histplot(d1_sent_count['count_sentences'], kde=True, stat="density");

In [None]:
# Summary statistics of the per-story sentence counts.
d1_sent_count['count_sentences'].describe()

In [None]:
# Largest number of sentences found in any sampled story; used later as the
# fixed (padded) number of sentences per story.
max_len = d1_sent_count['count_sentences'].max()
max_len

In [None]:
# Distinct sentences across the sampled stories.
sent_ls = set(data_sub['sentence'])


In [None]:
# Distinct sentence labels, sorted so that every run enumerates them in the
# same order. Iteration order of a plain set is not guaranteed (for string
# labels it changes between Python processes), which would silently change
# the label -> id mapping from run to run.
tags = sorted(set(data_sub["label_sent"].values))
n_tags = len(tags)
n_tags

In [None]:
#Creating the Dictionary and Inverse
sent2idx = {s: i + 2 for i, s in enumerate(sent_ls)}
sent2idx["UNK"] = 1
sent2idx["PAD"] = 0

idx2word = {i: s for s, i in sent2idx.items()}

tag2idx  = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 3
idx2tag = {i: s for s, i in tag2idx.items()}

In [None]:
# Show the label -> id mapping.
tag2idx

#### Word Distribution and Max Length

In [None]:
# Split every sentence on whitespace once and derive the word count from the
# resulting token list (previously each sentence was split twice).
sentence_tokens = data_sub['sentence'].progress_apply(lambda x: x.split())

# assign() returns a new frame rather than writing into data_sub in place, so
# this cell is safe to re-run and cannot raise SettingWithCopyWarning.
data_sub = data_sub.assign(
    word_len=sentence_tokens.map(len),
    word_lis=sentence_tokens,
)

In [None]:
# Preview the frame with the new word_len / word_lis columns.
data_sub.head()

In [None]:
# Distribution of words per sentence. sns.distplot is deprecated; histplot
# with a KDE overlay and density normalisation is its documented replacement.
sns.histplot(data_sub['word_len'], kde=True, stat="density");

In [None]:
# Summary statistics of the per-sentence word counts.
data_sub['word_len'].describe()

#### Creating the word-level vocabulary

In [None]:
# Flatten the per-sentence token lists into the set of distinct tokens.
word_ls = set(chain.from_iterable(data_sub['word_lis']))

# Vocabulary size (before the two reserved ids are added).
n_words = len(word_ls)
print(n_words)

In [None]:
# Word -> id lookup. Words are enumerated in sorted order so the same corpus
# always yields the same ids; enumerating the raw set depends on string
# hashing, which changes between Python processes and would make saved
# weights or encoded data unusable after a kernel restart.
word2idx = {c: i + 2 for i, c in enumerate(sorted(word_ls))}

# Reserved ids: 0 is padding, 1 is the unknown-word bucket.
word2idx["UNK"] = 1
word2idx["PAD"] = 0

In [None]:
# Number of word positions encoded per sentence: shorter sentences are padded
# up to this length and words beyond it are not encoded.
max_len_word = 50

#### Formatting the Data for Modelling

In [None]:
# Pair every sentence with its label as a (sentence, label) tuple. Zipping the
# two columns avoids the slow row-wise apply(tuple, axis=1).
sentence_label_pairs = pd.Series(
    list(zip(data_sub['sentence'], data_sub['label_sent'])),
    index=data_sub.index,
)

# assign() returns a new frame rather than writing into data_sub in place, so
# this cell is safe to re-run and cannot raise SettingWithCopyWarning.
data_sub = data_sub.assign(sent_lab=sentence_label_pairs)
data_sub.head()

In [None]:
# Bucket the (sentence, label) tuples by story in a single pass over the
# frame, keeping the original row order inside each story. The previous loop
# re-filtered the whole frame once per story id, which is quadratic.
sent_lab_by_story = {}
for story_id, sent_lab in zip(data_sub['story_id'], data_sub['sent_lab']):
    sent_lab_by_story.setdefault(story_id, []).append(sent_lab)

# One list of (sentence, label) tuples per sampled story id, in the order of
# list_stories_subset. Each entry is a fresh list; ids without rows give [].
stories_labels = [
    list(sent_lab_by_story.get(s_id, []))
    for s_id in tqdm(list_stories_subset)
]
    
    

#### Modelling Exercise

In [None]:
#Defining the word encoding

X_word = []
for story in tqdm(stories_labels):
    
    #Give the seq
    story_seq = []
    
    #To give an upper bound on the maximum length of the word sequence for sentence
    for i in range(max_len):
        sent_seq = []
        
        #to give an upper bound on the maximum length of words to consider
        for j in range(max_len_word):
            try:
                split_sent = story[i][0].split()
                sent_seq.append(word2idx.get(split_sent[j]))
            except:  
                #exception will be there when there will not be any sentence for the length and will be padded 0
                sent_seq.append(word2idx.get("PAD"))
        story_seq.append(sent_seq)
    X_word.append(np.array(story_seq))


#### Sample Checks

In [None]:
# Shape of the first encoded story.
X_word[0].shape

In [None]:
# First (sentence, label) tuple of the first story.
stories_labels[0][0]

In [None]:
# Id assigned to the word "the" (None if it is not in the vocabulary).
word2idx.get("the")

#### Preparing the Labels 

In [None]:
# Translate each story's sentence labels (second tuple element) into tag ids.
y = [
    [tag2idx[sent_lab[1]] for sent_lab in story]
    for story in stories_labels
]

In [None]:
# Bring every label sequence to exactly max_len entries: fill with the PAD tag
# id at the end, and cut from the end if a sequence is longer.
y = pad_sequences(
    y,
    maxlen=max_len,
    padding='post',
    truncating='post',
    value=tag2idx["PAD"],
)

#### Train/test split

In [None]:
from sklearn.model_selection import train_test_split


# Split inputs and labels together with a fixed seed. The label halves are
# kept (they were previously discarded into `_`), so the held-out stories can
# be evaluated against their targets.
X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y,
                                                    test_size=0.2,
                                                    random_state=2018)

+ input_dim: This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words.


+ output_dim: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem.


+ input_length: This is the length of input sequences, as you would define for any input layer of a Keras model. For example, if all of your input documents are comprised of 1000 words, this would be 1000.

In [None]:
#input and embeddings for words
word_in = Input(shape=(max_len, max_len_word,))

#Word_level embedding
emb_word = TimeDistributed(Embedding(input_dim = n_words +2, output_dim = 50,
                           input_length = max_len_word, mask_zero=True))(word_in)


# #word LSTM to get sent encodings by words
emb_sent = TimeDistributed(LSTM(units = 32, return_sequences=False,
                                recurrent_dropout=0.3))(emb_word)

# #main LSTM
# x = concatenate([emb_word, char_enc])

#x = SpatialDropout1D(0.3)(x)
main_lstm = Bidirectional(LSTM(units=64, return_sequences=True,
                               recurrent_dropout=0.3))(emb_sent)

out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)


model = Model([word_in], out)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Integer label ids are used directly as targets, hence the sparse loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)


In [None]:
# Stack the per-story grids into (n_stories, max_len, max_len_word) and give
# the labels a trailing axis of size 1.
# NOTE: this fits on all of X_word / y (not on X_word_tr); Keras holds out
# 10% of it for validation via validation_split.
X_all = np.array(X_word).reshape((len(X_word), max_len, max_len_word))
y_all = np.array(y).reshape(len(y), max_len, 1)

history = model.fit(
    [X_all],
    y_all,
    batch_size=512,
    epochs=1,
    validation_split=0.1,
    verbose=1,
)