In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
import csv, pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input, Conv1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import load_model

## Utility Fucntions

In [2]:
def load_data(path):
    with open(path, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            temp = row
            break
        
        if row[0].lower() != 'labels' or row[1].lower() != 'sentences':            
            print("ERROR: PLZ NAME THE FIRST ROW 'labels' and 'sentences'")
            return
                
        df = pd.read_csv(path)    
        return df


def count_words(features):
    counter = Counter()
    maximum = 0
    
    for sentence in features:
        maximum = max(maximum, len(sentence))
        
        for word in sentence: 
            counter[word] += 1
            
    return maximum, counter


def filter_func(temp):
    
    stop = set(stopwords.words("english"))
    
    temp = temp.lower()
    temp = temp.split()
    temp = [
        element
        for element in temp
        if element not in stop
    ]
    return temp

filter_func = np.vectorize(filter_func, otypes=[list])    


def shuffle(features, labels):
    
    assert labels.shape[0] == features.shape[0]

    idx = np.arange(labels.shape[0])
    np.random.shuffle(idx)
    
    return features[idx], labels[idx]


def convert_labels(labels):
    index = {
        "purpose"      : [1, 0, 0, 0, 0, 0], 
        "craftsmanship": [0, 1, 0, 0, 0, 0],  
        "aesthetic"    : [0, 0, 1, 0, 0, 0],
        "narative"     : [0, 0, 0, 1, 0, 0],
        "influence"    : [0, 0, 0, 0, 1, 0],
        "none"         : [0, 0, 0, 0, 0, 1]        
    }
    return np.array([
        index[e] 
        for e in labels
    ])

In [3]:
path = "Panic Consolidated Training Phrases - Sheet1.csv"

In [4]:
#Loading the data from csv file
df = load_data(path)
labels   = df['labels'].to_numpy()
features = df['sentences'].to_numpy()

In [5]:
classes = ['purpose', 'craftsmaship', 'aesthetic', "narative", "influence", "none"]
total_samples = len(df)

In [6]:
#Cleaning stop words and converting to lists
features = filter_func(features)

In [7]:
#shuffling the data
rand_features, rand_labels = shuffle(features, labels)

In [8]:
#Imp numbers to create Embeddings and for padding
maxlen, count = count_words(features)
num_words = len(count)
maxlen = maxlen - 20

print(f"Total Number of unique words after filtering are {num_words} and the longest sentence is {maxlen} words")

Total Number of unique words after filtering are 3126 and the longest sentence is 50 words


In [9]:
#Train Test Split
ratio = 0.9
mark  = int(total_samples*ratio)

train = (rand_features[:mark], rand_labels[:mark])
test  = (rand_features[mark:], rand_labels[mark:])

In [10]:
#One hot encoding Labels
train_labels = convert_labels(train[1])
test_labels = convert_labels(test[1])

### Tokenizing the data

In [82]:
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(rand_features)

In [None]:
#Saving the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [83]:
word_index = tokenizer.word_index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()

3126


In [13]:
#word_index

In [14]:
train_sequences = tokenizer.texts_to_sequences(train[0])
test_sequences  = tokenizer.texts_to_sequences(test[0])

In [15]:
train_sequences[0], test_sequences[0]

([107, 625, 1097, 21], [3, 302, 259, 2714, 2817, 2680, 1, 25])

In [16]:
train_padded = pad_sequences(train_sequences, maxlen=maxlen, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=maxlen, padding='post', truncating='post')

### Creating the Model

In [4]:
#maxlen = 40
#num_words = 3126
model = Sequential([
    Input(shape=maxlen),
    Embedding(num_words, 40),
    LSTM(64, dropout=0.1, return_sequences=True),
    LSTM(64, dropout=0.1),
    Dense(128, activation='relu'),
    Dense(6, activation='softmax')
])

model.compile(optimizer=Adam(0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

In [145]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 50, 40)            125040    
_________________________________________________________________
lstm_12 (LSTM)               (None, 50, 64)            26880     
_________________________________________________________________
lstm_13 (LSTM)               (None, 64)                33024     
_________________________________________________________________
dense_12 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_13 (Dense)             (None, 6)                 774       
Total params: 194,038
Trainable params: 194,038
Non-trainable params: 0
_________________________________________________________________


### Training the Model

In [146]:
model.fit(train_padded, train_labels, epochs=30, validation_data=(test_padded, test_labels))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1ee20b6d490>

In [5]:
#Saving the model
model.save("scratch_model.h5")

In [52]:
model = load_model("third -  loss 0.3476 accuracy0.8954 val_loss1.2306 val_accuracy0.61.h5")

In [53]:
test_sen = "the smudge-proof formula comes in a deep rich black color from plant derived pigments of eclipta daisy cranberry turmeric basil & henna"

In [54]:
test_seq = tokenizer.texts_to_sequences([test_sen])
test_pad = pad_sequences(test_seq, maxlen=maxlen, padding='post', truncating='post')

In [55]:
test_pad

array([[2297,  124,  518,  584, 1041,  162,  960, 2298, 2299, 2300, 2301,
        2302, 2303, 2304, 2305,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0]])

In [56]:
model.predict(test_pad)

array([[3.0365172e-01, 1.2741041e-02, 6.8050086e-01, 1.2446828e-03,
        3.0549688e-04, 1.5562468e-03]], dtype=float32)

In [71]:
tokenizer1 = Tokenizer(num_words=num_words)

In [72]:
tokenizer1.fit_on_texts(rand_features[:400])

tokenizer1.fit_on_texts(rand_features[400:])

In [73]:
tokenizer2 = Tokenizer(num_words=num_words)

In [74]:
tokenizer2.fit_on_texts(rand_features)

In [60]:
len(tokenizer.word_index)

2925

In [61]:
tokenizer123.fit_on_texts(rand_features[400:])

In [63]:
tokenizer123.word_index

{'quality': 1,
 'people': 2,
 'craftsmanship': 3,
 'make': 4,
 'life': 5,
 'world': 6,
 'work': 7,
 'one': 8,
 'best': 9,
 'every': 10,
 'love': 11,
 'like': 12,
 'care': 13,
 'beauty': 14,
 'create': 15,
 'art': 16,
 'lives': 17,
 'sunset': 18,
 'time': 19,
 'help': 20,
 'provide': 21,
 'sun': 22,
 'beautiful': 23,
 'human': 24,
 'even': 25,
 'products': 26,
 'new': 27,
 'knowledge': 28,
 'see': 29,
 'nature': 30,
 'good': 31,
 'customers': 32,
 'happens': 33,
 'always': 34,
 'better': 35,
 'health': 36,
 'inspire': 37,
 'use': 38,
 'business': 39,
 'find': 40,
 'others': 41,
 'little': 42,
 'company': 43,
 'improve': 44,
 'services': 45,
 'well': 46,
 'without': 47,
 'natural': 48,
 'communities': 49,
 'research': 50,
 'education': 51,
 'value': 52,
 'way': 53,
 'worlds': 54,
 'long': 55,
 'summer': 56,
 'still': 57,
 'around': 58,
 'power': 59,
 'things': 60,
 'skin': 61,
 'service': 62,
 'get': 63,
 'know': 64,
 'community': 65,
 'many': 66,
 'cancer': 67,
 'great': 68,
 'possible'

In [75]:
tokenizer1

<keras_preprocessing.text.Tokenizer at 0x1d116aa9310>

In [76]:
tokenizer2

<keras_preprocessing.text.Tokenizer at 0x1d115494ac0>

In [78]:
count = 0
for t1, t2 in zip(tokenizer1.word_index.items(), tokenizer2.word_index.items()):
    count += 1
    if t1 != t2: break

print(count)

3126


In [81]:
len(tokenizer2.word_index)

3126

In [84]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [87]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [88]:
type(tokenizer)

keras_preprocessing.text.Tokenizer

In [102]:
a = 10

def aabc():
    a += 10
    
aabc()

a

UnboundLocalError: local variable 'a' referenced before assignment

20