In [1]:
import numpy as np # linear algebra
import pandas as pd # df processing, CSV file I/O (e.g. pd.read_csv)

In [144]:
# Library import code from book
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Personal imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import time
import datetime

from torchvision import datasets
from torchvision import transforms
import torch.nn.functional as F

from keras.preprocessing import sequence
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, LSTM
from keras.layers import Conv1D, Flatten, MaxPooling1D, GlobalMaxPool1D
import wandb
from wandb.keras import WandbCallback
from keras.preprocessing import text

In [38]:
if torch.cuda.is_available():
    # Create a device object for the GPU
    device = torch.device('cuda')
else:
    # Create a device object for the CPU
    device = torch.device('cpu')
device

device(type='cuda')

In [153]:
!python -m wget http://nlp.stanford.edu/data/glove.6B.zip


Saved under glove.6B.zip
^C


## Loading the train, validation and test datasets

In [39]:
train = pd.read_csv("data/train.txt",names=['Input','Sentiment'],sep=';',encoding='utf-8')
val = pd.read_csv("data/val.txt",names=['Input','Sentiment'],sep=';',encoding='utf-8')
test = pd.read_csv("data/test.txt",names=['Input','Sentiment'],sep=';',encoding='utf-8')

In [40]:
# Combine training, validation, and test sets into one and make into DataFrame object
df = pd.concat([train,val,test],axis=0)
df = df.sample(frac=0.1)
df = df.reset_index()
df.head()

Unnamed: 0,index,Input,Sentiment
0,9293,i is thirteen again and so so unsure of himsel...,love
1,7302,i feel so super not old,joy
2,243,i feel like i am joining the masses which goes...,joy
3,10131,i feel a kind of sadness for the television sh...,joy
4,1490,i still feel crappy ill take it as a sign that...,sadness


In [98]:
encoded_labels = {'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5} 
encoded_anger = {'anger': 1, 'fear': 0, 'joy': 0, 'love': 0, 'sadness': 0, 'surprise': 0}
encoded_fear = {'anger': 0, 'fear': 1, 'joy': 0, 'love': 0, 'sadness': 0, 'surprise': 0}
encoded_joy = {'anger': 0, 'fear': 0, 'joy': 1, 'love': 0, 'sadness': 0, 'surprise': 0}
encoded_love = {'anger': 0, 'fear': 0, 'joy': 0, 'love': 1, 'sadness': 0, 'surprise': 0}
encoded_sadness = {'anger': 0, 'fear': 0, 'joy': 0, 'love': 0, 'sadness': 1, 'surprise': 0}
encoded_surprise = {'anger': 0, 'fear': 0, 'joy': 0, 'love': 0, 'sadness': 0, 'surprise': 1}

In [99]:
df['anger'] = df.Sentiment.map(encoded_anger)
df['fear'] = df.Sentiment.map(encoded_fear)
df['joy'] = df.Sentiment.map(encoded_joy)
df['love'] = df.Sentiment.map(encoded_love)
df['sadness'] = df.Sentiment.map(encoded_sadness)
df['surprise'] = df.Sentiment.map(encoded_surprise)
df.head()

Unnamed: 0,index,Input,Sentiment,anger,fear,joy,love,sadness,surprise
0,9293,i is thirteen again and so so unsure of himsel...,love,0,0,0,1,0,0
1,7302,i feel so super not old,joy,0,0,1,0,0,0
2,243,i feel like i am joining the masses which goes...,joy,0,0,1,0,0,0
3,10131,i feel a kind of sadness for the television sh...,joy,0,0,1,0,0,0
4,1490,i still feel crappy ill take it as a sign that...,sadness,0,0,0,0,1,0


## Train-Test Split

In [100]:
# Stratify: Split in a stratified fashion using the class labels, which in this case is under the column header 'Sentiment'
train_data, test_data = train_test_split(df,test_size=0.3,random_state=101,shuffle=True,stratify=df.Sentiment)

In [101]:
#train_data['Label'] = train_data.Sentiment.map(encoded_labels)
#test_data['Label'] = test_data.Sentiment.map(encoded_labels)
train_data.head()

Unnamed: 0,index,Input,Sentiment,anger,fear,joy,love,sadness,surprise
1309,2974,im really really sad that i missed the menswea...,joy,0,0,1,0,0,0
996,1252,i said in some recent interviews we will have ...,joy,0,0,1,0,0,0
565,11505,i am satisfied with the final installment and ...,sadness,0,0,0,0,1,0
1369,8632,i couldnt help but feel sincere gratitude for ...,joy,0,0,1,0,0,0
1906,9928,im feeling pressured because it is crunch time...,fear,0,1,0,0,0,0


In [102]:
test_data.head()

Unnamed: 0,index,Input,Sentiment,anger,fear,joy,love,sadness,surprise
617,11034,im feeling romantic towards not another relati...,love,0,0,0,1,0,0
678,1264,i cant dos that leave me feeling helpless,fear,0,1,0,0,0,0
26,15546,i would be feeling i am genuinely shocked and ...,surprise,0,0,0,0,0,1
1389,7438,i feel like i missed out not being born into a...,sadness,0,0,0,0,1,0
1111,10980,i feel their exuberance upon being accepted an...,joy,0,0,1,0,0,0


## Naive Training by converting text to:
+ Bag of Words: Translates a string to a vector that is the length of number of words in the string, with each index
    representing the number of times a word occurs in a string.
    + Problem: Losses order of words, which is critical for the english language
+ Character Encoding: Encodes the individual characters in a string
    + Problem: Losses concept of words, becomes a very generative approach for our purpose
+ Word Embedding: Transforms each word into number vectors for a given string. The embeddings can be calculated for
    your given dataset, but you could just use pre-computed embeddings.
    + GloVe + word2vec: Embeddings contain a mathematical sense to their numbers by implementing semantic information in the embedding process. For instance, [king] - [man] + [woman] = [queen]

Fixed size vectors means padding needs to be added to the strings to ensure all will be the same length in words

In [176]:
# set parameters:
vocab_size = 1000
maxlen = 300
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 128
epochs = 50

In [177]:
# Split to X and Y
labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
X_train = train_data['Input']
X_train.head()
Y_train = train_data[labels]
Y_train.head()
X_test = test_data['Input']
Y_test = test_data[labels]
Y_test.head()

Unnamed: 0,anger,fear,joy,love,sadness,surprise
617,0,0,0,1,0,0
678,0,1,0,0,0,0
26,0,0,0,0,0,1
1389,0,0,0,0,1,0
1111,0,0,1,0,0,0


In [178]:
# Tokenizer: Ensures inputs that are longer than max vocab size are removed to ensure same size vectors
tokenizer = text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_matrix(X_train)
X_test = tokenizer.texts_to_matrix(X_test)

In [179]:
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

In [180]:
embeddings_index = dict()
f = open('glove/glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [181]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocab_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [182]:
# Model
model = Sequential()
model.add(Embedding(vocab_size,
                    100,
                    input_length=maxlen,
                    weights=[embedding_matrix],
                    trainable=False))
model.add(Dropout(0.2))
model.add(Conv1D(128,
                 kernel_size,
                 padding='same',
                 activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(64,
                 kernel_size,
                 padding='same',
                 activation='relu'))
model.add(MaxPooling1D(pool_size=2))
#model.add(Flatten())
#model.add(Dense(hidden_dims, activation='relu'))
model.add(Dropout(0.2))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(len(labels), activation='sigmoid'))
model.summary()

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, 300, 100)          100000    
                                                                 
 dropout_36 (Dropout)        (None, 300, 100)          0         
                                                                 
 conv1d_36 (Conv1D)          (None, 300, 128)          38528     
                                                                 
 max_pooling1d_36 (MaxPoolin  (None, 150, 128)         0         
 g1D)                                                            
                                                                 
 conv1d_37 (Conv1D)          (None, 150, 64)           24640     
                                                                 
 max_pooling1d_37 (MaxPoolin  (None, 75, 64)           0         
 g1D)                                                

In [188]:
model.compile(loss='categorical_crossentropy',
              optimizer='Adam',
              metrics=['accuracy'])

In [189]:
model.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_test, Y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
 7/44 [===>..........................] - ETA: 0s - loss: 1.5517 - accuracy: 0.3527

KeyboardInterrupt: 

In [187]:
model.predict("Sad")

IndexError: tuple index out of range