# Positive Tone Data Generator

This notebook contains code for an MLP neural network that generates positively toned data based on the dataset. Feature vectors are then made from the positively toned data and generated data then saved into .csv files.

### Import Libraries

In [1]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import pandas as pd
from gensim.models import Word2Vec
from nltk import regexp_tokenize, WordNetLemmatizer
import random

In [2]:
import warnings
warnings.filterwarnings("ignore")

### Input

In [3]:
df_raw = pd.read_csv('poem_sentiment.csv', header=None, index_col=0, names=['Text', 'Sentiment'])
df_raw

Unnamed: 0,Text,Sentiment
0,with pale blue berries. in these peaceful shad...,1.0
1,it flows so long as falls the rain,0.0
2,"and that is why, the lonesome day",-1.0
3,"when i peruse the conquered fame of heroes, an...",2.0
4,of inward strife for truth and liberty.,2.0
...,...,...
887,to his ears there came a murmur of far seas be...,0.0
888,"the one good man in the world who knows me, --",1.0
889,faint voices lifted shrill with pain,-1.0
890,"an', fust you knowed on, back come charles the...",0.0


In [4]:
positive = df_raw[df_raw['Sentiment'] > 0]
positive

Unnamed: 0,Text,Sentiment
0,with pale blue berries. in these peaceful shad...,1.0
3,"when i peruse the conquered fame of heroes, an...",2.0
4,of inward strife for truth and liberty.,2.0
5,the red sword sealed their vows!,2.0
16,that has a charmingly bourbon air.,1.0
...,...,...
870,their first-born brother as a god.,1.0
876,and so i should be loved and mourned to-night.,2.0
877,"and _channing_, with his bland, superior look",2.0
884,"how your soft opera-music changed, and the dru...",2.0


### Text Preprocessing

In [5]:
raw_text = positive['Text'].to_string(index=False)
raw_text[:1000]

'with pale blue berries. in these peaceful shades--\nwhen i peruse the conquered fame of heroes, and...\n           of inward strife for truth and liberty.\n                  the red sword sealed their vows!\n                that has a charmingly bourbon air.\n          brightly expressive as the twins of leda\n               in monumental pomp! no grecian drop\n                    the hostile cohorts melt away;\nand lips where heavenly smiles would hang and b...\n                         honour to the bugle-horn!\n                       if the pure and holy angels\n        upon the thought of perfect noon. and when\n      thy hands all cunning arts that women prize.\n             reasoning to admiration, and with mee\n           it shines superior on a throne of gold:\n    take the warm welcome of new friends with thee\n                  augmented, sweet, a hundred fold\n                every day a rich reward will give;\n                                 gay little heart!\n         am

In [6]:
# Remove all non-ASCII characters
processed_text = re.sub(r'[^\x00-\x7f]', r'', raw_text).lower()
processed_text[:1000]

'with pale blue berries. in these peaceful shades--\nwhen i peruse the conquered fame of heroes, and...\n           of inward strife for truth and liberty.\n                  the red sword sealed their vows!\n                that has a charmingly bourbon air.\n          brightly expressive as the twins of leda\n               in monumental pomp! no grecian drop\n                    the hostile cohorts melt away;\nand lips where heavenly smiles would hang and b...\n                         honour to the bugle-horn!\n                       if the pure and holy angels\n        upon the thought of perfect noon. and when\n      thy hands all cunning arts that women prize.\n             reasoning to admiration, and with mee\n           it shines superior on a throne of gold:\n    take the warm welcome of new friends with thee\n                  augmented, sweet, a hundred fold\n                every day a rich reward will give;\n                                 gay little heart!\n         am

### Tokenization

In [7]:
# Get word tokens from text
word_tokens = regexp_tokenize(processed_text, pattern=r'[^\S\r]+|[\.,:;!?()--_"]', gaps=True)
print(f"Number of word tokens: {len(word_tokens)}")
word_tokens[:10]

Number of word tokens: 1292


['with',
 'pale',
 'blue',
 'berries',
 'in',
 'these',
 'peaceful',
 'shades',
 'when',
 'i']

In [8]:
# Lemmatization done to make uncommon words more likely to be recognized by 
# Word2Vec model later when converting to feature vectors
lemmatizer = WordNetLemmatizer()
word_tokens = [lemmatizer.lemmatize(token) for token in word_tokens] # Lemmatize nouns
word_tokens = [lemmatizer.lemmatize(token, 'v') for token in word_tokens] # Lemmatize verbs

In [9]:
# Get unique word tokens from word tokens
unique_words = sorted(list(set(word_tokens)))
print(f"Number of unique word tokens: {len(unique_words)}")

Number of unique word tokens: 628


In [10]:
# Create vocabulary of word tokens
word_vocabulary = unique_words
word_vocabulary[:10]

["'tis",
 'a',
 'abide',
 'abloom',
 'about',
 'accordance',
 'adam',
 'adept',
 'admiration',
 'after']

### Create word-index mappings

In [11]:
# Create index-word mappings 
indices_words = dict((index, word) for index, word in enumerate(unique_words))
indices_words

{0: "'tis",
 1: 'a',
 2: 'abide',
 3: 'abloom',
 4: 'about',
 5: 'accordance',
 6: 'adam',
 7: 'adept',
 8: 'admiration',
 9: 'after',
 10: 'again',
 11: 'ah',
 12: 'air',
 13: 'all',
 14: 'already',
 15: 'amidst',
 16: 'among',
 17: 'an',
 18: 'and',
 19: 'angel',
 20: 'angry',
 21: 'arm',
 22: 'around',
 23: 'art',
 24: 'ascend',
 25: 'ash',
 26: 'aspire',
 27: 'assay',
 28: 'augment',
 29: 'away',
 30: 'awe',
 31: 'ay',
 32: 'b',
 33: 'bare',
 34: 'be',
 35: 'bear',
 36: 'beauteous',
 37: 'beautiful',
 38: 'beauty',
 39: "beauty'",
 40: 'because',
 41: 'before',
 42: 'bell',
 43: 'bend',
 44: 'beneath',
 45: 'berry',
 46: 'best',
 47: 'betray',
 48: 'between',
 49: 'blade',
 50: 'bland',
 51: 'blaze',
 52: 'bless',
 53: 'blind',
 54: 'blue',
 55: 'bolt',
 56: 'borrow',
 57: 'boston',
 58: 'bourbon',
 59: 'bow',
 60: 'brave',
 61: 'braver',
 62: 'breast',
 63: 'bright',
 64: 'brightly',
 65: 'brilliant',
 66: 'brother',
 67: "brynhilda's",
 68: 'bugle',
 69: 'burn',
 70: 'burst',
 71

In [12]:
# Create word-index mappings
word_indices = dict((word, index) for index, word in enumerate(unique_words))
word_indices

{"'tis": 0,
 'a': 1,
 'abide': 2,
 'abloom': 3,
 'about': 4,
 'accordance': 5,
 'adam': 6,
 'adept': 7,
 'admiration': 8,
 'after': 9,
 'again': 10,
 'ah': 11,
 'air': 12,
 'all': 13,
 'already': 14,
 'amidst': 15,
 'among': 16,
 'an': 17,
 'and': 18,
 'angel': 19,
 'angry': 20,
 'arm': 21,
 'around': 22,
 'art': 23,
 'ascend': 24,
 'ash': 25,
 'aspire': 26,
 'assay': 27,
 'augment': 28,
 'away': 29,
 'awe': 30,
 'ay': 31,
 'b': 32,
 'bare': 33,
 'be': 34,
 'bear': 35,
 'beauteous': 36,
 'beautiful': 37,
 'beauty': 38,
 "beauty'": 39,
 'because': 40,
 'before': 41,
 'bell': 42,
 'bend': 43,
 'beneath': 44,
 'berry': 45,
 'best': 46,
 'betray': 47,
 'between': 48,
 'blade': 49,
 'bland': 50,
 'blaze': 51,
 'bless': 52,
 'blind': 53,
 'blue': 54,
 'bolt': 55,
 'borrow': 56,
 'boston': 57,
 'bourbon': 58,
 'bow': 59,
 'brave': 60,
 'braver': 61,
 'breast': 62,
 'bright': 63,
 'brightly': 64,
 'brilliant': 65,
 'brother': 66,
 "brynhilda's": 67,
 'bugle': 68,
 'burn': 69,
 'burst': 70,
 'b

### Create Blocks

In [13]:
# Create x (input): Split text into blocks, where each block has the same amount of words
# Create y (targets): For each x input, the y is the word that comes next
# The model should learn to predict y from the input x

block_size = 2
step = 1

x = []
y = []

for i in range(0, len(word_tokens) - block_size, step):
    x.append(word_tokens[i: i+block_size])
    y.append(word_tokens[i + block_size])

In [14]:
# Inspect x
x[:5]

[['with', 'pale'],
 ['pale', 'blue'],
 ['blue', 'berry'],
 ['berry', 'in'],
 ['in', 'these']]

In [15]:
# Check number of blocks
len(x)

1290

### Create One-Hot Encoding

In [16]:
# Create one-hot encoding of x
x_encoded = []

for x_arr in x:
    x_ints = [word_indices[item] for item in x_arr]
    
    x_row = []
    for item in x_ints:
        x_vector = np.zeros(len(unique_words))
        x_vector[item] = 1
        x_row.append(x_vector)
        
    x_encoded.append(x_row)
    
x_encoded = np.array(x_encoded)

In [17]:
# Inspect y
y[:5]

['blue', 'berry', 'in', 'these', 'peaceful']

In [18]:
# Convert each word in y into their corresponding indices
y_ints = [word_indices[item] for item in y]
y_ints[:5]

[54, 45, 257, 539, 386]

In [19]:
# Create one-hot encoding of y
y_encoded = []

for item in y_ints:
    y_vector = np.zeros(len(unique_words))
    y_vector[item] = 1
    y_encoded.append(y_vector)

y_encoded = np.array(y_encoded)

### Model Configuration

In [20]:
class TextGenerator(nn.Module):
    def __init__(self, input_dim, output_dim, block_size):
        super().__init__()

        self.embeddings = nn.Linear(input_dim, 2000)
        self.hidden = nn.Linear(2000, 1200)
        self.output = nn.Linear(1200, output_dim)
        
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.sigmoid(self.embeddings(x))
        x = self.tanh(self.hidden(x))
        x = self.softmax(self.output(x))

        return x

In [21]:
# Get size of input for training the model
input_size = x_encoded[0].ravel().shape[0]
print(x_encoded[0].ravel().shape[0])

1256


In [22]:
# Allocate tensors to the device used for computation
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Performing torch operations on {device} device")

# Create x and y PyTorch tensors
x = torch.tensor(x_encoded).float().to(device)
y = torch.tensor(y_encoded).float().to(device)

Performing torch operations on cuda device


In [23]:
# Instantiate model
model = TextGenerator(input_size, len(unique_words), block_size).to(device)

# Print model configuration
model

TextGenerator(
  (embeddings): Linear(in_features=1256, out_features=2000, bias=True)
  (hidden): Linear(in_features=2000, out_features=1200, bias=True)
  (output): Linear(in_features=1200, out_features=628, bias=True)
  (tanh): Tanh()
  (sigmoid): Sigmoid()
  (softmax): Softmax(dim=1)
)

In [24]:
# Define model optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.000000001)
criterion = nn.CrossEntropyLoss()

### Create Dataset & DataLoader

In [25]:
# Create custom Dataset class
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
        self.n_samples = len(x)
    
    def __getitem__(self, index):
        return self.x[index].ravel(), self.y[index]
    
    def __len__(self):
        return self.n_samples

In [26]:
# Create training dataset using custom Dataset class
training_ds = CustomDataset(x, y)

In [27]:
# Load training dataset into DataLoader
from torch.utils.data import DataLoader

batch_size = 10

train_loader = DataLoader(
    training_ds,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)

### Train Model

In [28]:
# Define function to train model
def train_fn(loader, model, optimizer, loss_fn, device):
    loop = tqdm(loader)

    ave_loss = 0
    count = 0 
    for batch_idx, (data, targets) in enumerate(loop):
        data = data.to(device=device)
        targets = targets.to(device=device)
        
        # Forward
        predictions = model.forward(data)
        loss = loss_fn(predictions, targets)
        
        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Update tqdm loading bar
        loop.set_postfix(loss=loss.item())

        count += 1
        ave_loss += loss.item()
    
    ave_loss = ave_loss / count

    return ave_loss

In [29]:
# Train model
epochs = 100
average_losses = []

for epoch in range(epochs):
    print("Epoch: {}".format(epoch))
    ave_loss = train_fn(train_loader, model, optimizer, criterion, device)
    
    print("Ave Loss: {}".format(ave_loss))
    average_losses.append(ave_loss)

Epoch: 0


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 43.49it/s, loss=6.44]


Ave Loss: 6.442569477613582
Epoch: 1


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 124.23it/s, loss=6.44]


Ave Loss: 6.442569403685341
Epoch: 2


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 125.22it/s, loss=6.44]


Ave Loss: 6.4425693186678625
Epoch: 3


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 125.11it/s, loss=6.44]


Ave Loss: 6.4425692595252695
Epoch: 4


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 126.30it/s, loss=6.44]


Ave Loss: 6.442569215168325
Epoch: 5


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 123.67it/s, loss=6.44]


Ave Loss: 6.442569163418556
Epoch: 6


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 126.01it/s, loss=6.44]


Ave Loss: 6.442569115365198
Epoch: 7


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 125.43it/s, loss=6.44]


Ave Loss: 6.442569063615429
Epoch: 8


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 118.87it/s, loss=6.44]


Ave Loss: 6.442569037740545
Epoch: 9


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 112.57it/s, loss=6.44]


Ave Loss: 6.442569004472836
Epoch: 10


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 111.77it/s, loss=6.44]


Ave Loss: 6.442568978597952
Epoch: 11


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 116.26it/s, loss=6.44]


Ave Loss: 6.442568926848183
Epoch: 12


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 118.53it/s, loss=6.44]


Ave Loss: 6.44256883443788
Epoch: 13


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 121.02it/s, loss=6.44]


Ave Loss: 6.442568782688111
Epoch: 14


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 122.97it/s, loss=6.44]


Ave Loss: 6.442568756813227
Epoch: 15


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 118.04it/s, loss=6.44]


Ave Loss: 6.44256869027781
Epoch: 16


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 119.20it/s, loss=6.44]


Ave Loss: 6.4425686570101
Epoch: 17


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 119.39it/s, loss=6.44]


Ave Loss: 6.442568575689035
Epoch: 18


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 120.21it/s, loss=6.44]


Ave Loss: 6.442568546117738
Epoch: 19


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 118.96it/s, loss=6.44]


Ave Loss: 6.442568527635678
Epoch: 20


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 112.64it/s, loss=6.44]


Ave Loss: 6.4425684426182
Epoch: 21


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 113.68it/s, loss=6.44]


Ave Loss: 6.442568390868431
Epoch: 22


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 114.51it/s, loss=6.44]


Ave Loss: 6.442568364993546
Epoch: 23


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 120.70it/s, loss=6.44]


Ave Loss: 6.442568298458129
Epoch: 24


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 120.45it/s, loss=6.44]


Ave Loss: 6.442568261494008
Epoch: 25


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 112.25it/s, loss=6.44]


Ave Loss: 6.442568198655003
Epoch: 26


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 112.87it/s, loss=6.44]


Ave Loss: 6.442568117333937
Epoch: 27


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 96.55it/s, loss=6.44]


Ave Loss: 6.442568080369816
Epoch: 28


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 101.18it/s, loss=6.44]


Ave Loss: 6.4425680397092835
Epoch: 29


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 108.57it/s, loss=6.44]


Ave Loss: 6.442567973173866
Epoch: 30


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 109.56it/s, loss=6.44]


Ave Loss: 6.442567928816921
Epoch: 31


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 110.88it/s, loss=6.44]


Ave Loss: 6.442567888156388
Epoch: 32


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 111.16it/s, loss=6.44]


Ave Loss: 6.442567832710207
Epoch: 33


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 113.05it/s, loss=6.44]


Ave Loss: 6.442567777264026
Epoch: 34


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 113.40it/s, loss=6.44]


Ave Loss: 6.442567718121433
Epoch: 35


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 113.12it/s, loss=6.44]


Ave Loss: 6.4425676294075425
Epoch: 36


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 111.45it/s, loss=6.44]


Ave Loss: 6.442567544390065
Epoch: 37


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 113.32it/s, loss=6.44]


Ave Loss: 6.442567525908005
Epoch: 38


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 112.36it/s, loss=6.44]


Ave Loss: 6.442567444586938
Epoch: 39


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 113.25it/s, loss=6.44]


Ave Loss: 6.442567403926406
Epoch: 40


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 110.80it/s, loss=6.44]


Ave Loss: 6.442567355873049
Epoch: 41


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 113.35it/s, loss=6.44]


Ave Loss: 6.442567300426868
Epoch: 42


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 113.39it/s, loss=6.44]


Ave Loss: 6.442567226498626
Epoch: 43


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 110.73it/s, loss=6.44]


Ave Loss: 6.442567141481148
Epoch: 44


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 73.22it/s, loss=6.44]


Ave Loss: 6.4425671119098515
Epoch: 45


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 57.21it/s, loss=6.44]


Ave Loss: 6.442567078642143
Epoch: 46


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 97.62it/s, loss=6.44]


Ave Loss: 6.442567026892374
Epoch: 47


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 86.53it/s, loss=6.44]


Ave Loss: 6.442566964053368
Epoch: 48


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 86.75it/s, loss=6.44]


Ave Loss: 6.442566923392835
Epoch: 49


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 92.79it/s, loss=6.44]


Ave Loss: 6.4425668753394785
Epoch: 50


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 88.53it/s, loss=6.44]


Ave Loss: 6.442566849464594
Epoch: 51


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 111.69it/s, loss=6.44]


Ave Loss: 6.44256675335788
Epoch: 52


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 78.10it/s, loss=6.44]


Ave Loss: 6.442566709000935
Epoch: 53


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 106.51it/s, loss=6.44]


Ave Loss: 6.442566649858342
Epoch: 54


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 61.88it/s, loss=6.44]


Ave Loss: 6.442566627679869
Epoch: 55


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 114.67it/s, loss=6.44]


Ave Loss: 6.442566583322924
Epoch: 56


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 63.23it/s, loss=6.44]


Ave Loss: 6.442566546358804
Epoch: 57


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 111.02it/s, loss=6.44]


Ave Loss: 6.442566494609034
Epoch: 58


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 112.66it/s, loss=6.44]


Ave Loss: 6.442566450252089
Epoch: 59


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 117.49it/s, loss=6.44]


Ave Loss: 6.442566365234612
Epoch: 60


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 97.88it/s, loss=6.44]


Ave Loss: 6.442566295002782
Epoch: 61


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 60.64it/s, loss=6.44]


Ave Loss: 6.442566250645837
Epoch: 62


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 86.75it/s, loss=6.44]


Ave Loss: 6.442566198896068
Epoch: 63


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 91.55it/s, loss=6.44]


Ave Loss: 6.442566128664239
Epoch: 64


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 94.09it/s, loss=6.44]


Ave Loss: 6.442566084307294
Epoch: 65


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 52.40it/s, loss=6.44]


Ave Loss: 6.442566051039585
Epoch: 66


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 63.00it/s, loss=6.44]


Ave Loss: 6.442565999289815
Epoch: 67


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 107.81it/s, loss=6.44]


Ave Loss: 6.442565969718519
Epoch: 68


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 102.33it/s, loss=6.44]


Ave Loss: 6.442565910575926
Epoch: 69


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 108.41it/s, loss=6.44]


Ave Loss: 6.442565895790278
Epoch: 70


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 90.32it/s, loss=6.44]


Ave Loss: 6.442565840344097
Epoch: 71


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 51.57it/s, loss=6.44]


Ave Loss: 6.442565784897915
Epoch: 72


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 63.12it/s, loss=6.44]


Ave Loss: 6.4425657109696735
Epoch: 73


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 85.18it/s, loss=6.44]


Ave Loss: 6.4425656592199045
Epoch: 74


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 86.31it/s, loss=6.44]


Ave Loss: 6.442565611166548
Epoch: 75


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 91.15it/s, loss=6.44]


Ave Loss: 6.442565563113191
Epoch: 76


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 112.50it/s, loss=6.44]


Ave Loss: 6.442565485488537
Epoch: 77


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 121.84it/s, loss=6.44]


Ave Loss: 6.44256543743518
Epoch: 78


100%|████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 112.54it/s, loss=6.44]


Ave Loss: 6.442565378292587
Epoch: 79


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 92.69it/s, loss=6.44]


Ave Loss: 6.44256529697152
Epoch: 80


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 89.86it/s, loss=6.44]


Ave Loss: 6.442565237828927
Epoch: 81


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 91.38it/s, loss=6.44]


Ave Loss: 6.442565230436103
Epoch: 82


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 97.44it/s, loss=6.44]


Ave Loss: 6.442565186079158
Epoch: 83


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 94.19it/s, loss=6.44]


Ave Loss: 6.442565163900686
Epoch: 84


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 91.88it/s, loss=6.44]


Ave Loss: 6.442565097365269
Epoch: 85


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 94.42it/s, loss=6.44]


Ave Loss: 6.442565053008323
Epoch: 86


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 91.46it/s, loss=6.44]


Ave Loss: 6.442564990169318
Epoch: 87


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 84.37it/s, loss=6.44]


Ave Loss: 6.442564879276956
Epoch: 88


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 87.25it/s, loss=6.44]


Ave Loss: 6.442564831223598
Epoch: 89


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 87.45it/s, loss=6.44]


Ave Loss: 6.442564809045126
Epoch: 90


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 79.04it/s, loss=6.44]


Ave Loss: 6.442564764688181
Epoch: 91


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 83.63it/s, loss=6.44]


Ave Loss: 6.442564738813297
Epoch: 92


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 87.43it/s, loss=6.44]


Ave Loss: 6.442564705545588
Epoch: 93


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 88.30it/s, loss=6.44]


Ave Loss: 6.442564627920935
Epoch: 94


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 87.55it/s, loss=6.44]


Ave Loss: 6.442564594653225
Epoch: 95


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 84.06it/s, loss=6.44]


Ave Loss: 6.442564528117808
Epoch: 96


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 84.25it/s, loss=6.44]


Ave Loss: 6.4425644394039185
Epoch: 97


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 82.27it/s, loss=6.44]


Ave Loss: 6.44256440613621
Epoch: 98


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 84.55it/s, loss=6.44]


Ave Loss: 6.442564306333083
Epoch: 99


100%|█████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 89.87it/s, loss=6.44]

Ave Loss: 6.442564284154611





### Generate Text

In [30]:
# Inspect probability distribution of word tokens
import matplotlib.pyplot as plt
import pandas as pd
random.seed(2)

phrase = [indices_words[random.randint(0, len(unique_words))], indices_words[random.randint(0, len(unique_words))]]
x_ints = [word_indices[item] for item in phrase]
x_vector = []

for item in x_ints:
    x_item = np.zeros(len(unique_words))
    x_item[item] = 1
    x_vector.append(x_item)

initial_input = torch.tensor([np.array([x_vector]).ravel()]).to(dtype=torch.float32).to(device)
output = model(initial_input)[0].detach().cpu().numpy()

df = pd.DataFrame(output)
df.plot.bar()
plt.yscale("log")
plt.show()

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [None]:
# Generate text sample from model output
word_count = 100
text = []
paragraph_count = 5

# Length of phrase should be same as block_size
word1, word2 = indices_words[random.randint(0, len(unique_words))], indices_words[random.randint(0, len(unique_words))]

for p in range(paragraph_count):
    text.append([])
    
    for i in range(word_count):
        phrase = [word1, word2]
        x_ints = [word_indices[item] for item in phrase]
        x_vector = []

        for item in x_ints:
            x_item = np.zeros(len(unique_words))
            x_item[item] = 1
            x_vector.append(x_item)

        initial_input = torch.tensor([np.array([x_vector]).ravel()]).float().to(device)

        output = model(initial_input)[0].detach().cpu().numpy()

        # Workaround to fix occasional sum(pvals[:-1]) > 1.0  bug from implicit casting in np.random.multinomial 
        output = output.astype(float)
        output /= output.sum()

        index = np.where(np.random.multinomial(1, output) == 1)[0][0]
        word3 = indices_words[index]
        text[p].append(word3)

        # Use generated word from this run as seed for next run
        word1, word2 = word2, word3

In [None]:
for p in range(paragraph_count):
    print(f"Generated Paragraph {p}:")
    print(' '.join(text[p]))
    print('\n')

### Create Feature Vectors from Input Data

In [None]:
sentences = []

# Tokenize by line
for index, row in positive.iterrows():
    tokenized_row = row['Text'].split(' ')
      
    # Preprocess using the same settings as preprocessing done before training model
    tokenized_row = regexp_tokenize(' '.join(tokenized_row), pattern=r'[^\S\r\n]+|[\.,;!?()--_"]', gaps=True)
    tokenized_row = [lemmatizer.lemmatize(token) for token in tokenized_row] # Lemmatize nouns
    tokenized_row = [lemmatizer.lemmatize(token, 'v') for token in tokenized_row] # Lemmatize verbs
    
    sentences.append(tokenized_row)
    
sentences

In [None]:
vector_size = 100
w2v_model = Word2Vec(sentences, vector_size=vector_size, min_count=1)

In [None]:
vocab = w2v_model.wv.index_to_key
vocab_length = len(vocab)

print(f'Vocabulary Size: {format(vocab_length)}')

In [None]:
vocab

In [None]:
w2v_model.wv.similarity('i', 'me')

In [None]:
def sentence_to_vector_averaging(sentence, model):
    embeddings = [model.wv[word] for word in sentence if word in vocab]
    
    if len(embeddings) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(embeddings, axis=0)

In [None]:
features = []

for sentence in sentences:
    features.append(sentence_to_vector_averaging(sentence, w2v_model))

In [None]:
columns = [f'x{i}' for i in range(w2v_model.vector_size)]

df_text = pd.DataFrame(features, columns=columns)

In [None]:
df_text['y'] = [float(x) for x in positive['Sentiment']]
df_text

In [None]:
# Save feature vectors as csv
df_text.to_csv('positive.csv')

### Create Feature Vectors from Generated Data

In [None]:
vector_size = 100
w2v_model = Word2Vec(text, vector_size=vector_size, min_count=1)

In [None]:
vocab = w2v_model.wv.index_to_key
vocab_length = len(vocab)

print(f'Vocabulary Size: {format(vocab_length)}')

In [None]:
vocab

In [None]:
vectors = [w2v_model.wv[word] for word in vocab]
len(vectors[0])

In [None]:
def sentence_to_vector_averaging(sentence, model):
    embeddings = [model.wv[word] for word in sentence if word in vocab]
    
    if len(embeddings) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(embeddings, axis=0)

In [None]:
features = []

for sentence in text:
    features.append(sentence_to_vector_averaging(sentence, w2v_model))

In [None]:
columns = [f'x{i}' for i in range(w2v_model.vector_size)]

df_text = pd.DataFrame(features, columns=columns)

In [None]:
df_text['y'] = [1] * len(df_text)
df_text

In [None]:
# Save feature vectors as csv
df_text.to_csv('positive_generated.csv')