# Positive Tone Data Generator

This notebook contains code for an MLP neural network that generates positively toned data based on the dataset. Feature vectors are then made from the positively toned data and generated data then saved into .csv files.

### Import Libraries

In [1]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import pandas as pd
from gensim.models import Word2Vec
from nltk import regexp_tokenize, WordNetLemmatizer

In [2]:
import warnings
warnings.filterwarnings("ignore")

### Input

In [3]:
df_raw = pd.read_csv('poem_sentiment.csv', header=None, index_col=0, names=['Text', 'Sentiment'])
df_raw

Unnamed: 0,Text,Sentiment
0,with pale blue berries. in these peaceful shad...,1.0
1,it flows so long as falls the rain,0.0
2,"and that is why, the lonesome day",-1.0
3,"when i peruse the conquered fame of heroes, an...",2.0
4,of inward strife for truth and liberty.,2.0
...,...,...
887,to his ears there came a murmur of far seas be...,0.0
888,"the one good man in the world who knows me, --",1.0
889,faint voices lifted shrill with pain,-1.0
890,"an', fust you knowed on, back come charles the...",0.0


In [4]:
positive = df_raw[df_raw['Sentiment'] > 0]
positive

Unnamed: 0,Text,Sentiment
0,with pale blue berries. in these peaceful shad...,1.0
3,"when i peruse the conquered fame of heroes, an...",2.0
4,of inward strife for truth and liberty.,2.0
5,the red sword sealed their vows!,2.0
16,that has a charmingly bourbon air.,1.0
...,...,...
870,their first-born brother as a god.,1.0
876,and so i should be loved and mourned to-night.,2.0
877,"and _channing_, with his bland, superior look",2.0
884,"how your soft opera-music changed, and the dru...",2.0


### Text Preprocessing

In [5]:
raw_text = positive['Text'].to_string(index=False)
raw_text[:1000]

'with pale blue berries. in these peaceful shades--\nwhen i peruse the conquered fame of heroes, and...\n           of inward strife for truth and liberty.\n                  the red sword sealed their vows!\n                that has a charmingly bourbon air.\n          brightly expressive as the twins of leda\n               in monumental pomp! no grecian drop\n                    the hostile cohorts melt away;\nand lips where heavenly smiles would hang and b...\n                         honour to the bugle-horn!\n                       if the pure and holy angels\n        upon the thought of perfect noon. and when\n      thy hands all cunning arts that women prize.\n             reasoning to admiration, and with mee\n           it shines superior on a throne of gold:\n    take the warm welcome of new friends with thee\n                  augmented, sweet, a hundred fold\n                every day a rich reward will give;\n                                 gay little heart!\n         am

In [6]:
# Remove all non-ASCII characters
processed_text = re.sub(r'[^\x00-\x7f]', r'', raw_text).lower()
processed_text[:1000]

'with pale blue berries. in these peaceful shades--\nwhen i peruse the conquered fame of heroes, and...\n           of inward strife for truth and liberty.\n                  the red sword sealed their vows!\n                that has a charmingly bourbon air.\n          brightly expressive as the twins of leda\n               in monumental pomp! no grecian drop\n                    the hostile cohorts melt away;\nand lips where heavenly smiles would hang and b...\n                         honour to the bugle-horn!\n                       if the pure and holy angels\n        upon the thought of perfect noon. and when\n      thy hands all cunning arts that women prize.\n             reasoning to admiration, and with mee\n           it shines superior on a throne of gold:\n    take the warm welcome of new friends with thee\n                  augmented, sweet, a hundred fold\n                every day a rich reward will give;\n                                 gay little heart!\n         am

### Tokenization

In [7]:
# Get word tokens from text
word_tokens = regexp_tokenize(processed_text, pattern=r'[^\S\r]+|[\.,:;!?()--_"]', gaps=True)
word_tokens.append('\n')
print(f"Number of word tokens: {len(word_tokens)}")
word_tokens[:10]

Number of word tokens: 1293


['with',
 'pale',
 'blue',
 'berries',
 'in',
 'these',
 'peaceful',
 'shades',
 'when',
 'i']

In [8]:
# Lemmatization done to make uncommon words more likely to be recognized by 
# Word2Vec model later when converting to feature vectors
lemmatizer = WordNetLemmatizer()
word_tokens = [lemmatizer.lemmatize(token) for token in word_tokens] # Lemmatize nouns
word_tokens = [lemmatizer.lemmatize(token, 'v') for token in word_tokens] # Lemmatize verbs

In [9]:
# Get unique word tokens from word tokens
unique_words = sorted(list(set(word_tokens)))
print(f"Number of unique word tokens: {len(unique_words)}")

Number of unique word tokens: 629


In [10]:
# Create vocabulary of word tokens
word_vocabulary = unique_words
word_vocabulary[:10]

['\n',
 "'tis",
 'a',
 'abide',
 'abloom',
 'about',
 'accordance',
 'adam',
 'adept',
 'admiration']

### Create word-index mappings

In [11]:
# Create index-word mappings 
indices_words = dict((index, word) for index, word in enumerate(unique_words))
indices_words

{0: '\n',
 1: "'tis",
 2: 'a',
 3: 'abide',
 4: 'abloom',
 5: 'about',
 6: 'accordance',
 7: 'adam',
 8: 'adept',
 9: 'admiration',
 10: 'after',
 11: 'again',
 12: 'ah',
 13: 'air',
 14: 'all',
 15: 'already',
 16: 'amidst',
 17: 'among',
 18: 'an',
 19: 'and',
 20: 'angel',
 21: 'angry',
 22: 'arm',
 23: 'around',
 24: 'art',
 25: 'ascend',
 26: 'ash',
 27: 'aspire',
 28: 'assay',
 29: 'augment',
 30: 'away',
 31: 'awe',
 32: 'ay',
 33: 'b',
 34: 'bare',
 35: 'be',
 36: 'bear',
 37: 'beauteous',
 38: 'beautiful',
 39: 'beauty',
 40: "beauty'",
 41: 'because',
 42: 'before',
 43: 'bell',
 44: 'bend',
 45: 'beneath',
 46: 'berry',
 47: 'best',
 48: 'betray',
 49: 'between',
 50: 'blade',
 51: 'bland',
 52: 'blaze',
 53: 'bless',
 54: 'blind',
 55: 'blue',
 56: 'bolt',
 57: 'borrow',
 58: 'boston',
 59: 'bourbon',
 60: 'bow',
 61: 'brave',
 62: 'braver',
 63: 'breast',
 64: 'bright',
 65: 'brightly',
 66: 'brilliant',
 67: 'brother',
 68: "brynhilda's",
 69: 'bugle',
 70: 'burn',
 71: '

In [12]:
# Create word-index mappings
word_indices = dict((word, index) for index, word in enumerate(unique_words))
word_indices

{'\n': 0,
 "'tis": 1,
 'a': 2,
 'abide': 3,
 'abloom': 4,
 'about': 5,
 'accordance': 6,
 'adam': 7,
 'adept': 8,
 'admiration': 9,
 'after': 10,
 'again': 11,
 'ah': 12,
 'air': 13,
 'all': 14,
 'already': 15,
 'amidst': 16,
 'among': 17,
 'an': 18,
 'and': 19,
 'angel': 20,
 'angry': 21,
 'arm': 22,
 'around': 23,
 'art': 24,
 'ascend': 25,
 'ash': 26,
 'aspire': 27,
 'assay': 28,
 'augment': 29,
 'away': 30,
 'awe': 31,
 'ay': 32,
 'b': 33,
 'bare': 34,
 'be': 35,
 'bear': 36,
 'beauteous': 37,
 'beautiful': 38,
 'beauty': 39,
 "beauty'": 40,
 'because': 41,
 'before': 42,
 'bell': 43,
 'bend': 44,
 'beneath': 45,
 'berry': 46,
 'best': 47,
 'betray': 48,
 'between': 49,
 'blade': 50,
 'bland': 51,
 'blaze': 52,
 'bless': 53,
 'blind': 54,
 'blue': 55,
 'bolt': 56,
 'borrow': 57,
 'boston': 58,
 'bourbon': 59,
 'bow': 60,
 'brave': 61,
 'braver': 62,
 'breast': 63,
 'bright': 64,
 'brightly': 65,
 'brilliant': 66,
 'brother': 67,
 "brynhilda's": 68,
 'bugle': 69,
 'burn': 70,
 'burs

### Create Blocks

In [13]:
# Create x (input): Split text into blocks, where each block has the same amount of words
# Create y (targets): For each x input, the y is the word that comes next
# The model should learn to predict y from the input x

block_size = 2
step = 1

x = []
y = []

for i in range(0, len(word_tokens) - block_size, step):
    x.append(word_tokens[i: i+block_size])
    y.append(word_tokens[i + block_size])

In [14]:
# Inspect x
x[:5]

[['with', 'pale'],
 ['pale', 'blue'],
 ['blue', 'berry'],
 ['berry', 'in'],
 ['in', 'these']]

In [15]:
# Check number of blocks
len(x)

1291

### Create One-Hot Encoding

In [16]:
# Create one-hot encoding of x
x_encoded = []

for x_arr in x:
    x_ints = [word_indices[item] for item in x_arr]
    
    x_row = []
    for item in x_ints:
        x_vector = np.zeros(len(unique_words))
        x_vector[item] = 1
        x_row.append(x_vector)
        
    x_encoded.append(x_row)
    
x_encoded = np.array(x_encoded)

In [17]:
# Inspect y
y[:5]

['blue', 'berry', 'in', 'these', 'peaceful']

In [18]:
# Convert each word in y into their corresponding indices
y_ints = [word_indices[item] for item in y]
y_ints[:5]

[55, 46, 258, 540, 387]

In [19]:
# Create one-hot encoding of y
y_encoded = []

for item in y_ints:
    y_vector = np.zeros(len(unique_words))
    y_vector[item] = 1
    y_encoded.append(y_vector)

y_encoded = np.array(y_encoded)

### Model Configuration

In [20]:
class TextGenerator(nn.Module):
    def __init__(self, input_dim, output_dim, block_size):
        super().__init__()

        self.embeddings = nn.Linear(input_dim, 2000)
        self.hidden = nn.Linear(2000, 1200)
        self.output = nn.Linear(1200, output_dim)
        
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.sigmoid(self.embeddings(x))
        x = self.tanh(self.hidden(x))
        x = self.softmax(self.output(x))

        return x

In [21]:
# Get size of input for training the model
input_size = x_encoded[0].ravel().shape[0]
print(x_encoded[0].ravel().shape[0])

1258


In [22]:
# Allocate tensors to the device used for computation
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Performing torch operations on {device} device")

# Create x and y PyTorch tensors
x = torch.tensor(x_encoded).float().to(device)
y = torch.tensor(y_encoded).float().to(device)

Performing torch operations on cpu device


In [23]:
# Instantiate model
model = TextGenerator(input_size, len(unique_words), block_size).to(device)

# Print model configuration
model

TextGenerator(
  (embeddings): Linear(in_features=1258, out_features=2000, bias=True)
  (hidden): Linear(in_features=2000, out_features=1200, bias=True)
  (output): Linear(in_features=1200, out_features=629, bias=True)
  (tanh): Tanh()
  (sigmoid): Sigmoid()
  (softmax): Softmax(dim=1)
)

In [24]:
# Define model optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.00001)
criterion = nn.CrossEntropyLoss()

### Create Dataset & DataLoader

In [25]:
# Create custom Dataset class
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
        self.n_samples = len(x)
    
    def __getitem__(self, index):
        return self.x[index].ravel(), self.y[index]
    
    def __len__(self):
        return self.n_samples

In [26]:
# Create training dataset using custom Dataset class
training_ds = CustomDataset(x, y)

In [27]:
# Load training dataset into DataLoader
from torch.utils.data import DataLoader

batch_size = 10

train_loader = DataLoader(
    training_ds,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)

### Train Model

In [28]:
# Define function to train model
def train_fn(loader, model, optimizer, loss_fn, device):
    loop = tqdm(loader)

    ave_loss = 0
    count = 0 
    for batch_idx, (data, targets) in enumerate(loop):
        data = data.to(device=device)
        targets = targets.to(device=device)
        
        # Forward
        predictions = model.forward(data)
        loss = loss_fn(predictions, targets)
        
        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Update tqdm loading bar
        loop.set_postfix(loss=loss.item())

        count += 1
        ave_loss += loss.item()
    
    ave_loss = ave_loss / count

    return ave_loss

In [None]:
# Train model
epochs = 2 # TODO: CHANGE TO 300 ON FINAL DATA; CURRENTLY 2 FOR TESTING PURPOSES
average_losses = []

for epoch in range(epochs):
    print("Epoch: {}".format(epoch))
    ave_loss = train_fn(train_loader, model, optimizer, criterion, device)
    
    print("Ave Loss: {}".format(ave_loss))
    average_losses.append(ave_loss)

Epoch: 0


 82%|████████████████████████████████████████████████████████▊            | 107/130 [00:06<00:01, 13.38it/s, loss=6.45]

### Generate Text

In [None]:
def generate_text(model, seed_text, num_words):
    device = 'cpu'
    model.eval()
    
    # Convert seed_text to input tensor
    seed_encoded = []
    for word in seed_text.split():
        word_index = word_indices[word]
        word_encoded = np.zeros(len(unique_words))
        word_encoded[word_index] = 1
        seed_encoded.append(word_encoded)
    seed_encoded = np.array(seed_encoded)
    seed_encoded = np.expand_dims(seed_encoded, axis=0)
    seed_tensor = torch.tensor(seed_encoded).float().to(device)

    # Generate text
    generated_text = seed_text
    for i in range(num_words):
        predictions = model(seed_tensor)
        predicted_index = torch.argmax(predictions, dim=1).item()
        predicted_word = indices_words[predicted_index]
        generated_text += ' ' + predicted_word
        
        # Update seed tensor with predicted word
        predicted_encoded = np.zeros(len(unique_words))
        predicted_encoded[predicted_index] = 1
        predicted_encoded = np.expand_dims(predicted_encoded, axis=0)
        seed_tensor = torch.cat((seed_tensor[:, 1:, :], torch.tensor(predicted_encoded).float().to(device)), axis=1)

    return generated_text

In [None]:
# Generate text sample from model output
word_count = 100
text = []
paragraph_count = 5

# Length of phrase should be same as block_size
word1, word2 = "\n", "\n"

for p in range(paragraph_count):
    text.append([])
    
    for i in range(word_count):
        phrase = [word1, word2]
        x_ints = [word_indices[item] for item in phrase]
        x_vector = []

        for item in x_ints:
            x_item = np.zeros(len(unique_words))
            x_item[item] = 1
            x_vector.append(x_item)

        initial_input = torch.tensor([np.array([x_vector]).ravel()]).float()

        output = model(initial_input)[0].detach().cpu().numpy()

        # Workaround to fix occasional sum(pvals[:-1]) > 1.0  bug from implicit casting in np.random.multinomial 
        output = output.astype(float)
        output /= output.sum()

        index = np.where(np.random.multinomial(1, output) == 1)[0][0]
        word3 = indices_words[index]
        text[p].append(word3)

        # Use generated word from this run as seed for next run
        word1, word2 = word2, word3

In [None]:
for p in range(paragraph_count):
    print(f"Generated Paragraph {p}:")
    print(' '.join(text[p]))

### Create Feature Vectors from Input Data

In [None]:
sentences = []

# Tokenize by line
for index, row in positive.iterrows():
    tokenized_row = row['Text'].split(' ')
    
    # Preprocess using the same settings as preprocessing done before training model
    tokenized_row = regexp_tokenize(' '.join(tokenized_row), pattern=r'[^\S\r]+|[\.,;!?()--_"]', gaps=True)
    tokenized_row = [lemmatizer.lemmatize(token) for token in tokenized_row] # Lemmatize nouns
    tokenized_row = [lemmatizer.lemmatize(token, 'v') for token in tokenized_row] # Lemmatize verbs
    
    sentences.append(tokenized_row)
    
sentences

In [None]:
vector_size = 100
w2v_model = Word2Vec(sentences, vector_size=vector_size)

In [None]:
vocab = w2v_model.wv.index_to_key
vocab_length = len(vocab)

# Note: Low vocab size is because Word2Vec model isn't familiar with most of the words in our dataset
# From https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
print(f'Vocabulary Size: {format(vocab_length)}')

In [None]:
vocab

In [None]:
w2v_model.wv.similarity('my', 'me')

In [None]:
def sentence_to_vector_averaging(sentence, model):
    embeddings = [model.wv[word] for word in sentence if word in vocab]
    
    if len(embeddings) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(embeddings, axis=0)

In [None]:
features = []

for sentence in sentences:
    features.append(sentence_to_vector_averaging(sentence, w2v_model))

In [None]:
columns = [f'x{i}' for i in range(w2v_model.vector_size)]

df_text = pd.DataFrame(features, columns=columns)

In [None]:
df_text['y'] = [float(x) for x in positive['Sentiment']]
df_text

In [None]:
# Save feature vectors as csv
df_text.to_csv('positive.csv')

### Create Feature Vectors from Generated Data

In [None]:
vector_size = 100
w2v_model = Word2Vec(text, vector_size=vector_size)

In [None]:
vocab = w2v_model.wv.index_to_key
vocab_length = len(vocab)

# Note: Low vocab size is because Word2Vec model isn't familiar with most of the words in our dataset
# From https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
print(f'Vocabulary Size: {format(vocab_length)}')

In [None]:
vocab

In [None]:
vectors = [w2v_model.wv[word] for word in vocab]
len(vectors[0])

In [None]:
def sentence_to_vector_averaging(sentence, model):
    embeddings = [model.wv[word] for word in sentence if word in vocab]
    
    if len(embeddings) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(embeddings, axis=0)

In [None]:
features = []

for sentence in sentences:
    features.append(sentence_to_vector_averaging(sentence, w2v_model))

In [None]:
columns = [f'x{i}' for i in range(w2v_model.vector_size)]

df_text = pd.DataFrame(features, columns=columns)

In [None]:
df_text['y'] = [1] * len(df_text)
df_text

In [None]:
# Save feature vectors as csv
df_text.to_csv('positive_generated.csv')