# Positive Tone Data Generator

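This notebook trains a multilayer perceptron (MLP) that predicts the next word from a fixed-size window of preceding words in `positive.csv`, and then uses the trained model to generate new, positively toned text.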

TODO: CHANGE THIS SO IT READS FROM ONE CSV WITH ONE COLUMN FOR ALL TEXT AND ONE COLUMN FOR SENTIMENT, THEN THIS NOTEBOOK OUTPUTS POS

### Import Libraries

In [1]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import nltk
from nltk.tokenize import regexp_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import csv
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Irish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Irish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Input

In [2]:
filename = 'positive.csv'

# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed even if reading fails (the handle was previously leaked).
with open(filename, 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Preview the first 1000 characters
raw_text[:1000]

'Billionaire investor and Shark Tank star Mark Cuban has launched a generic drug company called the Mark Cuban Cost Plus Drug Company.  The new company\'s website says it is dedicated to producing low-cost versions of high-cost generic drugs and pledges to provide radical transparency in how we price our drugs.  The Cost Plus company will publish the costs to manufacture", distribute and market its drugs to pharmacies. It said it adds a 15 percent margin to get its wholesale prices and make sure it remains profitable", but that there are no middlemen and no rebates for insurance companies.  Everybody gets the same low price for every drug we make", the company website states.  The first drug it will produce is albendazole", an antiparasitic drug. The company is hoping to introduce more than 100 drugs by the end of 2021", as well as build a pharmaceutical factory in Dallas by 2022.    The Mark Cuban Cost Plus Drug Company has also partnered with Baylor College of Medicine in Houston to 

In [3]:
# Strip every character outside the 7-bit ASCII range, then lowercase what remains
non_ascii_pattern = re.compile(r'[^\x00-\x7f]')
processed_text = non_ascii_pattern.sub('', raw_text).lower()
processed_text[:1000]

'billionaire investor and shark tank star mark cuban has launched a generic drug company called the mark cuban cost plus drug company.  the new company\'s website says it is dedicated to producing low-cost versions of high-cost generic drugs and pledges to provide radical transparency in how we price our drugs.  the cost plus company will publish the costs to manufacture", distribute and market its drugs to pharmacies. it said it adds a 15 percent margin to get its wholesale prices and make sure it remains profitable", but that there are no middlemen and no rebates for insurance companies.  everybody gets the same low price for every drug we make", the company website states.  the first drug it will produce is albendazole", an antiparasitic drug. the company is hoping to introduce more than 100 drugs by the end of 2021", as well as build a pharmaceutical factory in dallas by 2022.    the mark cuban cost plus drug company has also partnered with baylor college of medicine in houston to 

In [4]:
# Tokenise the text: runs of whitespace (other than carriage returns) and the
# characters . , ; " are treated as separators (gaps) between tokens
split_pattern = r'[^\S\r]+|[\.,;"]'
raw_tokens = regexp_tokenize(processed_text, pattern=split_pattern, gaps=True)

# Break each token on any remaining line boundaries and flatten in a single pass
word_tokens = [piece for token in raw_tokens for piece in token.splitlines()]
word_tokens.append('\n')
print(f"Number of word tokens: {len(word_tokens)}")
word_tokens

Number of word tokens: 12827


['billionaire',
 'investor',
 'and',
 'shark',
 'tank',
 'star',
 'mark',
 'cuban',
 'has',
 'launched',
 'a',
 'generic',
 'drug',
 'company',
 'called',
 'the',
 'mark',
 'cuban',
 'cost',
 'plus',
 'drug',
 'company',
 'the',
 'new',
 "company's",
 'website',
 'says',
 'it',
 'is',
 'dedicated',
 'to',
 'producing',
 'low-cost',
 'versions',
 'of',
 'high-cost',
 'generic',
 'drugs',
 'and',
 'pledges',
 'to',
 'provide',
 'radical',
 'transparency',
 'in',
 'how',
 'we',
 'price',
 'our',
 'drugs',
 'the',
 'cost',
 'plus',
 'company',
 'will',
 'publish',
 'the',
 'costs',
 'to',
 'manufacture',
 'distribute',
 'and',
 'market',
 'its',
 'drugs',
 'to',
 'pharmacies',
 'it',
 'said',
 'it',
 'adds',
 'a',
 '15',
 'percent',
 'margin',
 'to',
 'get',
 'its',
 'wholesale',
 'prices',
 'and',
 'make',
 'sure',
 'it',
 'remains',
 'profitable',
 'but',
 'that',
 'there',
 'are',
 'no',
 'middlemen',
 'and',
 'no',
 'rebates',
 'for',
 'insurance',
 'companies',
 'everybody',
 'gets',


In [5]:
# Deduplicate the tokens and sort them so the vocabulary order is deterministic
unique_words = sorted(set(word_tokens))
print(f"Number of unique word tokens: {len(unique_words)}")

Number of unique word tokens: 3679


In [6]:
# Create vocabulary of word tokens.
# This binds a second name to the same list object as unique_words (no copy is made).
word_vocabulary = unique_words
word_vocabulary

['\n',
 '#crushplastic',
 '#freebritney',
 '$1',
 '$113',
 '$159',
 '$399',
 '$5',
 '$5m',
 '$60',
 '$99',
 '&',
 "'",
 "'attorney",
 "'can",
 "'department",
 "'father",
 "'i",
 "'i've",
 "'law",
 "'listen",
 "'trash",
 "'very",
 "'we",
 "'yeah",
 '(10)',
 '(11)',
 '(14)',
 '(233',
 '(5',
 '(96',
 '(@salkhanacademy)',
 '(aal)',
 '(adobestock)',
 '(brc)',
 '(d-fl)',
 '(dal)',
 '(dea)',
 '(foia)',
 '(inntil',
 '(jamie)',
 '(jblu)',
 '(kise)',
 '(known',
 '(maps)',
 '(nida)',
 '(people)',
 '(pictured)',
 '(ptsd)',
 '(reuters)',
 '(right)',
 '(sri)',
 '(the',
 '(ual)',
 '(where',
 ')',
 '-',
 '-dresser',
 '0',
 '000',
 '000)',
 '00:40',
 '1',
 '1%',
 '10',
 '100',
 '106',
 '10p',
 '11',
 '110',
 '12',
 '12-acre',
 '12-year',
 '12-year-old',
 '12bn',
 '13',
 '13%',
 '14',
 '1424',
 '15',
 '150',
 '16',
 '17',
 '18',
 '1861-65',
 '19',
 '1920s',
 '1953',
 '1967-74',
 '1968',
 '1970',
 '1990',
 '1998',
 '2',
 '20',
 '200',
 '2001',
 '2007',
 '2008',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014'

In [7]:
# Map every vocabulary position to its word (index -> word)
indices_words = dict(enumerate(unique_words))
indices_words

{0: '\n',
 1: '#crushplastic',
 2: '#freebritney',
 3: '$1',
 4: '$113',
 5: '$159',
 6: '$399',
 7: '$5',
 8: '$5m',
 9: '$60',
 10: '$99',
 11: '&',
 12: "'",
 13: "'attorney",
 14: "'can",
 15: "'department",
 16: "'father",
 17: "'i",
 18: "'i've",
 19: "'law",
 20: "'listen",
 21: "'trash",
 22: "'very",
 23: "'we",
 24: "'yeah",
 25: '(10)',
 26: '(11)',
 27: '(14)',
 28: '(233',
 29: '(5',
 30: '(96',
 31: '(@salkhanacademy)',
 32: '(aal)',
 33: '(adobestock)',
 34: '(brc)',
 35: '(d-fl)',
 36: '(dal)',
 37: '(dea)',
 38: '(foia)',
 39: '(inntil',
 40: '(jamie)',
 41: '(jblu)',
 42: '(kise)',
 43: '(known',
 44: '(maps)',
 45: '(nida)',
 46: '(people)',
 47: '(pictured)',
 48: '(ptsd)',
 49: '(reuters)',
 50: '(right)',
 51: '(sri)',
 52: '(the',
 53: '(ual)',
 54: '(where',
 55: ')',
 56: '-',
 57: '-dresser',
 58: '0',
 59: '000',
 60: '000)',
 61: '00:40',
 62: '1',
 63: '1%',
 64: '10',
 65: '100',
 66: '106',
 67: '10p',
 68: '11',
 69: '110',
 70: '12',
 71: '12-acre',
 72

In [8]:
# Map every word to its vocabulary position (word -> index)
word_indices = {word: index for index, word in enumerate(unique_words)}
word_indices

{'\n': 0,
 '#crushplastic': 1,
 '#freebritney': 2,
 '$1': 3,
 '$113': 4,
 '$159': 5,
 '$399': 6,
 '$5': 7,
 '$5m': 8,
 '$60': 9,
 '$99': 10,
 '&': 11,
 "'": 12,
 "'attorney": 13,
 "'can": 14,
 "'department": 15,
 "'father": 16,
 "'i": 17,
 "'i've": 18,
 "'law": 19,
 "'listen": 20,
 "'trash": 21,
 "'very": 22,
 "'we": 23,
 "'yeah": 24,
 '(10)': 25,
 '(11)': 26,
 '(14)': 27,
 '(233': 28,
 '(5': 29,
 '(96': 30,
 '(@salkhanacademy)': 31,
 '(aal)': 32,
 '(adobestock)': 33,
 '(brc)': 34,
 '(d-fl)': 35,
 '(dal)': 36,
 '(dea)': 37,
 '(foia)': 38,
 '(inntil': 39,
 '(jamie)': 40,
 '(jblu)': 41,
 '(kise)': 42,
 '(known': 43,
 '(maps)': 44,
 '(nida)': 45,
 '(people)': 46,
 '(pictured)': 47,
 '(ptsd)': 48,
 '(reuters)': 49,
 '(right)': 50,
 '(sri)': 51,
 '(the': 52,
 '(ual)': 53,
 '(where': 54,
 ')': 55,
 '-': 56,
 '-dresser': 57,
 '0': 58,
 '000': 59,
 '000)': 60,
 '00:40': 61,
 '1': 62,
 '1%': 63,
 '10': 64,
 '100': 65,
 '106': 66,
 '10p': 67,
 '11': 68,
 '110': 69,
 '12': 70,
 '12-acre': 71,
 '1

In [9]:
# Build the training pairs with a sliding window over the token stream:
#   x (input)  - block_size consecutive words
#   y (target) - the single word that immediately follows that window
# The model should learn to predict y from x

block_size = 10
step = 1

# Start offsets of every window that still has a following word to use as target
window_starts = range(0, len(word_tokens) - block_size, step)

x = [word_tokens[start: start + block_size] for start in window_starts]
y = [word_tokens[start + block_size] for start in window_starts]

In [10]:
# Inspect x: show the first context window (a slice of block_size consecutive word tokens)
x[0]

['billionaire',
 'investor',
 'and',
 'shark',
 'tank',
 'star',
 'mark',
 'cuban',
 'has',
 'launched']

In [11]:
# One-hot encode x: every window becomes a (words-in-window, vocabulary-size)
# matrix with a single 1 per row at the word's vocabulary index
vocab_size = len(unique_words)
encoded_blocks = []

for block in x:
    block_indices = [word_indices[word] for word in block]

    block_matrix = np.zeros((len(block_indices), vocab_size))
    block_matrix[np.arange(len(block_indices)), block_indices] = 1

    encoded_blocks.append(block_matrix)

x_encoded = np.array(encoded_blocks)
x_encoded[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Inspect y: the next-word target paired with each window in x
y

['a',
 'generic',
 'drug',
 'company',
 'called',
 'the',
 'mark',
 'cuban',
 'cost',
 'plus',
 'drug',
 'company',
 'the',
 'new',
 "company's",
 'website',
 'says',
 'it',
 'is',
 'dedicated',
 'to',
 'producing',
 'low-cost',
 'versions',
 'of',
 'high-cost',
 'generic',
 'drugs',
 'and',
 'pledges',
 'to',
 'provide',
 'radical',
 'transparency',
 'in',
 'how',
 'we',
 'price',
 'our',
 'drugs',
 'the',
 'cost',
 'plus',
 'company',
 'will',
 'publish',
 'the',
 'costs',
 'to',
 'manufacture',
 'distribute',
 'and',
 'market',
 'its',
 'drugs',
 'to',
 'pharmacies',
 'it',
 'said',
 'it',
 'adds',
 'a',
 '15',
 'percent',
 'margin',
 'to',
 'get',
 'its',
 'wholesale',
 'prices',
 'and',
 'make',
 'sure',
 'it',
 'remains',
 'profitable',
 'but',
 'that',
 'there',
 'are',
 'no',
 'middlemen',
 'and',
 'no',
 'rebates',
 'for',
 'insurance',
 'companies',
 'everybody',
 'gets',
 'the',
 'same',
 'low',
 'price',
 'for',
 'every',
 'drug',
 'we',
 'make',
 'the',
 'company',
 'websi

In [13]:
# Translate every target word in y into its vocabulary index
y_ints = [word_indices[target_word] for target_word in y]
y_ints

[194,
 1538,
 1162,
 816,
 660,
 3278,
 2112,
 940,
 889,
 2533,
 1162,
 816,
 3278,
 2279,
 817,
 3575,
 2913,
 1872,
 1862,
 1006,
 3331,
 2609,
 2071,
 3495,
 2341,
 1674,
 1538,
 1163,
 356,
 2530,
 3331,
 2649,
 2688,
 3366,
 1769,
 1722,
 3570,
 2588,
 2403,
 1163,
 3278,
 889,
 2533,
 816,
 3607,
 2661,
 3278,
 892,
 3331,
 2104,
 1100,
 356,
 2113,
 1876,
 1163,
 3331,
 2499,
 1872,
 2896,
 1872,
 248,
 194,
 79,
 2477,
 2107,
 3331,
 1541,
 1876,
 3600,
 2589,
 356,
 2094,
 3208,
 1872,
 2769,
 2615,
 653,
 3275,
 3283,
 412,
 2295,
 2168,
 356,
 2295,
 2721,
 1462,
 1831,
 815,
 1284,
 1542,
 3278,
 2903,
 2070,
 2588,
 1462,
 1283,
 1162,
 3570,
 2094,
 3278,
 816,
 3575,
 3139,
 3278,
 1419,
 1162,
 1872,
 3607,
 2606,
 1862,
 305,
 350,
 383,
 1162,
 3278,
 816,
 1862,
 1710,
 3331,
 1845,
 2207,
 3271,
 65,
 1163,
 655,
 3278,
 1231,
 2341,
 112,
 426,
 3582,
 426,
 643,
 194,
 2498,
 1335,
 1769,
 960,
 655,
 113,
 3278,
 2112,
 940,
 889,
 2533,
 1162,
 816,
 1635,
 331

In [14]:
# One-hot encode y: one row per target, with a single 1 at the target's vocabulary index
vocab_size = len(unique_words)
target_rows = []

for target_index in y_ints:
    target_row = np.zeros(vocab_size)
    target_row[target_index] = 1
    target_rows.append(target_row)

y_encoded = np.array(target_rows)
y_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [15]:
class TextGenerator(nn.Module):
    """Feed-forward (MLP) next-word predictor over a flattened one-hot context window.

    The network is three linear layers: input -> embedding_dim (sigmoid),
    embedding_dim -> hidden_dim (tanh), hidden_dim -> output_dim (softmax).

    Args:
        input_dim: Number of features in one flattened input sample.
        output_dim: Number of output classes (one per vocabulary word).
        block_size: Number of context words per sample. Accepted for interface
            compatibility only; no layer size is derived from it.
        embedding_dim: Width of the first projection. Defaults to 100, the
            value that was previously hard-coded as ``50 * 2``.
        hidden_dim: Width of the hidden layer. Defaults to 200, the value that
            was previously hard-coded.
    """

    def __init__(self, input_dim, output_dim, block_size, embedding_dim=100, hidden_dim=200):
        super().__init__()

        # Layers are created in the same order as before so parameter
        # initialisation consumes the random stream identically.
        self.embeddings = nn.Linear(input_dim, embedding_dim)
        self.hidden = nn.Linear(embedding_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return a probability distribution over the output classes.

        Args:
            x: 2-D float tensor whose second dimension has ``input_dim`` features.

        Returns:
            Tensor with ``output_dim`` columns whose rows each sum to 1, because
            softmax is applied over dim=1. These are probabilities, not logits,
            so they must not be fed to a loss that applies softmax again
            (for example ``nn.CrossEntropyLoss``).
        """
        x = self.sigmoid(self.embeddings(x))
        x = self.tanh(self.hidden(x))
        x = self.softmax(self.output(x))

        return x

In [16]:
# Number of features in one flattened training sample (every element of the first encoded window)
input_size = x_encoded[0].size
input_size

36790

In [17]:
# Build the network: flattened window in, one score per vocabulary word out
model = TextGenerator(
    input_dim=input_size,
    output_dim=len(unique_words),
    block_size=block_size,
)

# Show the layer layout
model

TextGenerator(
  (embeddings): Linear(in_features=36790, out_features=100, bias=True)
  (hidden): Linear(in_features=100, out_features=200, bias=True)
  (output): Linear(in_features=200, out_features=3679, bias=True)
  (tanh): Tanh()
  (sigmoid): Sigmoid()
  (softmax): Softmax(dim=1)
)

In [18]:
# Create custom Dataset class
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Pairs each input sample with its target and flattens the input on access."""

    def __init__(self, x, y):
        # Keep references to the inputs and targets; the sample count comes from x
        self.x, self.y = x, y
        self.n_samples = len(x)

    def __len__(self):
        """Number of samples in the dataset."""
        return self.n_samples

    def __getitem__(self, index):
        """Return (flattened input, target) for the sample at ``index``."""
        flat_input = self.x[index].ravel()
        target = self.y[index]
        return flat_input, target

In [19]:
# Turn the one-hot arrays into float32 PyTorch tensors on the chosen device
device = 'cpu'
x = torch.tensor(x_encoded, dtype=torch.float32, device=device)
y = torch.tensor(y_encoded, dtype=torch.float32, device=device)

In [20]:
# Wrap the input and target tensors in the custom Dataset
training_ds = CustomDataset(x=x, y=y)

In [21]:
# Serve the training dataset in fixed-size batches, in dataset order, keeping the final partial batch
from torch.utils.data import DataLoader

batch_size = 5

loader_options = dict(batch_size=batch_size, shuffle=False, drop_last=False)
train_loader = DataLoader(training_ds, **loader_options)

In [22]:
# Define model optimizer and loss function
# NOTE(review): lr=1e-9 is many orders of magnitude below Adam's default (1e-3),
# so weight updates will be tiny - confirm this value is intentional.
optimizer = optim.Adam(model.parameters(), lr=0.000000001)


def criterion(predictions, targets):
    """Cross-entropy between predicted probabilities and one-hot targets.

    TextGenerator.forward already ends in a softmax, so its output is a
    probability distribution. nn.CrossEntropyLoss expects unnormalised logits
    and applies log-softmax itself; feeding it probabilities squashes them a
    second time and pins the loss near log(vocabulary size). This function
    takes the log of the probabilities directly instead.

    Args:
        predictions: Tensor of per-class probabilities, one row per sample.
        targets: Tensor of the same shape holding one-hot (or soft) targets.

    Returns:
        Scalar tensor: the mean over samples of -sum(targets * log(predictions)).
    """
    # Clamp so an exactly-zero probability cannot produce log(0) = -inf
    log_probabilities = torch.log(predictions.clamp_min(1e-12))
    return -(targets * log_probabilities).sum(dim=1).mean()

In [23]:
# Define function to train model
def train_fn(loader, model, optimizer, loss_fn, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        loader: Iterable yielding (data, targets) tensor pairs.
        model: Module to train; it is called on each data batch.
        optimizer: Optimizer that updates the model's parameters.
        loss_fn: Callable taking (predictions, targets) and returning a scalar tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The average of the batch losses as a float, or NaN when the loader
        yields no batches (previously this raised ZeroDivisionError).
    """
    # generate_text() switches the model to eval mode; make sure training
    # always runs in train mode regardless of what ran before.
    model.train()

    progress = tqdm(loader)

    total_loss = 0.0
    batch_count = 0
    for data, targets in progress:
        data = data.to(device=device)
        targets = targets.to(device=device)

        # Forward: call the module itself (not .forward) so registered hooks run
        predictions = model(data)
        loss = loss_fn(predictions, targets)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update tqdm loading bar
        batch_loss = loss.item()
        progress.set_postfix(loss=batch_loss)

        total_loss += batch_loss
        batch_count += 1

    if batch_count == 0:
        return float('nan')

    return total_loss / batch_count

In [24]:
def generate_text(model, seed_text, num_words):
    """Greedily extend ``seed_text`` by ``num_words`` words.

    The seed is split on whitespace and one-hot encoded into a sliding context
    window. At every step the window is flattened to a single row, the model's
    highest-scoring word is appended to the text, and the window slides forward
    by one word.

    Args:
        model: Module mapping a (1, window_length * vocabulary_size) tensor to
            one score per vocabulary word. The number of seed words must
            therefore match the context length the model was built for.
        seed_text: Whitespace-separated words, each present in ``word_indices``.
        num_words: How many words to generate.

    Returns:
        ``seed_text`` followed by the generated words, separated by single spaces.

    Raises:
        ValueError: If ``seed_text`` contains no words.
        KeyError: If a seed word is not in the vocabulary.
    """
    device = 'cpu'
    model.eval()

    seed_words = seed_text.split()
    if not seed_words:
        raise ValueError("seed_text must contain at least one word")

    # Convert seed_text to a (1, window_length, vocabulary_size) one-hot tensor
    vocab_size = len(unique_words)
    seed_encoded = np.zeros((1, len(seed_words), vocab_size))
    for position, word in enumerate(seed_words):
        seed_encoded[0, position, word_indices[word]] = 1
    seed_tensor = torch.tensor(seed_encoded).float().to(device)

    # Generate text
    generated_text = seed_text
    with torch.no_grad():  # inference only; no gradients needed
        for _ in range(num_words):
            # The model consumes the window flattened into a single feature row
            predictions = model(seed_tensor.reshape(1, -1))
            predicted_index = torch.argmax(predictions, dim=1).item()
            generated_text += ' ' + indices_words[predicted_index]

            # Slide the window: drop the oldest word and append the prediction.
            # The new word is shaped (1, 1, vocabulary_size) so it can be
            # concatenated along the word axis of the 3-D window.
            next_word = torch.zeros(1, 1, vocab_size, device=device)
            next_word[0, 0, predicted_index] = 1
            seed_tensor = torch.cat((seed_tensor[:, 1:, :], next_word), dim=1)

    return generated_text

In [25]:
# Train model and record the average loss of every epoch
epochs = 2 # TODO: CHANGE TO 300 ON FINAL DATA; CURRENTLY 2 FOR TESTING PURPOSES
average_losses = []

for epoch in range(epochs):
    print("Epoch: {}".format(epoch))
    ave_loss = train_fn(
        loader=train_loader,
        model=model,
        optimizer=optimizer,
        loss_fn=criterion,
        device=device,
    )

    print("Ave Loss: {}".format(ave_loss))
    average_losses.append(ave_loss)

Epoch: 0


 86%|█████████████████████████████████████████████████████████▍         | 2197/2564 [02:13<00:22, 16.47it/s, loss=8.21]


KeyboardInterrupt: 

In [None]:
# Inspect probability distribution of word tokens
import matplotlib.pyplot as plt
import pandas as pd

# The model was built for block_size one-hot words flattened into one row, so
# the probe context must contain exactly block_size tokens (a 2-word context
# produces an input of the wrong width and fails inside the first layer).
text = ['\n'] * block_size
x_ints = [word_indices[item] for item in text]

# One-hot encode the context: one row per word, a single 1 at its vocabulary index
x_vector = np.zeros((len(x_ints), len(unique_words)))
x_vector[np.arange(len(x_ints)), x_ints] = 1

# Flatten to a single (1, block_size * vocabulary_size) sample
initial_input = torch.tensor(x_vector.reshape(1, -1)).to(dtype=torch.float32)
output = model(initial_input)[0].detach().cpu().numpy()

df = pd.DataFrame(output)
df.plot.bar()
plt.yscale("log")
plt.show()

In [None]:
# Generate text sample from model output
word_count = 100
text = []
paragraphs = 5

# Sliding context window. The model was built for block_size words per sample,
# so the window must hold exactly block_size tokens (a 2-word window produces
# an input of the wrong width). It starts as all newline tokens and is carried
# over from one paragraph to the next.
context = ['\n'] * block_size

for p in range(paragraphs):
    for i in range(word_count):
        x_ints = [word_indices[item] for item in context]

        # One-hot encode the window: one row per word
        x_vector = np.zeros((len(x_ints), len(unique_words)))
        x_vector[np.arange(len(x_ints)), x_ints] = 1

        # Flatten to a single (1, block_size * vocabulary_size) sample
        initial_input = torch.tensor(x_vector.reshape(1, -1)).float()

        output = model(initial_input)[0].detach().cpu().numpy()

        # Workaround to fix occasional sum(pvals[:-1]) > 1.0  bug from implicit casting in np.random.multinomial
        output = output.astype(float)
        output /= output.sum()

        # Draw one word from the predicted distribution
        index = np.where(np.random.multinomial(1, output) == 1)[0][0]
        next_word = indices_words[index]
        text.append(next_word)

        # Use generated word from this run as part of the seed for the next run
        context = context[1:] + [next_word]

    # NOTE(review): this raw string appends a literal backslash-n pair, not real
    # line breaks - confirm whether '\n\n' was intended.
    text.append(r'\n\n')

In [None]:
print("Generated Text:")
# Join into a separate name: rebinding `text` to the joined string would make a
# re-run of this cell join the string's individual characters instead.
generated_text = ' '.join(text)
print(generated_text)

In [None]:
# Append one row to the results CSV.
# newline='' is what the csv module requires for files it writes to; without it
# an extra blank line appears after every row on Windows. The encoding is set
# explicitly so the output does not depend on the platform default.
# NOTE(review): the row holds the constant string 'True', not the generated
# text - confirm this is the intended output.
with open('positive_generated.csv', 'a', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['result']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writerow({'result': 'True'})