# Negative Tone Data Generator

This notebook contains code for an MLP neural network that generates negatively toned data based on the dataset. Feature vectors are then made from the negatively toned data and generated data then saved into .csv files.

### Import Libraries

In [1]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import pandas as pd
from gensim.models import Word2Vec
from nltk import regexp_tokenize

In [2]:
import warnings
warnings.filterwarnings("ignore")

### Input

In [3]:
df_raw = pd.read_csv('poem_sentiment.csv', header=None, index_col=0, names=['Text', 'Sentiment'])
df_raw

Unnamed: 0,Text,Sentiment
0,with pale blue berries. in these peaceful shad...,1.0
1,it flows so long as falls the rain,0.0
2,"and that is why, the lonesome day",-1.0
3,"when i peruse the conquered fame of heroes, an...",2.0
4,of inward strife for truth and liberty.,2.0
...,...,...
887,to his ears there came a murmur of far seas be...,0.0
888,"the one good man in the world who knows me, --",1.0
889,faint voices lifted shrill with pain,-1.0
890,"an', fust you knowed on, back come charles the...",0.0


In [4]:
negative = df_raw[df_raw['Sentiment'] < 0]
negative

Unnamed: 0,Text,Sentiment
2,"and that is why, the lonesome day",-1.0
8,and so on. then a worthless gaud or two,-1.0
17,sounded o'er earth and sea its blast of war,-1.0
37,"want and woe, which torture us",-1.0
39,an echo returned on the cold gray morn,-1.0
...,...,...
874,"in town, an' not the leanest runt",-1.0
883,by death's frequented ways,-1.0
885,rejection of his humanness,-1.0
889,faint voices lifted shrill with pain,-1.0


### Text Preprocessing

In [5]:
raw_text = negative['Text'].to_string(index=False)
raw_text[:1000]

"                 and that is why, the lonesome day\n           and so on. then a worthless gaud or two\n       sounded o'er earth and sea its blast of war\n                    want and woe, which torture us\n            an echo returned on the cold gray morn\n       while i, ... i built up follies like a wall\n          ah, what a pang of aching sharp surprise\n                 and the old swallow-haunted barns\n     the which she bearing home it burned her nest\n    the crown of sorrow on their heads, their loss\n               i lay and watched the lonely gloom;\n          a sceptremonstrous, winged, intolerable.\n while the rude winds blow off each shadowy crown.\n         but o, nevermore can we prison him tight.\n                 may meditate a whole youth's loss\n        when thee, the eyes of that harsh long ago\n        the foes inclosing, and his friend pursued\nand bow to dread inquisitor and worship lords o...\n        miles off, three dangerous miles, is home;\n      else,

In [6]:
# Remove all non-ASCII characters
processed_text = re.sub(r'[^\x00-\x7f]', r'', raw_text).lower()
processed_text[:1000]

"                 and that is why, the lonesome day\n           and so on. then a worthless gaud or two\n       sounded o'er earth and sea its blast of war\n                    want and woe, which torture us\n            an echo returned on the cold gray morn\n       while i, ... i built up follies like a wall\n          ah, what a pang of aching sharp surprise\n                 and the old swallow-haunted barns\n     the which she bearing home it burned her nest\n    the crown of sorrow on their heads, their loss\n               i lay and watched the lonely gloom;\n          a sceptremonstrous, winged, intolerable.\n while the rude winds blow off each shadowy crown.\n         but o, nevermore can we prison him tight.\n                 may meditate a whole youth's loss\n        when thee, the eyes of that harsh long ago\n        the foes inclosing, and his friend pursued\nand bow to dread inquisitor and worship lords o...\n        miles off, three dangerous miles, is home;\n      else,

### Tokenization

In [7]:
# Get word tokens from text
word_tokens = regexp_tokenize(processed_text, pattern=r'[^\S\r]+|[\.,;!?()--_"]', gaps=True)
word_tokens.append('\n')
print(f"Number of word tokens: {len(word_tokens)}")
word_tokens[:10]

Number of word tokens: 1113


['and', 'that', 'is', 'why', 'the', 'lonesome', 'day', 'and', 'so', 'on']

In [8]:
# Get unique word tokens from word tokens
unique_words = sorted(list(set(word_tokens)))
print(f"Number of unique word tokens: {len(unique_words)}")

Number of unique word tokens: 615


In [9]:
# Create vocabulary of word tokens
word_vocabulary = unique_words
word_vocabulary[:10]

['\n',
 "'",
 "'twas",
 'a',
 'accomplish',
 'aching',
 'added',
 'adulterate',
 'afar',
 'after']

### Create word-index mappings

In [10]:
# Create index-word mappings 
indices_words = dict((index, word) for index, word in enumerate(unique_words))
indices_words

{0: '\n',
 1: "'",
 2: "'twas",
 3: 'a',
 4: 'accomplish',
 5: 'aching',
 6: 'added',
 7: 'adulterate',
 8: 'afar',
 9: 'after',
 10: 'age',
 11: 'ago',
 12: 'ah',
 13: 'air',
 14: 'all',
 15: 'altar',
 16: 'always',
 17: 'am',
 18: 'among',
 19: 'an',
 20: "an'",
 21: 'and',
 22: 'angel',
 23: 'answer',
 24: 'anxious',
 25: 'are',
 26: 'around',
 27: 'arrows',
 28: 'as',
 29: 'ashes',
 30: 'ask',
 31: 'at',
 32: 'augurs',
 33: 'avenging',
 34: 'away',
 35: 'b',
 36: 'bad',
 37: 'barns',
 38: 'barrenly',
 39: 'bat',
 40: 'be',
 41: 'beam',
 42: 'bearing',
 43: 'beat',
 44: 'beaten',
 45: 'because',
 46: 'become',
 47: 'beds',
 48: 'been',
 49: 'beguiled',
 50: 'behold',
 51: 'bitter',
 52: 'black',
 53: 'blankness',
 54: 'blast',
 55: 'bleeding',
 56: 'blending',
 57: 'blighting',
 58: 'blind',
 59: 'blindness',
 60: 'blood',
 61: 'blow',
 62: 'body',
 63: 'bound',
 64: 'bow',
 65: 'brand',
 66: 'breast',
 67: 'briareus',
 68: 'broke',
 69: 'built',
 70: 'burned',
 71: 'burning',
 72: 

In [11]:
# Create word-index mappings
word_indices = dict((word, index) for index, word in enumerate(unique_words))
word_indices

{'\n': 0,
 "'": 1,
 "'twas": 2,
 'a': 3,
 'accomplish': 4,
 'aching': 5,
 'added': 6,
 'adulterate': 7,
 'afar': 8,
 'after': 9,
 'age': 10,
 'ago': 11,
 'ah': 12,
 'air': 13,
 'all': 14,
 'altar': 15,
 'always': 16,
 'am': 17,
 'among': 18,
 'an': 19,
 "an'": 20,
 'and': 21,
 'angel': 22,
 'answer': 23,
 'anxious': 24,
 'are': 25,
 'around': 26,
 'arrows': 27,
 'as': 28,
 'ashes': 29,
 'ask': 30,
 'at': 31,
 'augurs': 32,
 'avenging': 33,
 'away': 34,
 'b': 35,
 'bad': 36,
 'barns': 37,
 'barrenly': 38,
 'bat': 39,
 'be': 40,
 'beam': 41,
 'bearing': 42,
 'beat': 43,
 'beaten': 44,
 'because': 45,
 'become': 46,
 'beds': 47,
 'been': 48,
 'beguiled': 49,
 'behold': 50,
 'bitter': 51,
 'black': 52,
 'blankness': 53,
 'blast': 54,
 'bleeding': 55,
 'blending': 56,
 'blighting': 57,
 'blind': 58,
 'blindness': 59,
 'blood': 60,
 'blow': 61,
 'body': 62,
 'bound': 63,
 'bow': 64,
 'brand': 65,
 'breast': 66,
 'briareus': 67,
 'broke': 68,
 'built': 69,
 'burned': 70,
 'burning': 71,
 'but

### Create Blocks

In [12]:
# Create x (input): Split text into blocks, where each block has the same amount of words
# Create y (targets): For each x input, the y is the word that comes next
# The model should learn to predict y from the input x

block_size = 2
step = 1

x = []
y = []

for i in range(0, len(word_tokens) - block_size, step):
    x.append(word_tokens[i: i+block_size])
    y.append(word_tokens[i + block_size])

In [13]:
# Inspect x
x[:5]

[['and', 'that'],
 ['that', 'is'],
 ['is', 'why'],
 ['why', 'the'],
 ['the', 'lonesome']]

In [14]:
# Check number of blocks
len(x)

1111

### Create One-Hot Encoding

In [15]:
# Create one-hot encoding of x
x_encoded = []

for x_arr in x:
    x_ints = [word_indices[item] for item in x_arr]
    
    x_row = []
    for item in x_ints:
        x_vector = np.zeros(len(unique_words))
        x_vector[item] = 1
        x_row.append(x_vector)
        
    x_encoded.append(x_row)
    
x_encoded = np.array(x_encoded)

In [16]:
# Inspect y
y[:5]

['is', 'why', 'the', 'lonesome', 'day']

In [17]:
# Convert each word in y into their corresponding indices
y_ints = [word_indices[item] for item in y]
y_ints[:5]

[263, 588, 506, 291, 110]

In [18]:
# Create one-hot encoding of y
y_encoded = []

for item in y_ints:
    y_vector = np.zeros(len(unique_words))
    y_vector[item] = 1
    y_encoded.append(y_vector)

y_encoded = np.array(y_encoded)

### Model Configuration

In [19]:
class TextGenerator(nn.Module):
    def __init__(self, input_dim, output_dim, block_size):
        super().__init__()

        self.embeddings = nn.Linear(input_dim, 2000)
        self.hidden = nn.Linear(2000, 1200)
        self.output = nn.Linear(1200, output_dim)
        
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.sigmoid(self.embeddings(x))
        x = self.tanh(self.hidden(x))
        x = self.softmax(self.output(x))

        return x

In [20]:
# Get size of input for training the model
input_size = x_encoded[0].ravel().shape[0]
print(x_encoded[0].ravel().shape[0])

1230


In [21]:
# Allocate tensors to the device used for computation
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Performing torch operations on {device} device")

# Create x and y PyTorch tensors
x = torch.tensor(x_encoded).float().to(device)
y = torch.tensor(y_encoded).float().to(device)

Performing torch operations on cpu device


In [22]:
# Instantiate model
model = TextGenerator(input_size, len(unique_words), block_size).to(device)

# Print model configuration
model

TextGenerator(
  (embeddings): Linear(in_features=1230, out_features=2000, bias=True)
  (hidden): Linear(in_features=2000, out_features=1200, bias=True)
  (output): Linear(in_features=1200, out_features=615, bias=True)
  (tanh): Tanh()
  (sigmoid): Sigmoid()
  (softmax): Softmax(dim=1)
)

In [23]:
# Define model optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.00001)
criterion = nn.CrossEntropyLoss()

### Create Dataset & DataLoader

In [24]:
# Create custom Dataset class
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
        self.n_samples = len(x)
    
    def __getitem__(self, index):
        return self.x[index].ravel(), self.y[index]
    
    def __len__(self):
        return self.n_samples

In [25]:
# Create training dataset using custom Dataset class
training_ds = CustomDataset(x, y)

In [26]:
# Load training dataset into DataLoader
from torch.utils.data import DataLoader

batch_size = 10

train_loader = DataLoader(
    training_ds,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)

### Train Model

In [27]:
# Define function to train model
def train_fn(loader, model, optimizer, loss_fn, device):
    loop = tqdm(loader)

    ave_loss = 0
    count = 0 
    for batch_idx, (data, targets) in enumerate(loop):
        data = data.to(device=device)
        targets = targets.to(device=device)
        
        # Forward
        predictions = model.forward(data)
        loss = loss_fn(predictions, targets)
        
        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Update tqdm loading bar
        loop.set_postfix(loss=loss.item())

        count += 1
        ave_loss += loss.item()
    
    ave_loss = ave_loss / count

    return ave_loss

In [28]:
# Train model
epochs = 2 # TODO: CHANGE TO 300 ON FINAL DATA; CURRENTLY 2 FOR TESTING PURPOSES
average_losses = []

for epoch in range(epochs):
    print("Epoch: {}".format(epoch))
    ave_loss = train_fn(train_loader, model, optimizer, criterion, device)
    
    print("Ave Loss: {}".format(ave_loss))
    average_losses.append(ave_loss)

Epoch: 0


100%|█████████████████████████████████████████████████████████████████████| 112/112 [00:06<00:00, 17.82it/s, loss=6.42]


Ave Loss: 6.400432322706495
Epoch: 1


100%|█████████████████████████████████████████████████████████████████████| 112/112 [00:06<00:00, 17.50it/s, loss=6.42]

Ave Loss: 6.363841950893402





### Generate Text

In [29]:
def generate_text(model, seed_text, num_words):
    device = 'cpu'
    model.eval()
    
    # Convert seed_text to input tensor
    seed_encoded = []
    for word in seed_text.split():
        word_index = word_indices[word]
        word_encoded = np.zeros(len(unique_words))
        word_encoded[word_index] = 1
        seed_encoded.append(word_encoded)
    seed_encoded = np.array(seed_encoded)
    seed_encoded = np.expand_dims(seed_encoded, axis=0)
    seed_tensor = torch.tensor(seed_encoded).float().to(device)

    # Generate text
    generated_text = seed_text
    for i in range(num_words):
        predictions = model(seed_tensor)
        predicted_index = torch.argmax(predictions, dim=1).item()
        predicted_word = indices_words[predicted_index]
        generated_text += ' ' + predicted_word
        
        # Update seed tensor with predicted word
        predicted_encoded = np.zeros(len(unique_words))
        predicted_encoded[predicted_index] = 1
        predicted_encoded = np.expand_dims(predicted_encoded, axis=0)
        seed_tensor = torch.cat((seed_tensor[:, 1:, :], torch.tensor(predicted_encoded).float().to(device)), axis=1)

    return generated_text

In [30]:
# Generate text sample from model output
word_count = 100
text = []
paragraph_count = 5

# Length of phrase should be same as block_size
word1, word2 = "\n", "\n"

for p in range(paragraph_count):
    text.append([])
    
    for i in range(word_count):
        phrase = [word1, word2]
        x_ints = [word_indices[item] for item in phrase]
        x_vector = []

        for item in x_ints:
            x_item = np.zeros(len(unique_words))
            x_item[item] = 1
            x_vector.append(x_item)

        initial_input = torch.tensor([np.array([x_vector]).ravel()]).float()

        output = model(initial_input)[0].detach().cpu().numpy()

        # Workaround to fix occasional sum(pvals[:-1]) > 1.0  bug from implicit casting in np.random.multinomial 
        output = output.astype(float)
        output /= output.sum()

        index = np.where(np.random.multinomial(1, output) == 1)[0][0]
        word3 = indices_words[index]
        text[p].append(word3)

        # Use generated word from this run as seed for next run
        word1, word2 = word2, word3

In [31]:
for p in range(paragraph_count):
    print(f"Generated Paragraph {p}:")
    print(' '.join(text[p]))

Generated Paragraph 0:
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the moods the
Generated Paragraph 1:
the the the the the the the the the the the the the the the the the the the the the the dwell the the the the and the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Generated Paragraph 2:
the death's the the the the the the the the the the the the the the the the the the the the the the the the the the the the the

### Create Feature Vectors from Input Data

In [32]:
sentences = []

# Tokenize by line
for index, row in negative.iterrows():
    tokenized_row = row['Text'].split(' ')
    
    # Preprocess using the same settings as preprocessing done before training model
    tokenized_row = regexp_tokenize(' '.join(tokenized_row), pattern=r'[^\S\r]+|[\.,;!?()--_"]', gaps=True)
    sentences.append(tokenized_row)
    
sentences

[['and', 'that', 'is', 'why', 'the', 'lonesome', 'day'],
 ['and', 'so', 'on', 'then', 'a', 'worthless', 'gaud', 'or', 'two'],
 ['sounded', "o'er", 'earth', 'and', 'sea', 'its', 'blast', 'of', 'war'],
 ['want', 'and', 'woe', 'which', 'torture', 'us'],
 ['an', 'echo', 'returned', 'on', 'the', 'cold', 'gray', 'morn'],
 ['while', 'i', 'i', 'built', 'up', 'follies', 'like', 'a', 'wall'],
 ['ah', 'what', 'a', 'pang', 'of', 'aching', 'sharp', 'surprise'],
 ['and', 'the', 'old', 'swallow', 'haunted', 'barns'],
 ['the', 'which', 'she', 'bearing', 'home', 'it', 'burned', 'her', 'nest'],
 ['the', 'crown', 'of', 'sorrow', 'on', 'their', 'heads', 'their', 'loss'],
 ['i', 'lay', 'and', 'watched', 'the', 'lonely', 'gloom'],
 ['a', 'sceptremonstrous', 'winged', 'intolerable'],
 ['while', 'the', 'rude', 'winds', 'blow', 'off', 'each', 'shadowy', 'crown'],
 ['but', 'o', 'nevermore', 'can', 'we', 'prison', 'him', 'tight'],
 ['may', 'meditate', 'a', 'whole', "youth's", 'loss'],
 ['when', 'thee', 'the', 'e

In [33]:
vector_size = 100
w2v_model = Word2Vec(sentences, vector_size=vector_size)

In [34]:
vocab = w2v_model.wv.index_to_key
vocab_length = len(vocab)

# Note: Low vocab size is because Word2Vec model isn't familiar with most of the words in our dataset
# From https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
print(f'Vocabulary Size: {format(vocab_length)}')

Vocabulary Size: 25


In [35]:
vocab

['the',
 'and',
 'of',
 'a',
 'to',
 'in',
 'his',
 'with',
 'that',
 'but',
 'on',
 'it',
 'is',
 'by',
 'their',
 'i',
 'which',
 'when',
 'as',
 'thy',
 'from',
 'for',
 'are',
 'eyes',
 'he']

In [37]:
w2v_model.wv.similarity('he', 'his')

-0.048020355

In [38]:
def sentence_to_vector_averaging(sentence, model):
    embeddings = [model.wv[word] for word in sentence if word in vocab]
    
    if len(embeddings) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(embeddings, axis=0)

In [39]:
features = []

for sentence in sentences:
    features.append(sentence_to_vector_averaging(sentence, w2v_model))

In [40]:
columns = [f'x{i}' for i in range(w2v_model.vector_size)]

df_text = pd.DataFrame(features, columns=columns)

In [41]:
df_text['y'] = [float(x) for x in negative['Sentiment']]
df_text

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x91,x92,x93,x94,x95,x96,x97,x98,x99,y
0,-0.005187,0.001937,0.006015,0.003685,0.002358,-0.001292,0.004020,0.005023,-0.001249,-0.002851,...,-0.000705,0.003824,-0.004248,0.005150,0.004191,0.001900,0.000904,0.001022,0.003322,-1.0
1,-0.003266,0.003844,0.004352,-0.001904,0.001380,-0.005685,-0.000023,0.006106,-0.000243,-0.007390,...,-0.000379,0.000116,-0.001590,0.000530,0.000797,0.006901,-0.005118,-0.002738,-0.003019,-1.0
2,-0.004293,0.003431,-0.000727,0.002193,0.007601,0.000509,-0.001241,0.004525,-0.005658,-0.000064,...,0.002059,0.005807,-0.006058,0.002748,0.004122,0.005618,-0.000969,-0.000661,-0.001191,-1.0
3,-0.003681,-0.003024,0.004939,0.002612,0.006926,-0.002247,-0.000984,0.007034,-0.000704,-0.003113,...,0.001620,0.005519,-0.000442,-0.004109,0.005016,-0.001963,-0.002506,-0.008639,0.004395,-1.0
4,0.003262,-0.000633,0.006558,-0.000230,-0.008644,-0.006919,0.001255,0.007071,-0.004451,-0.006085,...,0.000844,0.001158,0.002972,0.002057,0.002585,0.000266,-0.006272,0.002355,-0.000836,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,-0.004648,0.001231,0.002151,-0.000129,-0.009341,-0.004318,0.005480,0.006452,-0.005813,-0.005368,...,0.004563,-0.002304,-0.001423,0.009843,0.005124,-0.005257,-0.007879,0.001926,-0.000112,-1.0
151,-0.009499,0.009568,-0.007775,-0.002644,-0.004905,-0.004970,-0.008018,-0.007785,-0.004550,-0.001279,...,0.007232,0.001728,-0.001341,-0.005889,-0.004546,0.008649,-0.003129,-0.006338,0.009869,-1.0
152,0.004094,-0.000664,-0.003886,-0.000176,0.003750,0.004204,0.001244,0.001401,-0.005821,0.002305,...,0.003069,0.002882,-0.002296,0.003684,0.003156,0.007034,-0.002630,0.003268,-0.000688,-1.0
153,0.008161,-0.004431,0.008996,0.008260,-0.004433,0.000286,0.004288,-0.003907,-0.005568,-0.006529,...,-0.004011,-0.008236,0.006273,-0.001939,-0.000666,-0.001764,-0.004539,0.004059,-0.004265,-1.0


In [42]:
# Save feature vectors as csv
df_text.to_csv('negative.csv')

### Create Feature Vectors from Generated Data

In [43]:
vector_size = 100
w2v_model = Word2Vec(text, vector_size=vector_size)

In [44]:
vocab = w2v_model.wv.index_to_key
vocab_length = len(vocab)

# Note: Low vocab size is because Word2Vec model isn't familiar with most of the words in our dataset
# From https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
print(f'Vocabulary Size: {format(vocab_length)}')

Vocabulary Size: 1


In [45]:
vocab

['the']

In [46]:
vectors = [w2v_model.wv[word] for word in vocab]
len(vectors[0])

100

In [47]:
def sentence_to_vector_averaging(sentence, model):
    embeddings = [model.wv[word] for word in sentence if word in vocab]
    
    if len(embeddings) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(embeddings, axis=0)

In [48]:
features = []

for sentence in sentences:
    features.append(sentence_to_vector_averaging(sentence, w2v_model))

In [49]:
columns = [f'x{i}' for i in range(w2v_model.vector_size)]

df_text = pd.DataFrame(features, columns=columns)

In [52]:
df_text['y'] = [-1] * len(df_text)
df_text

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x91,x92,x93,x94,x95,x96,x97,x98,x99,y
0,-0.000658,0.00029,0.00626,0.011051,-0.011411,-0.00873,0.007923,0.011006,-0.006152,-0.004616,...,0.000233,0.004261,0.000267,0.011799,0.006207,-0.010938,-0.008637,0.001106,0.007841,-1
1,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1
2,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1
3,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1
4,-0.000658,0.00029,0.00626,0.011051,-0.011411,-0.00873,0.007923,0.011006,-0.006152,-0.004616,...,0.000233,0.004261,0.000267,0.011799,0.006207,-0.010938,-0.008637,0.001106,0.007841,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,-0.000658,0.00029,0.00626,0.011051,-0.011411,-0.00873,0.007923,0.011006,-0.006152,-0.004616,...,0.000233,0.004261,0.000267,0.011799,0.006207,-0.010938,-0.008637,0.001106,0.007841,-1
151,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1
152,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1
153,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1


In [53]:
# Save feature vectors as csv
df_text.to_csv('negative_generated.csv')