<a href="https://colab.research.google.com/github/HDWilliams/PositiveProse/blob/master/PositiveProseV1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SyntaxError: ignored

In [0]:
# Web scraper for PoetryFoundation.org
# Instructions 
# 1. Go to https://www.poetryfoundation.org/poems/browse
# 2. Choose your Filters
# 3. Copy the url. That will be your parameter for the function

# Here is an example of me scraping poems based
# off a TOPIC filter of "Love" and a SCHOOL/PERIOD filter of 1951-Present

# You will modify these in order to control the parameters of the script!
# BROWSE_FIRST_URL: the browse-page URL (filters included) copied from the site;
# parse_ids() rewrites it into the matching ajax request URL.
BROWSE_FIRST_URL = "https://www.poetryfoundation.org/poems/browse#page=1&sort_by=recently_added&topics=20&school-period=1951-present"
# JUST_TESTING: when True, parse_ids() limits the scrape to a single page of results.
JUST_TESTING = False # set to false once you actually decide to parse all the results

from google.colab import files
import requests
import json

from bs4 import BeautifulSoup
from math import ceil



# Shared HTTP session used by every request in this cell. A browser-like
# User-Agent is sent to get around bot blocking technology.
s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'

def main():
  """Run the scraper against the module-level BROWSE_FIRST_URL."""
  start_url = BROWSE_FIRST_URL
  scrape_poems(start_url)
  

def scrape_poems(url):
  """Scrape every poem matching a browse URL and save the result as JSON.

  Args:
    url: browse-page URL (with filters) that is handed to parse_ids().

  Side effects:
    Prints a status line and writes the dict returned by parse_poems()
    to "poems.json" in the current working directory.
  """
  print("Scraping Poems...")

  # Fallback id for manual testing; it only takes effect if the
  # parse_ids() call below is commented out.
  ids = [149702]

  # collect the ids first, then download and parse each poem
  ids = parse_ids(url)
  scraped = parse_poems(ids)

  # serialise to JSON text and save it off
  serialised = json.dumps(scraped)
  with open("poems.json", "w") as outfile:
    outfile.write(serialised)
    

# get all the IDs for all the poems of the selected filters
def parse_ids(url):
  """Collect the ids of all poems matching the filters encoded in a browse URL.

  The browse URL is converted into the site's ajax URL, e.g.
  https://www.poetryfoundation.org/ajax/poems?page=1&sort_by=recently_added&topics=20&school-period=1951-present
  and every results page is requested in turn.

  Args:
    url: browse-page URL copied from the site. Its `page=` parameter (if any)
      is ignored; scraping always starts at page 1.

  Returns:
    A list with the "id" of every entry on every requested page.

  Raises:
    requests.HTTPError: if the server answers a page request with an error status.
  """
  import re  # local import: this cell does not import `re` at the top

  results_per_page = 20  # there are 20 results per page

  request_url = url.replace("poems/browse#", "ajax/poems?")
  # Matches the `page` query parameter only (not e.g. `per_page=`).
  page_param = re.compile(r"(?<=[?&])page=\d*")

  def fetch_page(page_num):
    """Return the decoded JSON of one results page."""
    new_page_str = "page=" + str(page_num)
    if page_param.search(request_url):
      # Set the page explicitly instead of assuming which page the URL is on.
      page_url = page_param.sub(new_page_str, request_url, count=1)
    else:
      # The URL had no page parameter at all: append one.
      if request_url.endswith(("?", "&")):
        separator = ""
      elif "?" in request_url:
        separator = "&"
      else:
        separator = "?"
      page_url = request_url + separator + new_page_str
    response = s.get(page_url)
    response.raise_for_status()
    return response.json()

  # Page 1 provides both the total count and the first batch of entries.
  first_page = fetch_page(1)
  total_poems = first_page["TotalResults"]
  num_pages = ceil(total_poems / results_per_page)

  # keep max pages for testing to reduce load: only one page is fetched
  if JUST_TESTING:
    num_pages = 1

  id_arr = []
  for page_num in range(1, num_pages + 1):
    page_json = first_page if page_num == 1 else fetch_page(page_num)
    id_arr.extend(entry["id"] for entry in page_json["Entries"])

  # verify that the number of ids is equal to what we were promised
  print("Obtained " + str(len(id_arr)) + " poem ids of total " + str(total_poems))
  return id_arr

# access every poem and scrape the poetry content
def parse_poems(id_arr):
  """Download each poem page and extract its title and text.

  Args:
    id_arr: iterable of poem ids; each is appended to the site's
      /poems/ base URL to form the poem page URL.

  Returns:
    A dict mapping the stripped title text to the stripped poem text.
    Poems that fail to download or parse are skipped with a printed message.
    NOTE: the dict is keyed by title, so two poems with the same title
    overwrite each other.
  """
  base_url = "https://www.poetryfoundation.org/poems/"
  poems = {}
  print("Parsing poem ")
  for count, poem_id in enumerate(id_arr, start=1):
    print(count)
    poem_url = base_url + str(poem_id)

    # We don't care why an individual poem failed, so just skip failures.
    # `except Exception` (rather than a bare except) keeps Ctrl-C /
    # KeyboardInterrupt able to stop a long scrape.
    try:
      response = s.get(poem_url, allow_redirects=True)
      soup = BeautifulSoup(response.text, "html.parser")
      title_box = soup.find("div", attrs={"class": "c-feature-hd"})
      title = title_box.text.strip()

      poem_box = soup.find("div", attrs={"class": "o-poem"})
      poems[title] = poem_box.text.strip()
    except Exception:
      print("Failure on poem " + str(poem_id) + ". Continuing....")

  return poems
    
# Run the scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
  main()

In [0]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/gdrive')

# Write a small test file to Drive to confirm the mount is writable.
with open('/content/gdrive/My Drive/foo.txt', 'w') as drive_file:
  drive_file.write('Hello Google Drive!')
!cat /content/gdrive/My\ Drive/foo.txt

#Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code
#Enter your authorization code:
#··········
#Mounted at /content/gdrive

In [0]:
# POEM CLEANUP AND OTHER PRE-PROCESSING
# Reads poems.json, strips unwanted characters from every title and poem,
# writes the result to compiledPoems.json and offers it as a Colab download.

import json  # imported here so the cell also runs without the scraper cell
import re
from google.colab import files

# Raw string so `\s` / `\w` are not treated as (invalid) string escapes.
# This pattern is defined but not used by the code below.
pattern = re.compile(r'([^\s\w]|_)+')

# Compiled once: everything except ASCII letters, digits, double quotes and
# spaces is removed.
# NOTE(review): newlines are removed as well, so the last word of one line is
# joined to the first word of the next - confirm this is intended.
strip_chars = re.compile('[^A-Za-z0-9" "]+')

# clean Poems of weird characters
filename = "poems.json"
newFilename = "compiledPoems.json"
new_data = {}

with open(filename, "r") as infile:
  data = json.load(infile)

for title, poem_string in data.items():
  strippedPoem = strip_chars.sub('', poem_string)
  strippedTitle = strip_chars.sub('', title)
  # Titles that become identical after stripping overwrite each other.
  new_data[strippedTitle] = strippedPoem

with open(newFilename, "w") as outfile:
  json.dump(new_data, outfile)

files.download(newFilename)

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CBOW(nn.Module):
    """Continuous bag-of-words model for word2vec.

    Parameters:
        vocab_size: number of defined words in the vocab
        embedding_dim: desired embedded vector dimension
        context_size: number of context words used (accepted for interface
            compatibility; the model itself does not read it)
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocab, shape (1, vocab_size).

        The embeddings of `inputs` are averaged over dim 0 and reshaped to a
        single row before the linear layer.
        """
        embeds = torch.mean(self.embeddings(inputs), dim=0).view((1, -1))
        out = self.linear(embeds)
        # `out` is (1, vocab_size); normalise over the vocab axis explicitly
        # instead of relying on the deprecated implicit-dim behaviour.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


class SkipGram(nn.Module):
    """Skip-gram model for word2vec.

    Parameters:
        vocab_size: number of defined words in the vocab
        embedding_dim: desired embedded vector dimension
    """

    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocab, shape (1, vocab_size).

        The embedding of `inputs` is flattened into a single row, so the
        linear layer only accepts it when `inputs` holds one word index.
        """
        embeds = self.embeddings(inputs).view((1, -1))
        out = self.linear(embeds)
        # `out` is (1, vocab_size); normalise over the vocab axis explicitly
        # instead of relying on the deprecated implicit-dim behaviour.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


In [0]:
### example of running Skipgram model embeddings
# NOTE(review): this cell uses names that are not defined in the visible cells
# (vocab, word_to_ix, training_data, EMBEDDING_DIM, NUM_EPOCHS, losses,
# progressbar, np, plt) - they must be provided before running it.
loss_function = nn.NLLLoss()
model = SkipGram(len(vocab), EMBEDDING_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.001)

print("Starting training")
for epoch in range(NUM_EPOCHS):
    # accumulated loss over all (context, target) pairs of this epoch
    total_loss = torch.Tensor([0])
    print("Beginning epoch %d" % epoch)
    progress_bar = progressbar.ProgressBar()
    for context, target in progress_bar(training_data):
        context_var = autograd.Variable(torch.LongTensor(context))
        model.zero_grad()
        log_probs = model(context_var)
        loss = loss_function(log_probs, autograd.Variable(
            torch.LongTensor([target])))
        loss.backward()
        optimizer.step()
        total_loss += loss.data
    print("Epoch %d Loss: %.5f" % (epoch, total_loss[0]))
    losses.append(total_loss)

# Visualize embeddings (only possible when they are two-dimensional)
if EMBEDDING_DIM == 2:
    indices = np.random.choice(np.arange(len(vocab)), size=10, replace=False)
    # Build the word list once instead of on every loop iteration.
    vocab_words = list(vocab.keys())
    for ind in indices:
        word = vocab_words[ind]
        # Named `word_var` rather than `input` so the builtin input() is not
        # overwritten for the rest of the notebook.
        word_var = autograd.Variable(torch.LongTensor([word_to_ix[word]]))
        vec = model.embeddings(word_var).data[0]
        x, y = vec[0], vec[1]
        plt.scatter(x, y)
        plt.annotate(word, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.savefig("w2v.png")

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Long Short-Term Memory (LSTM) RNN model.
# A stack of LSTM layers (depth set by num_layers) feeds a fully connected linear output layer.
# Dropout and a bidirectional LSTM are both optional.

class LSTM(nn.Module):
  """LSTM stack followed by a fully connected output layer.

  Parameters:
    input_dim: size of each input vector fed to the LSTM
    hidden_dim: size of the LSTM hidden state
    output_dim: size of the linear output layer
    num_layers: number of stacked LSTM layers
    batch_size: batch size used for the initial hidden state and in forward()
    W2V_model: callable applied to the raw input in forward().
      NOTE(review): it is assumed to return a (seq_len, batch_size, input_dim)
      tensor, as nn.LSTM expects by default - confirm against the caller.
    droprate: dropout between LSTM layers (only applied when num_layers > 1)
    nonlin: kept for backward compatibility and ignored; nn.LSTM has no
      `nonlinearity` option (that keyword belongs to nn.RNN)
    bidirect: if True, use a bidirectional LSTM
  """

  def __init__ (self, input_dim, hidden_dim, output_dim, num_layers, batch_size, W2V_model, droprate = .2, nonlin='relu', bidirect=False):
    super(LSTM, self).__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.num_layers = num_layers
    self.batch_size = batch_size
    self.W2V = W2V_model
    # a bidirectional LSTM doubles both the state count and the output width
    self.num_directions = 2 if bidirect else 1

    # define LSTM module; dropout is only meaningful between stacked layers
    self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers,
                        dropout=droprate if num_layers > 1 else 0,
                        bidirectional=bidirect)

    # define fully connected output layer
    self.linear = nn.Linear(self.hidden_dim * self.num_directions, self.output_dim)

  def init_hidden_state(self):
    """Return a random (h_0, c_0) pair sized for this model."""
    state_shape = (self.num_layers * self.num_directions, self.batch_size, self.hidden_dim)
    return (torch.rand(*state_shape), torch.rand(*state_shape))

  def forward(self, init_input, hidden):
    """Run one forward pass and return the flattened prediction.

    The LSTM output of the last time step is passed to the linear layer; the
    resulting (batch_size, output_dim) tensor is returned flattened to 1-D.
    The final (h_n, c_n) state is stored on self.hidden.
    """
    vec_words = self.W2V(init_input)
    lstm_output, self.hidden = self.lstm(vec_words, hidden)
    y_pred = self.linear(lstm_output[-1].view(self.batch_size, -1))
    return y_pred.view(-1)
      
