In [3]:
from bs4 import BeautifulSoup as bs
from splinter import Browser
import time
import datetime
import pandas as pd
import lxml
import re
from IPython.display import display, Markdown

In [4]:
columns = ["id", "title", "subtitle", "content", "html_content", "release_date", "solved_by_count", "difficulty"]

def getFullProblemSet():
    curr_problem = 1
    url = "https://projecteuler.net/problem=1"

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    browser.visit(url)

    #add a delay so page fully loads
    time.sleep(4)

    problems = []
    df = pd.DataFrame(columns=columns)

    while(browser.url.find("archive") == -1):
        print(curr_problem)
        current_problem = browser.html
        
        try:
            news_soup = bs(current_problem, "html.parser")

            title = news_soup.find_all("h2")[0].get_text()
            subtitle = news_soup.find_all("h3")[0].get_text()
            content = news_soup.find_all("div", class_="problem_content")[0].get_text()
            tooltip = news_soup.find_all(class_="tooltiptext_right")[0].get_text()

            # split tooltip text into separate features
            # datetime (may need to set UTC as timezone? Or a different timezone?)
            datetime_str = re.findall("Published on (.*[ap]m);", tooltip)
    #         print(datetime_str)
            datetime_str = re.sub("(?<=\d)[a-z][a-z]", "", datetime_str[0])
            datetime_str = re.sub("(?<= )(\d)(?= )", "0\g<0>", datetime_str)
            datetime_str = datetime_str.replace("pm", "PM")
            datetime_str = datetime_str.replace("am", "AM")
    #         print(datetime_str)
            datetime_val = datetime.datetime.strptime(datetime_str, "%A, %d %B %Y, %I:%M %p")

            # Solved By (count)
            solved_by_count = re.findall("Solved by (\d+)", tooltip)

            # Difficulty Rating
            difficulty = re.findall("(\d+)%", tooltip)
            if(len(difficulty) == 0):
                difficulty = [float('nan')]
                
            # get problem html content
            html_url = "https://projecteuler.net/minimal={}".format(curr_problem)
            browser.visit(html_url)
            
            time.sleep(1)
            curr_html = bs(browser.html, "html.parser")
            
            html_content = curr_html.find_all("pre")[0].get_text()
            # make img tags use absolute paths
            html_content = re.sub(r'<img src="(?!http)', '<img src="https://projecteuler.net/', html_content)
            # make anchor tags use absolute paths
            html_content = re.sub(r'<a href="(?!http)', '<a href="https://projecteuler.net/', html_content)
            print(html_content)

            curr_row = pd.Series({"id":curr_problem, "title":title, "subtitle":subtitle, "content":content, "html_content":html_content, "release_date":datetime_val.isoformat(), "solved_by_count":int(solved_by_count[0]), "difficulty":int(difficulty[0])})
            df = pd.concat([df, curr_row.to_frame().T], ignore_index=True)
#         print(curr_row)
        except:
            pass
        curr_problem += 1
        
        url = "https://projecteuler.net/problem={}".format(curr_problem)
        browser.visit(url)
        time.sleep(3)

    # song to know when scraping done
    browser.visit("https://www.youtube.com/watch?v=kln_bIndDJg")
    
    return df


In [5]:


test_df = getFullProblemSet()
test_df

312
<p>- A <b>Sierpiński graph</b> of order-1 (<var>S</var><sub>1</sub>) is an equilateral triangle.<br />
- <var>S</var><sub><var>n</var>+1</sub> is obtained from <var>S</var><sub><var>n</var></sub> by positioning three copies of <var>S</var><sub><var>n</var></sub> so that every pair of copies has one common corner.
</p>

<div align="center"><img src="https://projecteuler.net/project/images/p312_sierpinskyAt.gif" class="dark_img" alt="p312_sierpinskyAt.gif" /></div>

<p>Let C(<var>n</var>) be the number of cycles that pass exactly once through all the vertices of <var>S</var><sub><var>n</var></sub>.<br />
For example, C(3) = 8 because eight such cycles can be drawn on <var>S</var><sub>3</sub>, as shown below:
</p>

<div align="center"><img src="https://projecteuler.net/project/images/p312_sierpinsky8t.gif" class="dark_img" alt="p312_sierpinsky8t.gif" /></div>

<p>It can also be verified that :<br />
C(1) = C(2) = 1<br />
C(5) = 71328803586048<br />
C(10 000) mod 10<sup>8</sup> = 37652

Unnamed: 0,id,title,subtitle,content,html_content,release_date,solved_by_count,difficulty
0,312,Multiples of 3 or 5,Problem 1,\nIf we list all the natural numbers below 10 ...,<p>- A <b>Sierpiński graph</b> of order-1 (<va...,2001-10-05T18:00:00,970035,5
1,313,Sliding game,Problem 313,\nIn a sliding game a counter may slide horizo...,<p>In a sliding game a counter may slide horiz...,2010-12-05T04:00:00,1646,30
2,314,The Mouse on the Moon,Problem 314,"\n\nThe moon has been opened up, and land can ...","<p>\nThe moon has been opened up, and land can...",2010-12-12T07:00:00,521,80
3,315,Digital root clocks,Problem 315,\n\nSam and Max are asked to transform two dig...,"<p></p><div align=""center""><img src=""https://p...",2010-12-19T10:00:00,3477,20


In [11]:
display(Markdown(test_df["html_content"][2]))

<p>
The moon has been opened up, and land can be obtained for free, but there is a catch. You have to build a wall around the land that you stake out, and building a wall on the moon is expensive. Every country has been allotted a 500 m by 500 m square area, but they will possess only that area which they wall in. 251001 posts have been placed in a rectangular grid with 1 meter spacing. The wall must be a closed series of straight lines, each line running from post to post.
</p>
<p>
The bigger countries of course have built a 2000 m wall enclosing the entire 250 000 m<sup>2</sup> area. The <a href="http://en.wikipedia.org/wiki/Grand_Fenwick">Duchy of Grand Fenwick</a>, has a tighter budget, and has asked you (their Royal Programmer) to compute what shape would get best maximum enclosed-area/wall-length ratio.
</p>
<p>
You have done some preliminary calculations on a sheet of paper.
For a 2000 meter wall enclosing the 250 000 m<sup>2</sup> area the
enclosed-area/wall-length ratio is 125.<br />
Although not allowed , but to get an idea if this is anything better:  if you place a circle inside the square area touching the four sides the area will be equal to π*250<sup>2</sup> m<sup>2</sup> and the perimeter will be π*500 m, so the enclosed-area/wall-length ratio will also be 125.
</p>
<p>
However, if you cut off from the square four triangles with sides 75 m, 75 m and 75√2 m the total area becomes 238750 m<sup>2</sup> and the perimeter becomes 1400+300√2 m. So this gives an enclosed-area/wall-length ratio of 130.87, which is significantly better.
</p>
<div align="center"><img src="https://projecteuler.net/project/images/p314_landgrab.gif" class="dark_img" alt="p314_landgrab.gif" /></div>
<p>
Find the maximum enclosed-area/wall-length ratio.<br />
Give your answer rounded to 8 places behind the decimal point in the form abc.defghijk.
</p>





In [10]:
curr_date = datetime.datetime.now().strftime("%m-%d-%y")
test_df.to_csv(f"./{curr_date}.csv", index=False)

# Outline of PE Scraping Process

- navigate to PE problem 1 (https://projecteuler.net/problem=1)
- scrape webpage from browser using beautifulsoup
- add problem information to dataframe
- increment problem number
- repeat navigation, scrape, and data append for each problem until returned to the main archives page (https://projecteuler.net/archives)
- output data to a file (.csv)
- should take ~1 hour for 800 problems (given a 5 second page load time)

## To Do
- ~~Fix problem descriptions that use MathJax (https://docs.mathjax.org/en/v1.1-latest/mathjax.html) notation (highest priority because it directly affects problem description clarity and readability, which directly impacts the dataset's quality as a problem repository)~~
- Add data files for problems that use external files (like Poker Hands)
  - include images as well as problem data files
  - currently uses links, make a separate save folder
- ~~Add links for problems that use external information (like Roman Numerals)~~
- Add a notebook to update the problem set file (only add new problems since older problems do not change often)
- Can also rescrape entire notebook since number of solvers for a problem should increase over time? (may want to add versioning using a date/timestamp, or can just use git)