In [1]:
from bs4 import BeautifulSoup
import requests
from db import PythonProblems
from datetime import datetime

### Creating and connecting to the database

In [2]:
# Open the project's database wrapper. The two arguments are passed through
# unchanged; presumably the first is the SQLite file and the second an SQL
# script used to initialise the tables -- confirm against db.PythonProblems.
DB_FILE = 'python.sqlite'
INIT_SQL_FILE = 'init_tables.sql'
db = PythonProblems(DB_FILE, INIT_SQL_FILE)

### Getting the page to be crawled

In [3]:
# Download the site's index page and parse it. `url` is also used later as the
# prefix for the relative links found on the page.
url = "http://www.practicepython.org/"
# Without a timeout requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail right here on an HTTP error instead of parsing an error page and
# hitting a confusing IndexError in the link-extraction cell.
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html5lib')

### Getting problem and solution links

In [4]:
def extract_links(list_items, base_url):
    """Build one URL per list item from the first anchor it contains.

    Args:
        list_items: iterable of BeautifulSoup tags (here: ``<li>`` elements).
        base_url: string prepended verbatim to each anchor's ``href``.

    Returns:
        A list of ``base_url + href`` strings, in document order. Items that
        contain no ``<a>`` tag, or whose first ``<a>`` has no ``href``
        attribute, are skipped.
    """
    links = []
    for item in list_items:
        # find() returns the first match or None, so no IndexError handling
        # is needed for items without a link.
        anchor = item.find("a")
        if anchor is None:
            continue
        partial_link = anchor.get('href')
        if partial_link is None:
            # An anchor without href would make the concatenation below fail.
            continue
        links.append(base_url + partial_link)
    return links


# Section headers of the index page. The code below relies on the first
# "midheader" heading belonging to the problems list and the second one to
# the solutions list.
listas = soup.find_all("h2", class_="midheader")

## Getting problems links
problems = listas[0].parent.find_all("li")
problems_links = extract_links(problems, url)

## Getting solutions links
solutions = listas[1].parent.find_all("li")
solutions_links = extract_links(solutions, url)

### Problems

In [5]:
%%time
problems = []
for link in problems_links:
    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html5lib')
    # Get title and difficulty
    title = soup.find_all('h1', {'class': 'pagetitle'})[0].text.strip()
    difficulty = len(soup.find_all('img', {'class': 'chili'}))
    
    # Get problem
    header = soup.find_all('h2')
    problem_text = ""
    for nextNode in header:
        if 'Solution' in nextNode.text:
            nextNode = nextNode.findNext()
            while nextNode.name != 'h2':
                try:
                    tag_name = nextNode.name
                except AttributeError:
                    tag_name = ""
                if tag_name in ["p", "ol", "ul", "figure"]:
                    problem_text += nextNode.text + "\n"
                nextNode = nextNode.findNext()
    problems.append({"title": title, "difficulty": difficulty, "link": link,
                     "content": problem_text, "retrieved_date": datetime.now(),
                     "crawler": "PracticePython"})

CPU times: user 1.83 s, sys: 28.4 ms, total: 1.85 s
Wall time: 36.2 s


### Solutions

In [6]:
%%time
solutions = []
idx = 0
for link in solutions_links:
    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html5lib')
    scripts = soup.find_all('script')
    for item in scripts:
        if item.get('src') and 'github' in item.get('src'):
            gist_link = item.get('src')
            response = requests.get(gist_link)
            soup = BeautifulSoup(response.text, 'html5lib')
            try:
                raw_link = soup.find_all('a')[0].get('href')[2:-2]
                solution = requests.get(raw_link)
                solutions.append({"content": solution.text, "link": link,
                                  "retrieved_date": datetime.now(), "ignore": False,
                                 "idx": idx})
            except IndexError:
                solutions.append({"ignore": True, "link": link,
                                  "retrieved_date": datetime.now(), "idx": idx})
    idx += 1

CPU times: user 8.49 s, sys: 273 ms, total: 8.76 s
Wall time: 2min 22s


### Insert rows

In [7]:
# Hand the scraped records to the database wrapper and report the outcome.
# try/finally guarantees the connection is closed even if inserting or
# populating raises.
try:
    db.insert_rows(problems, solutions)
    # Counters, in the order used by the message below: problems inserted,
    # solutions inserted, problems repeated, solutions repeated.
    tp, ts, rp, rs = db.populate()
finally:
    db.close_connection()
print("Got %d problems inserted, %d solutions inserted, %d problems repeated and %d solutions repeated" % (
    tp, ts, rp, rs))

Got 35 problems inserted, 69 solutions inserted, 0 problems repeated and 1 solutions repeated
