In [1]:
import os
from urllib.parse import quote
from datetime import datetime
from db import PythonProblems
from bs4 import BeautifulSoup
import requests

### Creating and connecting to database

In [23]:
db = PythonProblems('python.sqlite', 'init_tables.sql')

#### 1. First crawler is for basic exercises, title, problem and solution come directly from the code

In [3]:
path = 'Basics-master'
folders = os.listdir(path)

In [4]:
%%time
folders.sort()
additional = []
problems = []
solutions = []
idx = 0
for folder in folders:
    if folder[0] in ['1', '2', '3']:
        folder_path = os.path.join(path+'/'+folder)
        files = os.listdir(folder_path)
        for file in files:
            file_path = os.path.join(folder_path+'/'+file)
            if os.path.isfile(file_path) and file[-2:] == 'py':
                with open(file_path) as f:
                    content = f.readlines()
                for line, text in enumerate(content):
                    # Line is comment and the problem is written
                    if text.strip() and text.strip()[0] != '#':
                        problem = content[:line]
                        solution = content[line:]
                        break
                problem_text = ' '.join(problem).replace('#','').strip()
                link = quote("https://github.com/pythonschool/Basics/tree/master/%s/%s" % (folder, file),
                             safe="%/:=&?~#+!$,;'@()*[]")
                problems.append({"title": problem[0].replace('#','').strip(), "difficulty": 1, "link": link,
                     "content": problem_text, "retrieved_date": datetime.now(),
                     "crawler": "PythonSchool"})
                solution_text = ' '.join(solution)
                solutions.append({"content": solution_text, "link": link,
                                  "retrieved_date": datetime.now(), "ignore": False,
                                 "idx": idx})
                idx +=1

CPU times: user 771 µs, sys: 0 ns, total: 771 µs
Wall time: 803 µs


In [5]:
db.insert_rows(problems, solutions)
tp, ts, rp, rs = db.populate()
#db.close_connection()
print("Got %d problems inserted, %d solutions inserted, %d problems repeated and %d solutions repeated" % (
    tp, ts, rp, rs))

Got 7 problems inserted, 7 solutions inserted, 0 problems repeated and 0 solutions repeated


### 2. Additional Exercises 

In [69]:
def get_page(url):
    response = requests.get(url, verify=False)
    data = response.text
    soup = BeautifulSoup(data, 'html5lib')
    return soup

def get_problems(soup, url, allowed_titles = ['exercises'], allowed_tags=['li']):
    problems = []
    listas = soup.find_all("h2")

    for nextNode in listas:
        for element in allowed_titles:
            if element in nextNode.text.lower():
                title = nextNode.text
                nextNode = nextNode.findNext()
                while nextNode.name != 'h2' and nextNode.name != 'div':
                    try:
                        tag_name = nextNode.name
                    except AttributeError:
                        tag_name = ""
                    if tag_name in allowed_tags:
                        problem_text = nextNode.text
                        problems.append({"title": title, "link": url,
                             "content": problem_text, "retrieved_date": datetime.now(),
                             "crawler": "PythonSchool"})
                    nextNode = nextNode.findNext()
    
    return problems

def get_solutions(problems, folder_partial):
    solutions = []
    folder = os.path.join(path, folder_partial)
    for idx in range(1, len(problems)+1):
        filename_partial = 'additional_exercise%d.py'%idx
        filename = os.path.join(folder, filename_partial)
        with open(filename) as f:
            content = f.readlines()
            for line, text in enumerate(content):
                # Line is comment. Skip it.
                if text.strip() and text.strip()[0] != '#':
                    solution = content[line:]
                    break
            link = quote("https://github.com/pythonschool/Basics/tree/master/%s/%s" % (folder_partial, 
                                                                                       filename_partial),
                                 safe="%/:=&?~#+!$,;'@()*[]")
            solution_text = ' '.join(solution)
            solutions.append({"content": solution_text, "link": link, "retrieved_date": datetime.now(), "ignore": False,
                              "idx": idx-1})
            
    return solutions

def insert(db, problems, solutions):
    db.insert_rows(problems, solutions)
    tp, ts, rp, rs = db.populate()
    print("Got %d problems inserted, %d solutions inserted, %d problems repeated and %d solutions repeated" % (
          tp, ts, rp, rs))

In [70]:
### Additional Exercises 1
url = "https://pythonschool.net/basics/string-operation-and-math-unit-exercises/"
soup = get_page(url)
problems = get_problems(soup, url)
folder_partial = '1 - Variables/Additional Exercises'
solutions = get_solutions(problems, folder_partial)
insert(db, problems, solutions)

### Had to fix some problem-solution correspondence by hand



Got 0 problems inserted, 0 solutions inserted, 17 problems repeated and 0 solutions repeated


In [71]:
### Additional Exercises 2
url = "https://pythonschool.net/basics/selection-exercises/"
soup = get_page(url)
problems = get_problems(soup, url)
folder_partial = '2 - Selection/Additional Exercises 2'
solutions = get_solutions(problems, folder_partial)
insert(db, problems, solutions)

Got 0 problems inserted, 0 solutions inserted, 10 problems repeated and 0 solutions repeated




In [72]:
### Additional Exercises 3
url = "https://pythonschool.net/basics/iteration-exercises/"
soup = get_page(url)
problems = get_problems(soup, url)
folder_partial = '3 - Iteration/Additional Exercises 3'
del problems[1]
del problems[1]
del problems[1]
solutions = get_solutions(problems, folder_partial)
insert(db, problems, solutions)



Got 0 problems inserted, 0 solutions inserted, 11 problems repeated and 0 solutions repeated


In [73]:
### Additional Exercises 4
url = "https://pythonschool.net/basics/lists-exercises/"
soup = get_page(url)
problems = get_problems(soup, url, ['exercises', 'quiz', 'menu'])
folder_partial = '4 - Lists/Additional exercises 4'
solutions = get_solutions(problems, folder_partial)
insert(db, problems, solutions)



Got 0 problems inserted, 0 solutions inserted, 9 problems repeated and 0 solutions repeated


In [74]:
### Additional Exercises 5
url = "https://pythonschool.net/basics/functions-exercises/"
soup = get_page(url)
#problems = get_problems(soup, url, ['task'])
folder_partial = '5 - Functions/Additional Exercises 5'
solutions = get_solutions(problems, folder_partial)
insert(db, problems, solutions)

Got 0 problems inserted, 0 solutions inserted, 9 problems repeated and 0 solutions repeated




In [104]:
### Additional Exercises 6
urls = ["https://pythonschool.net/basics/reading-from-a-file-part-2/",
        "https://pythonschool.net/basics/writing-to-a-file/",
        "https://pythonschool.net/basics/extending-the-quiz/"]
problems = []
for url in urls:
    soup = get_page(url)
    problems_temp = get_problems(soup, url, ['exercise', 'extension'], ['li', 'p'])
    problems.extend(problems_temp)

del problems[2]
del problems[2]
del problems[2]
del problems[2]

problem_temp = ''
for p in problems[4:]:
    problem_temp += p['content'] + '. '
    
problems[4]['content'] = problem_temp

problems = problems[:5]
folder_partial = '6 - Files/Exercise Solutions'
solutions = get_solutions(problems, folder_partial)
insert(db, problems, solutions)



Got 5 problems inserted, 5 solutions inserted, 0 problems repeated and 0 solutions repeated


In [105]:
db.close_connection()