In [1]:
from bs4 import BeautifulSoup
import requests
from db import PythonProblems
from datetime import datetime
from urllib.parse import urljoin
from tqdm import tqdm

### Creating and connecting to database

In [14]:
# Instantiate the project's PythonProblems helper (imported from the local
# `db` module) with the SQLite file name and the SQL script name.
# NOTE(review): presumably this opens/creates 'python.sqlite' and applies
# 'init_tables.sql' as the schema — confirm against db.py.
# The resulting `db` object is used as a global by `insert()` and by the
# `db.rows` inspection cell.
db = PythonProblems('python.sqlite', 'init_tables.sql')

### Getting the pages to be crawled

In [15]:
%%time
urls = ["https://www.w3resource.com/python-exercises/python-basic-exercises.php", 
        "https://www.w3resource.com/python-exercises/basic/",
        "https://www.w3resource.com/python-exercises/string/"]
pms = []
for idx, url in enumerate(urls):
    response = requests.get(url)
    data = response.text
    soup = BeautifulSoup(data, 'html5lib')
    solution_links, problem_comparison, problems = get_problems(soup)
    
    if idx == 0:
        # Bug-fix, problem is written wrong in the first page
        problems[145]["content"] = "Write a Python program to test if a variable is a list or tuple or a set."
    
    solutions, potential_mismatch = get_solutions(solution_links, problem_comparison)
    pms.append(potential_mismatch)
    insert(problems, solutions)
#db.close_connection()

100%|██████████| 153/153 [00:00<00:00, 6498.38it/s]
151it [01:21,  2.02it/s]


Got 150 problems inserted, 150 solutions inserted, 1 problems repeated and 0 solutions repeated


100%|██████████| 19/19 [00:00<00:00, 4621.96it/s]
17it [00:08,  2.08it/s]

Got 17 problems inserted, 17 solutions inserted, 0 problems repeated and 0 solutions repeated
CPU times: user 16.9 s, sys: 324 ms, total: 17.3 s
Wall time: 1min 32s





## Functions

### Problems and solutions links

In [3]:
def get_problems(soup, page_url=None):
    """Extract exercise statements and their solution links from a listing page.

    Every ``<strong>`` element is treated as a candidate exercise marker. A
    candidate is kept only if a link whose text is exactly
    "Click me to see the sample solution" follows it in the document;
    candidates without such a link are skipped.

    Args:
        soup: Parsed listing page (the object returned by ``BeautifulSoup``).
        page_url: URL stored as the ``"link"`` of every problem. When omitted,
            the notebook-level global ``url`` (set by the crawl loop) is used,
            which keeps existing ``get_problems(soup)`` calls working.

    Returns:
        A tuple ``(solution_links, problem_comparison, problems)`` of three
        lists that share the same order and length:

        * ``solution_links`` - the ``<a>`` elements pointing at the solutions;
        * ``problem_comparison`` - the first sentence of each statement (text
          between the first and second period, stripped), used later to
          cross-check the solution page. Statements longer than one sentence
          are therefore compared by their first sentence only;
        * ``problems`` - dicts with the keys ``title``, ``link``, ``content``,
          ``retrieved_date`` and ``crawler``.

    Raises:
        IndexError: if the page has no ``<h2 class="heading">`` element.
    """
    editor_marker = "Go to the editor"
    solution_marker = "Click me to see the sample solution"

    # Only touch the global when the caller did not pass a URL explicitly.
    link = url if page_url is None else page_url

    # The heading looks like "Title [N exercises ...]"; keep the part before "[".
    title = soup.find_all("h2", class_="heading")[0].text.split("[")[0].strip()

    solution_links = []
    problem_comparison = []
    problems = []

    for e in tqdm(soup.find_all("strong")):
        # Walk forward through the following links until the solution link is
        # found; running off the end of the document means this <strong> is
        # not an exercise marker.
        solution_link = e.find_next("a")
        while solution_link is not None and solution_link.text != solution_marker:
            solution_link = solution_link.find_next("a")
        if solution_link is None:
            continue

        # The statement is the text of the enclosing element, plus an
        # immediately following <pre> block when there is one. getattr() keeps
        # this safe when the sibling is missing or is not a tag.
        container = e.parent
        problem_text = container.text
        sibling = container.next_sibling
        if getattr(sibling, "name", None) == "pre":
            problem_text += sibling.text

        # Comparison text: drop everything from the first link label onwards,
        # then take the sentence that follows the "<number>." prefix. Text
        # without any period is kept whole instead of raising IndexError.
        statement = problem_text.partition(editor_marker)[0]
        statement = statement.partition(solution_marker)[0]
        segments = statement.split('.')
        comparison = segments[1] if len(segments) > 1 else statement

        # Stored content: remove the link labels and the "<number>." prefix.
        cleaned = problem_text.replace(editor_marker, "").replace(solution_marker, "")
        _, dot, remainder = cleaned.partition('.')
        content = (remainder if dot else cleaned).strip()

        # Append to all three lists together so their indices stay aligned.
        solution_links.append(solution_link)
        problem_comparison.append(comparison.strip())
        problems.append({"title": title, "link": link,
                         "content": content, "retrieved_date": datetime.now(),
                         "crawler": "W3Resource"})

    return solution_links, problem_comparison, problems

### Solutions

In [4]:
def get_solutions(solution_links, problem_comparison, base_url=None, timeout=30):
    """Download every solution page and collect the sample solution code.

    Args:
        solution_links: Link elements (anything indexable by ``'href'``) in
            the same order as ``problem_comparison``.
        problem_comparison: Expected statement text for each link; compared
            against the text found on the solution page.
        base_url: URL against which relative ``href`` values are resolved.
            When omitted, the notebook-level global ``url`` (set by the crawl
            loop) is used, so existing two-argument calls keep working.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the crawl indefinitely.

    Returns:
        A tuple ``(solutions, potential_mismatch)``:

        * ``solutions`` - dicts with the keys ``content`` (text of the first
          ``<code>`` element), ``link``, ``retrieved_date``, ``ignore`` and
          ``idx`` (position of the link in ``solution_links``);
        * ``potential_mismatch`` - one string per page whose statement differs
          from the expected one, formatted as
          ``"<page text>\\n<expected text>\\n\\n"``.

    Raises:
        IndexError: if a solution page has no ``<h2>`` or no ``<code>`` element.
    """
    # Only touch the global when the caller did not pass a base URL explicitly.
    root = url if base_url is None else base_url

    solutions = []
    potential_mismatch = []
    # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can then read
    # the length of the list and render a real progress bar with an ETA.
    for idx, link in enumerate(tqdm(solution_links)):
        link_sol = urljoin(root, link['href'])
        response = requests.get(link_sol, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html5lib')

        # Compare problem inside with original problem. The statement is the
        # element right after the first <h2>; its last character (the closing
        # period) is dropped because the expected text carries none.
        page_statement = soup.find_all("h2")[0].find_next().text[:-1].strip()
        if page_statement != problem_comparison[idx].strip():
            potential_mismatch.append(page_statement + '\n' + problem_comparison[idx] + '\n\n')

        code = soup.find_all("code")
        solutions.append({"content": code[0].text, "link": link_sol,
                          "retrieved_date": datetime.now(), "ignore": False,
                          "idx": idx})
    return solutions, potential_mismatch

### Insert rows

In [5]:
def insert(problems, solutions):
    """Queue the crawled rows on the global ``db`` and report what was stored.

    Hands ``problems`` and ``solutions`` to ``db.insert_rows``, then calls
    ``db.populate()`` and prints the four counters it returns: problems
    inserted, solutions inserted, problems repeated and solutions repeated.
    Returns nothing.
    """
    db.insert_rows(problems, solutions)
    new_problems, new_solutions, repeated_problems, repeated_solutions = db.populate()
    report = "Got %d problems inserted, %d solutions inserted, %d problems repeated and %d solutions repeated"
    print(report % (new_problems, new_solutions, repeated_problems, repeated_solutions))

### Control check

In [12]:
# Print each statement pair that get_solutions flagged as a potential
# mismatch on the second crawled page (index 1 of `pms`).
second_page_mismatches = pms[1]
for mismatch in second_page_mismatches:
    print(mismatch)

Write a Python program to create all possible strings by using 'a', 'e', 'i', 'o', 'u'. Use the characters exactly once
Write a Python program to create all possible strings by using 'a', 'e', 'i', 'o', 'u'


Write a Python program to count the number of each character of a text file
Write a Python program to count the number of each character of a given text of a text file


Write a Python program to check the sum of three elements (each from an array) from three arrays is equal to a target value. Print all those three-element combinations
Write a Python program to check the sum of three elements (each from an array) from three arrays is equal to a target value




In [16]:
# Inspect the `rows` attribute of the database helper; the output below shows
# a list of dicts, each pairing a 'problem' with its 'solution'.
db.rows

[{'problem': {'content': 'Write a Python function that takes a sequence of numbers and determines if all the numbers are different from each other.',
   'crawler': 'W3Resource',
   'link': 'https://www.w3resource.com/python-exercises/basic/',
   'retrieved_date': datetime.datetime(2017, 11, 30, 22, 42, 46, 339567),
   'title': 'Python basic (Part-II)'},
  'solution': {'content': 'def test_distinct(data):\n  count = 0\n  for k in data:\n    for j in range(1, len(data) - 1):\n      if k == j:\n        count += 1\n        if count == 2:\n          return False\n          return True\nprint(test_distinct([2,4,5,7,9]))\nprint(test_distinct([2,4,5,5,7,9]))\n',
   'ignore': False,
   'link': 'https://www.w3resource.com/python-exercises/basic/python-basic-1-exercise-1.php',
   'problem_id': 351,
   'retrieved_date': datetime.datetime(2017, 11, 30, 22, 42, 47, 202034)}},
 {'problem': {'content': "Write a Python program to create all possible strings by using 'a', 'e', 'i', 'o', 'u'. Use the cha