In [1]:
# Uncomment the next line to install the required packages.
# %pip install -r requirements.txt


In [3]:
import ast
import csv
import multiprocessing
import threading
import traceback
from time import sleep

import selenium.common.exceptions
import urllib3.exceptions
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [12]:
def split_list(lst, k):
    """
    Cut a list into consecutive chunks of at most ``k`` items each.

    Args:
        lst (list): the list to cut up
        k (int): maximum number of items per chunk

    Returns:
        list: the chunks, in their original order; only the last chunk
        may hold fewer than ``k`` items

    Example:
        split_list([1,2,3,4,5],2) => [[1,2],[3,4],[5]]
    """
    chunks = []
    # Step through the list k positions at a time; slicing past the end is
    # safe and simply yields a shorter final chunk.
    for start in range(0, len(lst), k):
        chunks.append(lst[start:start + k])
    return chunks


# Quick demos: chunk a mixed list into groups of three, then show that a
# [:-1] slice drops the final element.
demo_chunks = split_list(["hi", 2, 3, 4, 5], 3)
print(demo_chunks)
print([1,2,3,4,5,6][:-1])


[['hi', 2, 3], [4, 5]]
[1, 2, 3, 4, 5]


In [13]:
def get_first_not_null_item(lsts):
    """
    Return the first item of a list whose length is not zero. If every item
    is empty, the last item of the list is returned instead.

    Args:
        lsts (list): input list; each item must support ``len()``

    Returns:
        string: the first non-empty item, or the last item when all are empty

    Raises:
        IndexError: if ``lsts`` itself is empty
    """
    # A private sentinel distinguishes "nothing found" from any real item.
    nothing_found = object()
    first = next((item for item in lsts if len(item) != 0), nothing_found)
    if first is nothing_found:
        return lsts[-1]
    return first
# Demo: the first non-empty string in the list is '1'.
first_item = get_first_not_null_item(['','1',''])
print(first_item)

1


In [14]:
# get_data() runs get_detail_project() from several threads at once; this lock
# keeps each thread's append to the shared output files in one piece.
_OUTPUT_LOCK = threading.Lock()


def _first_text(browser, css_selector):
    """Return the first non-empty ``.text`` among the elements matching ``css_selector``."""
    texts = [element.text
             for element in browser.find_elements(By.CSS_SELECTOR, css_selector)]
    return get_first_not_null_item(texts)


def get_detail_project(page, url,error_url_file):
    """
    Crawl one Kickstarter project detail page and append the scraped fields
    to ./data/result.txt.

    Two lines are appended per project: the URL, then the ``str()`` of the list
    [title, description, picture, pledged, goal, backers, days_to_go].

    Any failure (driver start-up, navigation, a missing element, a write error)
    is printed with its traceback, and "<page>,<url>" is appended to
    ``error_url_file`` so the URL can be retried later. The browser session is
    always shut down before returning.

    Args:
        page (int): page index of url
        url (string): link to the project page that needs to be crawled
        error_url_file (string): name of the file collecting the urls that
            could not be crawled (this list will be processed later)
    """
    browser = None
    try:
        # Started inside the try so that a driver start-up failure is also
        # recorded in the error file instead of silently killing the thread.
        browser = webdriver.Chrome(
            service=(Service(ChromeDriverManager().install()))
        )
        print("crawl url: ")
        print(url)
        browser.get(url)

        title = _first_text(
            browser, "h2.type-24-md.soft-black.mb1.project-name")
        description = _first_text(
            browser, "p[class='type-14 type-18-md soft-black project-description mb1']")
        picture = browser.find_element(
            By.CSS_SELECTOR, "img[class='aspect-ratio--object bg-black z3']").get_attribute("src")
        pledged = _first_text(browser, "span[class='ksr-green-500']")

        # The goal is read from the "money" spans nested inside the
        # 'inline-block-sm hide' spans; all candidates are collected and the
        # first non-empty one wins.
        goal_texts = [
            money.text
            for container in browser.find_elements(
                By.CSS_SELECTOR, "span[class='inline-block-sm hide']")
            for money in container.find_elements(
                By.CSS_SELECTOR, "span[class='money']")
        ]
        goal = get_first_not_null_item(goal_texts)

        backers = _first_text(
            browser, "div[class='block type-16 type-28-md bold dark-grey-500']")
        days_to_go = _first_text(
            browser, "span[class='block type-16 type-28-md bold dark-grey-500']")

        # Build the whole record first and write it in a single call so that
        # records from concurrent threads cannot interleave. UTF-8 is explicit
        # because the platform default codec may not be able to encode every
        # project title.
        record = (url + "\n"
                  + str([title, description, picture, pledged, goal, backers, days_to_go])
                  + "\n")
        with _OUTPUT_LOCK:
            with open("./data/result.txt", "a", encoding="utf-8") as result_file:
                result_file.write(record)
    except Exception:
        # Print first: if the error file itself cannot be written, the original
        # failure is still visible.
        traceback.print_exc()
        error_url = str(page)+","+url+"\n"
        with _OUTPUT_LOCK:
            with open(error_url_file, "a", encoding="utf-8") as error_url_file_obj:
                error_url_file_obj.write(error_url)
    finally:
        # quit() (rather than close()) ends the whole driver session.
        if browser is not None:
            browser.quit()
# get_data(current_page=current_page)
#get_detail_project(0,"https://www.kickstarter.com/projects/mlspencer/dragon-mage-deluxe-collectors-edition-hardcover")


In [5]:
def get_data(url,current_page,num_of_thread,error_url_file,checkpoint_file):
    """
    Crawl Kickstarter listing pages one after another, starting at
    ``current_page``, and crawl every project found on each page with
    get_detail_project(). Example listing url prefix:
        https://www.kickstarter.com/discover/advanced?woe_id=0&sort=magic&seed=2811224&page=

    The loop stops when a listing page yields no project links or when an
    error occurs (the traceback is printed). Whenever the function exits,
    including on a kernel interrupt, the index of the page that was being
    processed is written to ``checkpoint_file`` as ``{'page': N}``.

    Args:
        url (string): listing url prefix; the page index is appended to it
        current_page (int): index of the first listing page to crawl
        num_of_thread (int): number of project pages crawled concurrently
            (projects are processed in batches of this size)
        error_url_file (string): name of the file collecting project urls that
            could not be crawled
        checkpoint_file (string): name of the file that stores the index of the
            page being crawled
    """
    page = current_page
    try:
        while True:
            meta_url = url+str(page)
            print("meta_url: ")
            print(meta_url)

            browser = webdriver.Chrome(
                service=(Service(ChromeDriverManager().install()))
            )
            try:
                browser.get(meta_url)
                # presumably gives the listing time to finish rendering its
                # project cards - TODO confirm
                sleep(2)
                hrefs = [
                    anchor.get_attribute("href")
                    for anchor in browser.find_elements(
                        By.CSS_SELECTOR, "a[class='block img-placeholder w100p']")
                ]
            finally:
                # Only the href strings are needed from here on, so the listing
                # browser is shut down on success and on failure alike.
                browser.quit()

            # get_attribute() may return None for an anchor without an href.
            prj_links = [href for href in hrefs
                         if href and href.endswith("?ref=discovery")]
            print("prj_links: ")
            for link in prj_links:
                print(link)
            print(len(prj_links))

            if not prj_links:
                # Nothing to crawl on this page: stop here and let the
                # checkpoint below record where we got to.
                print("no project links found, stopping")
                break

            split_prj_links = split_list(prj_links, num_of_thread)
            print("split_prj_links: ")
            print(split_prj_links)
            for batch in split_prj_links:
                # One thread per project in the batch; the whole batch must
                # finish before the next one starts. The page passed along is
                # the page actually being crawled, so the error file records
                # where each failed url came from.
                threads = [threading.Thread(
                    target=get_detail_project, args=(page, link, error_url_file))
                    for link in batch]
                for thread in threads:
                    thread.start()
                for thread in threads:
                    thread.join()
            page = page+1
    except Exception:
        traceback.print_exc()
    finally:
        with open(checkpoint_file, "w") as checkpoint_obj:
            checkpoint_obj.write(str({"page": page}))


In [6]:
# link to the website Kickstarter
url = 'https://www.kickstarter.com/discover/advanced?woe_id=0&sort=magic&seed=2811224&page='

# files used to resume the crawl and to record the urls that failed
checkpoint_file = './data/checkpoint.csv'
error_url_file = './data/error_url.csv'

# get the current page
# The checkpoint holds a Python dict literal such as {'page': 2};
# ast.literal_eval parses it without executing arbitrary code the way eval would.
with open(checkpoint_file, "r") as checkpoint_obj:
    checkpoint = ast.literal_eval(checkpoint_obj.readline())
current_page = checkpoint["page"]
print(current_page)

# start to crawl
# NOTE(review): chrome_options is built here but is not passed to any
# webdriver.Chrome(...) call visible in this notebook, so 'detach' has no effect.
chrome_options = Options()
chrome_options.add_experimental_option('detach',True)
get_data(current_page=current_page,url=url,num_of_thread=4,checkpoint_file=checkpoint_file,error_url_file=error_url_file)


2
meta_url: 
https://www.kickstarter.com/discover/advanced?woe_id=0&sort=magic&seed=2811224&page=2


Traceback (most recent call last):
  File "e:\software\anaconda\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
    httplib_response = self._make_request(
  File "e:\software\anaconda\lib\site-packages\urllib3\connectionpool.py", line 449, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "e:\software\anaconda\lib\site-packages\urllib3\connectionpool.py", line 444, in _make_request
    httplib_response = conn.getresponse()
  File "e:\software\anaconda\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "e:\software\anaconda\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "e:\software\anaconda\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "e:\software\anaconda\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing conne

### Test some functions

In [7]:
print("geats")
# Smoke test: open listing pages 4 and 5 in turn, each in a fresh browser,
# and make sure every browser session is shut down again.
meta_url = url+str(4)
for page_url in (meta_url, url+str(5)):
    browser = webdriver.Chrome(
        service=(Service(ChromeDriverManager().install()))
    )
    try:
        browser.get(page_url)
        sleep(1)
    finally:
        # quit() ends the driver session even if loading the page failed.
        browser.quit()
# get_data(current_page=current_page)
# get_detail_project(0,"https://www.kickstarter.com/projects/mlspencer/dragon-mage-deluxe-collectors-edition-hardcover")


geats


In [8]:
# f = open(file,"a")
# f.writelines("hello")
# f.writelines("hi")
# f.close()
# f = open(file,"r")
# l = list(map(lambda a:a.strip().split(","),f.readlines()))
# print(l)
# Sanity check: an empty string has length 0.
print("hello")
x = str()
print(len(x))
def task(x):
    """Print a marker line, then the square of ``x`` (thread demo target)."""
    print("multi thread")
    squared = x * x
    print(squared)
data = [[1,2],[2,3],[4,5]]
inputs = [100,2,3,4,5,7]

# keep only the even inputs
res = [i for i in inputs if i % 2==0]
print(res)

# One thread per input. All threads are started before any is joined;
# joining inside the start loop would make them run strictly one at a time.
threads = [threading.Thread(target=task,args=[m]) for m in inputs]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
# pool.close()
#pool.join()
#print("results: {}".format(results))


hello
0
[100, 2, 4]
multi thread
10000
multi thread
4
multi thread
9
multi thread
16
multi thread
25
multi thread
49


In [9]:
# Print a greeting followed by the integers 0 through 4, one per line.
print("Hello world")
for i in range(5):
    print(i)


Hello world
0
1
2
3
4
