In [2]:
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

MODE = 'normal' # OR 'headless'
BLOCK_IMAGES = True
BLOCK_JS = False
DRIVER_PATH = '/Users/hadi/Documents/workspace/daneshkar/week 11 (project sraping)/shared/selenium/chromedriver'
service = Service(executable_path=DRIVER_PATH)

if MODE == 'headless':
    options = webdriver.ChromeOptions()
    options.headless = True
    options.add_argument("--window-size=1920,1200")
elif MODE == 'normal':
    options = webdriver.ChromeOptions()

if BLOCK_IMAGES or BLOCK_JS:
    ### This blocks images and javascript requests
    block_dict = {}
    if BLOCK_IMAGES:
        block_dict['images'] = 2
    if BLOCK_JS:
        block_dict['javascript'] = 2
    chrome_prefs = {
        "profile.default_content_setting_values": block_dict
    }
    options.experimental_options["prefs"] = chrome_prefs

driver = webdriver.Chrome(service=service, options=options)
driver.set_page_load_timeout(10)

In [4]:
def open_link(link, driver=driver):
    try:
        driver.get(link)
    except TimeoutException:
        pass

open_link('https://techcrunch.com')

## Handler Functions

In [5]:
def scroll_to_bottom(driver=driver):
    javaScript = "window.scrollBy(0, 100000);"
    driver.execute_script(javaScript)

## Get the list of all categories

In [6]:
categories_links_path = '//header[contains(@class, "site-navigation")]//ul[contains(@class, "menu")]/li[@class="menu__item"]/a'
categories_links = driver.find_elements(By.XPATH, categories_links_path)
main_categories_list = [(link.get_attribute('href'), link.text) for link in categories_links if '/category/' in link.get_attribute('href')]
main_categories_list

[('https://techcrunch.com/category/startups/', 'Startups'),
 ('https://techcrunch.com/category/venture/', 'Venture'),
 ('https://techcrunch.com/category/security/', 'Security'),
 ('https://techcrunch.com/category/artificial-intelligence/', 'AI'),
 ('https://techcrunch.com/category/cryptocurrency/', 'Crypto'),
 ('https://techcrunch.com/category/apps/', 'Apps')]

In [7]:
more_categories_btn = '//header[contains(@class, "site-navigation")]//ul[contains(@class, "menu")]/li[@class="menu__item more-link"]/a'
more_btn = driver.find_element(By.XPATH, more_categories_btn)
more_btn.click()

In [8]:
more_categories_links_path = '//header[contains(@class, "site-navigation")]//div[@class="desktop-nav navigation-desktop__flyout"]//li[@class="menu__item"]/a'
more_categories_links = driver.find_elements(By.XPATH, more_categories_links_path)
more_categories_list = [(link.get_attribute('href'), link.text) for link in more_categories_links if '/category/' in link.get_attribute('href')]
more_categories_list

[]

In [9]:
all_categories_list = main_categories_list + more_categories_list
all_categories_list

[('https://techcrunch.com/category/startups/', 'Startups'),
 ('https://techcrunch.com/category/venture/', 'Venture'),
 ('https://techcrunch.com/category/security/', 'Security'),
 ('https://techcrunch.com/category/artificial-intelligence/', 'AI'),
 ('https://techcrunch.com/category/cryptocurrency/', 'Crypto'),
 ('https://techcrunch.com/category/apps/', 'Apps')]

## Scrape the categories pages

In [10]:
all_data_together = list()

def save_all_data_together(data_list):
    all_data_together.append(data_list)
    # print('$$$', len(all_data_together))
    # df_list.append({"writer":writer, "title":title, "publiser":publiser, "country":country, "ISBN":ISBN})


In [11]:
def process_article_header(article_header):
    def get_article_header_type():
        try:
            article_category = article_header.find_element(By.XPATH, './div[@class="article__primary-category"]/a')
            return {
                'type': types['article_category'], 
                'text': article_category.text, 
                'href': article_category.get_attribute('href'),
            }
        except:
            try:
                article_label = article_header.find_element(By.XPATH, './div[@class="featured-article__label"]/div[contains(@class, "featured-article__label__text")]')
                return {
                    'type': types['article_label'], 
                    'text': article_label.text, 
                    'href': article_label.get_attribute('href'),
                }
            except:
                article_event_title = article_header.find_element(By.XPATH, './h3[@class="article__event-title"]/a')
                return {
                    'type': types['article_event'], 
                    'text': article_event_title.text, 
                    'href': article_event_title.get_attribute('href'),
                }
            
    types = {'article_category': 'Category', 'article_label': 'Label', 'article_event': 'Event'}
    title = article_header.find_element(By.XPATH, './h2[@class="post-block__title"]').text
    # //div[contains(@class, "river")]/div//article/header//div[@class="post-block__meta"]//span[@class="river-byline__authors"]//a
    author_name_el = article_header.find_element(By.XPATH, './/div[@class="post-block__meta"]//span[@class="river-byline__authors"]//a')
    author_name, author_link = author_name_el.text, author_name_el.get_attribute('href')
    # //div[contains(@class, "river")]/div//article/header//div[@class="post-block__meta"]//span[@class="river-byline__full-date-time__wrapper"]//time
    date_and_time = article_header.find_element(By.XPATH, './/div[@class="post-block__meta"]//div[@class="river-byline__full-date-time__wrapper"]//time').get_attribute('datetime')
    article_canonical_link = article_header.find_element(By.XPATH, './h2[@class="post-block__title"]/a').get_attribute('href')
    return {
        'title': title,
        'article_link': article_canonical_link,
        'header': get_article_header_type(),
        'author_name': author_name,
        'author_link': author_link,
        'date_and_time': date_and_time,
    }


def crawl_category_page(category_page_url, already_scraped_articles_num=0):
    if already_scraped_articles_num == 0:
        open_link(category_page_url)
    else:
        scroll_to_bottom()
    category_river_div_path = '//div[contains(@class, "river")]/div'
    river_div = driver.find_element(By.XPATH, category_river_div_path)
    articles_elements = river_div.find_elements(By.XPATH, '//article')
    articles_elements = articles_elements[already_scraped_articles_num:]
    for article in articles_elements:
        article_header = article.find_element(By.XPATH, './header')
        save_all_data_together(process_article_header(article_header))
    print('$$$', len(all_data_together))

In [12]:
def get_number_of_current_articles_in_page():
    category_river_div_path = '//div[contains(@class, "river")]/div'
    river_div = driver.find_element(By.XPATH, category_river_div_path)
    articles_elements = river_div.find_elements(By.XPATH, '//article')
    return len(articles_elements)

In [13]:
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_load_more():
    load_more_btn_xpath = '//*[@id="tc-main-content"]//button[contains(@class, "load-more")]'
    load_more_btn = driver.find_element(By.XPATH, load_more_btn_xpath)
    load_more_btn.click()
    time.sleep(10)
    # while True:
    #     try:
    # element = WebDriverWait(driver, 20).until(
    #     EC.presence_of_element_located((By.XPATH, load_more_btn_xpath))
    # )
            # return
        # except:
        #     pass

In [14]:
for i, (cat_link, cat_name) in enumerate(all_categories_list):
    already_scraped_articles_num = 0
    while True:
        crawl_category_page(cat_link, already_scraped_articles_num)
        already_scraped_articles_num = get_number_of_current_articles_in_page()
        print(already_scraped_articles_num)
        scroll_to_bottom()
        # try:
        click_load_more()
        # except:
        #     print('NO LOAD MORE BUTTON')
        # if input('CONTINUE LAZY LOADING?') == 'y':
        #     continue
        # else:
        #     break

$$$ 22
22
$$$ 42
42
$$$ 60
60
$$$ 80
80
$$$ 100
100
$$$ 119
119
$$$ 139
139
$$$ 154
154
$$$ 174
174
$$$ 194
194
$$$ 214
214
$$$ 229
229
$$$ 249
249
$$$ 269
269
$$$ 284
284
$$$ 304
304
$$$ 324
324
$$$ 344
344
$$$ 364
364
$$$ 384
384
$$$ 399
399
$$$ 419
419
$$$ 434
434
$$$ 454
454
$$$ 474
474
$$$ 489
489
$$$ 509
509
$$$ 529
529
$$$ 549
549
$$$ 569
569
$$$ 589
589
$$$ 609
609
$$$ 624
624
$$$ 644
644
$$$ 664
664
$$$ 684
684
$$$ 704
704
$$$ 724
724
$$$ 744
744
$$$ 764
764
$$$ 784
784
$$$ 804
804
$$$ 824
824
$$$ 844
844
$$$ 864
864
$$$ 884
884
$$$ 904
904
$$$ 924
924
$$$ 944
944
$$$ 962
962
$$$ 982
982
$$$ 1002
1002
$$$ 1022
1022
$$$ 1042
1042
$$$ 1062
1062
$$$ 1082
1082
$$$ 1102
1102
$$$ 1122
1122
$$$ 1142
1142
$$$ 1160
1160
$$$ 1180
1180
$$$ 1200
1200
$$$ 1219
1219
$$$ 1239
1239
$$$ 1259
1259
$$$ 1279
1279
$$$ 1296
1296
$$$ 1316
1316
$$$ 1336
1336
$$$ 1356
1356
$$$ 1376
1376
$$$ 1396
1396
$$$ 1416
1416
$$$ 1436
1436
$$$ 1456
1456
$$$ 1476
1476
$$$ 1496
1496
$$$ 1516
1516
$$$ 1536
1536
$$$ 

TimeoutException: Message: timeout: Timed out receiving message from renderer: 10.000
  (Session info: chrome=123.0.6312.122)
Stacktrace:
0   chromedriver                        0x00000001046f841c chromedriver + 4326428
1   chromedriver                        0x00000001046f08e4 chromedriver + 4294884
2   chromedriver                        0x000000010431c088 chromedriver + 278664
3   chromedriver                        0x0000000104306314 chromedriver + 189204
4   chromedriver                        0x0000000104306018 chromedriver + 188440
5   chromedriver                        0x0000000104303e60 chromedriver + 179808
6   chromedriver                        0x0000000104304930 chromedriver + 182576
7   chromedriver                        0x00000001043133b0 chromedriver + 242608
8   chromedriver                        0x0000000104326504 chromedriver + 320772
9   chromedriver                        0x0000000104304edc chromedriver + 184028
10  chromedriver                        0x00000001043262bc chromedriver + 320188
11  chromedriver                        0x000000010439743c chromedriver + 783420
12  chromedriver                        0x00000001043534e4 chromedriver + 505060
13  chromedriver                        0x0000000104353f5c chromedriver + 507740
14  chromedriver                        0x00000001046bb9b8 chromedriver + 4078008
15  chromedriver                        0x00000001046c0770 chromedriver + 4097904
16  chromedriver                        0x00000001046a255c chromedriver + 3974492
17  chromedriver                        0x00000001046c1088 chromedriver + 4100232
18  chromedriver                        0x0000000104693b4c chromedriver + 3914572
19  chromedriver                        0x00000001046e1690 chromedriver + 4232848
20  chromedriver                        0x00000001046e180c chromedriver + 4233228
21  chromedriver                        0x00000001046f0558 chromedriver + 4293976
22  libsystem_pthread.dylib             0x0000000195cb6f94 _pthread_start + 136
23  libsystem_pthread.dylib             0x0000000195cb1d34 thread_start + 8


In [1]:
import pandas as pd

df = pd.DataFrame({
    'type': row['header']['type'],
    'type_text': row['header']['text'],
    'type_link': row['header']['href'],
    'title': row['title'],
    'article_link': row['article_link'],
} for row in all_data_together)

NameError: name 'all_data_together' is not defined

In [None]:
df.to_csv('techcrunch.csv')

## Other

In [None]:
driver.save_screenshot('screenshot.png')

True

In [None]:
from selenium.webdriver import EC

try:
    element = webdriver.WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, "mySuperId"))
    )
finally:
    driver.quit()

In [None]:
javaScript = "window.scrollBy(0, 100000);"
driver.execute_script(javaScript)

In [None]:
# Install the Python selenium-wire library:
# pip install selenium-wire
from seleniumwire import webdriver

proxy_username = "USER_NAME"
proxy_password = "PASSWORD"
proxy_url = "http://proxy.scrapingbee.com"
proxy_port = 8886

options = {
    "proxy": {
        "http": f"http://{proxy_username}:{proxy_password}@{proxy_url}:{proxy_port}",
        "verify_ssl": False,
    },
}

URL = "https://httpbin.org/headers?json"

driver = webdriver.Chrome(
    executable_path="YOUR-CHROME-EXECUTABLE-PATH",
    seleniumwire_options=options,
)
driver.get(URL)