In [44]:
from scrapy import Selector

# XPath Crash Course

In [45]:
# Sample HTML page used as the parsing target for the XPath examples below.
# Note that id="find_me" appears on two different <p> elements, so id-based
# queries against this page return two matches.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="master.css">
    <link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=League+Gothic&display=swap" rel="stylesheet">
</head>
<body>

    <h1> This is a H1 heading. </h1>

    <p class="p_class"> Paragraph right after H1 tag. </p>

    <p id="list_paragraph"> Let's see a list </p>

    <ol>
        <li> Item One </li>
        <li> Item Two </li>
        <li> Item Three </li>
    </ol>

    <h3> H3 heading is here. </h3>
    <div class="firstDiv">
        <p>I'm inside the first div.  </p>
        <p id="find_me"> Find me </p>
    </div>

    <div class="secondDiv">
        <p>I'm inside the second div.  </p>

    </div>

    <p id="unique_p"> Unique text outside both divs. </p>
    <p class="p_class other_class"> Example text </p>



    <h4 class="other_class"> This is heading 4. </h4>

    <p> Paragraph outside the div. </p>
    <div>
        <p class = "p_class"> Paragraph inside the div. </p>
        <p id="find_me"> Find me again </p>
        <p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>

    </div>

</body>
</html>
"""

## XPath Practice

In [46]:
# Build the selector once from the sample page; later cells reuse `sel`.
sel = Selector(text=html)
# Absolute path: second <p> child of the first <div> under <body> -> "Find me".
xpath = "/html/body/div[1]/p[2]"
sel.xpath(xpath).getall()

['<p id="find_me"> Find me </p>']

In [47]:
# Every <li> at any depth below the <ol> that is a direct child of <body>.
xpath = "/html/body/ol//li"
sel.xpath(xpath).getall()

['<li> Item One </li>', '<li> Item Two </li>', '<li> Item Three </li>']

In [48]:
# All <p> descendants of the third <div> that is a direct child of <body>.
xpath = "/html/body/div[3]//p"
sel.xpath(xpath).getall()

['<p class="p_class"> Paragraph inside the div. </p>',
 '<p id="find_me"> Find me again </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [49]:
# //p[1] matches every <p> that is the first <p> child of its own parent,
# so it yields one paragraph per parent element rather than a single node.
xpath = "//p[1]"
sel.xpath(xpath).getall()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 "<p>I'm inside the first div.  </p>",
 "<p>I'm inside the second div.  </p>",
 '<p class="p_class"> Paragraph inside the div. </p>']

In [50]:
# The * wildcard matches any tag name; here it selects every direct child
# element of <body>.
xpath = "/html/body/*"
sel.xpath(xpath).getall()

['<h1> This is a H1 heading. </h1>',
 '<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p id="list_paragraph"> Let\'s see a list </p>',
 '<ol>\n        <li> Item One </li>\n        <li> Item Two </li>\n        <li> Item Three </li>\n    </ol>',
 '<h3> H3 heading is here. </h3>',
 '<div class="firstDiv">\n        <p>I\'m inside the first div.  </p>\n        <p id="find_me"> Find me </p>\n    </div>',
 '<div class="secondDiv">\n        <p>I\'m inside the second div.  </p>\n\n    </div>',
 '<p id="unique_p"> Unique text outside both divs. </p>',
 '<p class="p_class other_class"> Example text </p>',
 '<h4 class="other_class"> This is heading 4. </h4>',
 '<p> Paragraph outside the div. </p>',
 '<div>\n        <p class="p_class"> Paragraph inside the div. </p>\n        <p id="find_me"> Find me again </p>\n        <p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>\n\n    </div>']

## XPath Attributes

In [51]:
# [@class="p_class"] is an exact string comparison on the attribute value, so
# <p class="p_class other_class"> is not matched.
xpath = '//p[@class="p_class"]'
sel.xpath(xpath).getall()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p class="p_class"> Paragraph inside the div. </p>']

In [85]:
# Any element, regardless of tag, whose id attribute equals "find_me".
xpath = '//*[@id="find_me"]'
sel.xpath(xpath).getall()

['<p id="find_me"> Find me </p>', '<p id="find_me"> Find me again </p>']

In [86]:
# Anchor on the div's class attribute instead of its position, then take the
# second <p> child -> the first "Find me" paragraph.
xpath = '//div[@class="firstDiv"]/p[2]'
sel.xpath(xpath).getall()

['<p id="find_me"> Find me </p>']

## XPath Attributes Contains Function

In [89]:
# contains() does a substring test on the attribute value, so this matches any
# element whose class attribute includes the text "other_class".
xpath = '//*[contains(@class, "other_class")]'
sel.xpath(xpath).getall()

['<p class="p_class other_class"> Example text </p>',
 '<h4 class="other_class"> This is heading 4. </h4>']

In [91]:
# A trailing /@class step selects the attribute node itself; the result is a
# list of Selector objects, shown here without extracting the string value.
xpath = "/html/body/p[1]/@class"
sel.xpath(xpath)

[<Selector xpath='/html/body/p[1]/@class' data='p_class'>]

# Setting up a Selector Object

In [52]:
# Sample HTML page for the Selector walkthrough. The first <p> inside the last
# <div> carries no class attribute in this copy of the page.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="master.css">
    <link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=League+Gothic&display=swap" rel="stylesheet">
</head>
<body>

    <h1> This is a H1 heading. </h1>

    <p class="p_class"> Paragraph right after H1 tag. </p>

    <p id="list_paragraph"> Let's see a list </p>

    <ol>
        <li> Item One </li>
        <li> Item Two </li>
        <li> Item Three </li>
    </ol>

    <h3> H3 heading is here. </h3>
    <div class="firstDiv">
        <p>I'm inside the first div.  </p>
        <p id="find_me"> Find me </p>
    </div>

    <div class="secondDiv">
        <p>I'm inside the second div.  </p>

    </div>

    <p id="unique_p"> Unique text outside both divs. </p>
    <p class="p_class other_class"> Example text </p>



    <h4 class="other_class"> This is heading 4. </h4>

    <p> Paragraph outside the div. </p>
    <div>
        <p> Paragraph inside the div. </p>
        <p id="find_me"> Find me again </p>
        <p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>

    </div>

</body>
</html>
"""

In [53]:
# Rebuild the selector so it parses the HTML string defined in the cell above.
sel = Selector(text=html)

In [54]:
# Querying returns a list of Selector objects, one per matching <p>.
sel.xpath("//p")

[<Selector xpath='//p' data='<p class="p_class"> Paragraph right a...'>,
 <Selector xpath='//p' data='<p id="list_paragraph"> Let\'s see a l...'>,
 <Selector xpath='//p' data="<p>I'm inside the first div.  </p>">,
 <Selector xpath='//p' data='<p id="find_me"> Find me </p>'>,
 <Selector xpath='//p' data="<p>I'm inside the second div.  </p>">,
 <Selector xpath='//p' data='<p id="unique_p"> Unique text outside...'>,
 <Selector xpath='//p' data='<p class="p_class other_class"> Examp...'>,
 <Selector xpath='//p' data='<p> Paragraph outside the div. </p>'>,
 <Selector xpath='//p' data='<p> Paragraph inside the div. </p>'>,
 <Selector xpath='//p' data='<p id="find_me"> Find me again </p>'>,
 <Selector xpath='//p' data='<p>Second paragraph inside the div wi...'>]

In [55]:
# Serialize every matched <p> to its HTML string.
sel.xpath("//p").getall()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p id="list_paragraph"> Let\'s see a list </p>',
 "<p>I'm inside the first div.  </p>",
 '<p id="find_me"> Find me </p>',
 "<p>I'm inside the second div.  </p>",
 '<p id="unique_p"> Unique text outside both divs. </p>',
 '<p class="p_class other_class"> Example text </p>',
 '<p> Paragraph outside the div. </p>',
 '<p> Paragraph inside the div. </p>',
 '<p id="find_me"> Find me again </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [56]:
# Only the first match, returned as a single string instead of a list.
sel.xpath("//p").get()

'<p class="p_class"> Paragraph right after H1 tag. </p>'

In [57]:
# Keep the selector list so individual matches can be indexed afterwards.
p_sel_list = sel.xpath("//p")

In [58]:
# Negative indexing works as on a regular list: serialize the last matched <p>.
p_sel_list[-1].get()

'<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>'

In [59]:
# '//*' matches every element in the document, so the length of the result
# is the total element count of the parsed page.
print("You have found: ", len(sel.xpath("//*")))

You have found:  31


# Setting up a Selector with the Python requests library

In [60]:
import requests

In [61]:
# Fetch the live Wikipedia "Python" page to parse instead of the inline sample.
url = "https://en.wikipedia.org/wiki/Python"
# requests applies no timeout by default, so set one to keep a stalled
# connection from hanging the notebook.
response = requests.get(url, timeout=10)
# Raise on 4xx/5xx responses so an error page is never parsed by mistake.
response.raise_for_status()
# Keep the raw response bytes, as before, for the Selector built in the next cell.
html = response.content

In [62]:
# Re-point `sel` at the downloaded page: `html` now holds the response body
# fetched in the previous cell rather than the inline sample document.
sel = Selector(text=html)

In [63]:
# All <h2> headings of the downloaded page, serialized to HTML strings.
sel.xpath("//h2").getall()

['<h2 id="mw-toc-heading">Contents</h2>',
 '<h2><span class="mw-headline" id="Computing">Computing</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Python&amp;action=edit&amp;section=1" title="Edit section: Computing">edit</a><span class="mw-editsection-bracket">]</span></span></h2>',
 '<h2><span class="mw-headline" id="People">People</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Python&amp;action=edit&amp;section=2" title="Edit section: People">edit</a><span class="mw-editsection-bracket">]</span></span></h2>',
 '<h2><span class="mw-headline" id="Roller_coasters">Roller coasters</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Python&amp;action=edit&amp;section=3" title="Edit section: Roller coasters">edit</a><span class="mw-editsection-bracket">]</span></span></h2>',
 '<h2><span class="mw-headline" id="Vehi

In [64]:
# Count every element in the downloaded page ('//*' matches any element).
print("You have found: ", len(sel.xpath("//*")))

You have found:  597


# CSS Locator

In [65]:
# Sample HTML page used for the XPath vs. CSS locator comparisons that follow.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="master.css">
    <link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=League+Gothic&display=swap" rel="stylesheet">
</head>
<body>

    <h1> This is a H1 heading. </h1>

    <p class="p_class"> Paragraph right after H1 tag. </p>

    <p id="list_paragraph"> Let's see a list </p>

    <ol>
        <li> Item One </li>
        <li> Item Two </li>
        <li> Item Three </li>
    </ol>

    <h3> H3 heading is here. </h3>
    <div class="firstDiv">
        <p>I'm inside the first div.  </p>
        <p id="find_me"> Find me </p>
    </div>

    <div class="secondDiv">
        <p>I'm inside the second div.  </p>

    </div>

    <p id="unique_p"> Unique text outside both divs. </p>
    <p class="p_class other_class"> Example text </p>



    <h4 class="other_class"> This is heading 4. </h4>

    <p> Paragraph outside the div. </p>
    <div>
        <p> Paragraph inside the div. </p>
        <p id="find_me"> Find me again </p>
        <p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>

    </div>

</body>
</html>
"""

## Rosetta CSStone

In [66]:
# Absolute XPath to the <h1> that is a direct child of <body>.
xpath = "/html/body/h1"

In [67]:
# Parse the sample page again so `sel` no longer refers to the Wikipedia page.
sel = Selector(text=html)

In [68]:
# Run the XPath defined above and serialize the matching <h1>.
sel.xpath(xpath).getall()

['<h1> This is a H1 heading. </h1>']

In [69]:
# CSS counterpart of /html/body/h1: `>` is the direct-child combinator.
css = "html > body > h1"

In [70]:
# Same element as the XPath query, this time located through the CSS selector.
sel.css(css).getall()

['<h1> This is a H1 heading. </h1>']

In [71]:
# Leading // searches the whole document: every <p>, at any depth.
xpath = "//p"

In [72]:
# Serialize every <p> matched by the XPath above.
sel.xpath(xpath).getall()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p id="list_paragraph"> Let\'s see a list </p>',
 "<p>I'm inside the first div.  </p>",
 '<p id="find_me"> Find me </p>',
 "<p>I'm inside the second div.  </p>",
 '<p id="unique_p"> Unique text outside both divs. </p>',
 '<p class="p_class other_class"> Example text </p>',
 '<p> Paragraph outside the div. </p>',
 '<p> Paragraph inside the div. </p>',
 '<p id="find_me"> Find me again </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [73]:
# A bare tag name in CSS matches that tag anywhere in the document, like //p.
css = "p"

In [74]:
# The CSS query returns the same paragraphs as the //p XPath query.
sel.css(css).getall()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p id="list_paragraph"> Let\'s see a list </p>',
 "<p>I'm inside the first div.  </p>",
 '<p id="find_me"> Find me </p>',
 "<p>I'm inside the second div.  </p>",
 '<p id="unique_p"> Unique text outside both divs. </p>',
 '<p class="p_class other_class"> Example text </p>',
 '<p> Paragraph outside the div. </p>',
 '<p> Paragraph inside the div. </p>',
 '<p id="find_me"> Find me again </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [75]:
# Every <p> that is the third <p> child of its own parent element.
xpath = "//p[3]"

In [76]:
# One match per parent that has at least three <p> children.
sel.xpath(xpath).getall()

['<p id="unique_p"> Unique text outside both divs. </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [77]:
# Second <p> child of the first <div> directly under <body>.
xpath = "/html/body/div[1]/p[2]"

In [78]:
# Serialize the single paragraph addressed by the positional XPath above.
sel.xpath(xpath).getall()

['<p id="find_me"> Find me </p>']

In [79]:
# :nth-of-type(n) plays the role of XPath's positional [n] predicate, so this
# mirrors /html/body/div[1]/p[2].
css = "html > body > div:nth-of-type(1) > p:nth-of-type(2)"

In [80]:
# Same paragraph as the positional XPath, located via the CSS selector.
sel.css(css).getall()

['<p id="find_me"> Find me </p>']

In [81]:
# Second <p> child of any <div> found at any depth below <body>.
xpath = "/html/body//div/p[2]"
sel.xpath(xpath).getall()

['<p id="find_me"> Find me </p>', '<p id="find_me"> Find me again </p>']

In [82]:
# In CSS a space is the descendant combinator (XPath //) and `>` is the
# child combinator (XPath /), so this mirrors /html/body//div/p[2].
css = "html > body div > p:nth-of-type(2)"
sel.css(css).getall()

['<p id="find_me"> Find me </p>', '<p id="find_me"> Find me again </p>']

In [83]:
# `.name` selects by class and `#name` by id: the <p id="find_me"> that is a
# direct child of <div class="firstDiv">.
css = "div.firstDiv > p#find_me"
sel.css(css).getall()

['<p id="find_me"> Find me </p>']

In [84]:
# Without a parent constraint, every <p> with id="find_me" is returned; the
# sample page uses that id twice.
css = "p#find_me"
sel.css(css).getall()

['<p id="find_me"> Find me </p>', '<p id="find_me"> Find me again </p>']