In [239]:
from scrapy import Selector, Request
import scrapy
from scrapy.http import HtmlResponse

# XPath Crash Course

In [171]:
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="master.css">
    <link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=League+Gothic&display=swap" rel="stylesheet">
</head>
<body>

    <h1> This is a H1 heading. </h1>

    <p class="p_class"> Paragraph right after H1 tag. </p>

    <p id="list_paragraph"> Let's see a list </p>

    <ol>
        <li> Item One </li>
        <li> Item Two </li>
        <li> Item Three </li>
    </ol>

    <h3> H3 heading is here. </h3>
    <div class="firstDiv">
        <p>I'm inside the first div.  </p>
        <p id="find_me"> Find me </p>
    </div>

    <div class="secondDiv">
        <p>I'm inside the second div.  </p>

    </div>

    <p id="unique_p"> Unique text outside both divs. </p>
    <p class="p_class other_class"> Example text </p>



    <h4 class="other_class"> This is heading 4. </h4>

    <p> Paragraph outside the div. </p>
    <div>
        <p class = "p_class"> Paragraph inside the div. </p>
        <p id="find_me"> Find me again </p>
        <p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>

    </div>

</body>
</html>
"""

## XPath Practice

In [172]:
# Parse the sample document once, then follow an absolute path from the root:
# the second <p> inside the first <div> that sits directly under <body>.
sel = Selector(text=html)
xpath = '/html/body/div[1]/p[2]'
sel.xpath(xpath).extract()

['<p id="find_me"> Find me </p>']

In [173]:
# Find all list items
xpath = '/html/body/ol//li'
sel.xpath(xpath).extract()

['<li> Item One </li>', '<li> Item Two </li>', '<li> Item Three </li>']

In [174]:
# Find all paragraphs inside div[3]
xpath = '/html/body/div[3]//p'
sel.xpath(xpath).extract()

['<p class="p_class"> Paragraph inside the div. </p>',
 '<p id="find_me"> Find me again </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [175]:
# Find every <p> that is the first <p> child of its parent element
xpath = '//p[1]'
sel.xpath(xpath).extract()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 "<p>I'm inside the first div.  </p>",
 "<p>I'm inside the second div.  </p>",
 '<p class="p_class"> Paragraph inside the div. </p>']

In [176]:
# XPath wildcard is used to ignore tag type
xpath = '/html/body/*' #finding all children of the body element
sel.xpath(xpath).extract()

['<h1> This is a H1 heading. </h1>',
 '<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p id="list_paragraph"> Let\'s see a list </p>',
 '<ol>\n        <li> Item One </li>\n        <li> Item Two </li>\n        <li> Item Three </li>\n    </ol>',
 '<h3> H3 heading is here. </h3>',
 '<div class="firstDiv">\n        <p>I\'m inside the first div.  </p>\n        <p id="find_me"> Find me </p>\n    </div>',
 '<div class="secondDiv">\n        <p>I\'m inside the second div.  </p>\n\n    </div>',
 '<p id="unique_p"> Unique text outside both divs. </p>',
 '<p class="p_class other_class"> Example text </p>',
 '<h4 class="other_class"> This is heading 4. </h4>',
 '<p> Paragraph outside the div. </p>',
 '<div>\n        <p class="p_class"> Paragraph inside the div. </p>\n        <p id="find_me"> Find me again </p>\n        <p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>\n\n    </div>']

## XPath Attributes

In [177]:
# Find all <p> elements whose class attribute is exactly "p_class" (elements with additional classes are not matched)
xpath = '//p[@class="p_class"]'
sel.xpath(xpath).extract()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p class="p_class"> Paragraph inside the div. </p>']

In [178]:
# Find all elements that have id = "find_me"
xpath = '//*[@id="find_me"]'
sel.xpath(xpath).extract()

['<p id="find_me"> Find me </p>', '<p id="find_me"> Find me again </p>']

In [179]:
# Find the "Find me" paragraph by selecting its parent div through the class attribute
xpath = '//div[@class="firstDiv"]/p[2]'
sel.xpath(xpath).extract()

['<p id="find_me"> Find me </p>']

## XPath Attributes Contains Function

In [180]:
# Find all elements whose class attribute contains "other_class" (substring match via the contains function)
xpath = '//*[contains(@class, "other_class")]'
sel.xpath(xpath).extract()

['<p class="p_class other_class"> Example text </p>',
 '<h4 class="other_class"> This is heading 4. </h4>']

In [181]:
# Get the value of the class attribute of the first <p> under <body> using XPath
xpath = '/html/body/p[1]/@class'
sel.xpath(xpath)

[<Selector xpath='/html/body/p[1]/@class' data='p_class'>]

# Setting up a Selector Object

In [182]:
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="master.css">
    <link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=League+Gothic&display=swap" rel="stylesheet">
</head>
<body>

    <h1> This is a H1 heading. </h1>

    <p class="p_class"> Paragraph right after H1 tag. </p>

    <p id="list_paragraph"> Let's see a list </p>

    <ol>
        <li> Item One </li>
        <li> Item Two </li>
        <li> Item Three </li>
    </ol>

    <h3> H3 heading is here. </h3>
    <div class="firstDiv">
        <p>I'm inside the first div.  </p>
        <p id="find_me"> Find me </p>
    </div>

    <div class="secondDiv">
        <p>I'm inside the second div.  </p>

    </div>

    <p id="unique_p"> Unique text outside both divs. </p>
    <p class="p_class other_class"> Example text </p>



    <h4 class="other_class"> This is heading 4. </h4>

    <p> Paragraph outside the div. </p>
    <div>
        <p> Paragraph inside the div. </p>
        <p id="find_me"> Find me again </p>
        <p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>

    </div>

</body>
</html>
"""

In [183]:
# Rebuild the selector so it parses the html string assigned in the previous cell.
sel = Selector(text=html)

In [184]:
sel.xpath('//p')

[<Selector xpath='//p' data='<p class="p_class"> Paragraph right a...'>,
 <Selector xpath='//p' data='<p id="list_paragraph"> Let\'s see a l...'>,
 <Selector xpath='//p' data="<p>I'm inside the first div.  </p>">,
 <Selector xpath='//p' data='<p id="find_me"> Find me </p>'>,
 <Selector xpath='//p' data="<p>I'm inside the second div.  </p>">,
 <Selector xpath='//p' data='<p id="unique_p"> Unique text outside...'>,
 <Selector xpath='//p' data='<p class="p_class other_class"> Examp...'>,
 <Selector xpath='//p' data='<p> Paragraph outside the div. </p>'>,
 <Selector xpath='//p' data='<p> Paragraph inside the div. </p>'>,
 <Selector xpath='//p' data='<p id="find_me"> Find me again </p>'>,
 <Selector xpath='//p' data='<p>Second paragraph inside the div wi...'>]

In [185]:
sel.xpath('//p').extract()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p id="list_paragraph"> Let\'s see a list </p>',
 "<p>I'm inside the first div.  </p>",
 '<p id="find_me"> Find me </p>',
 "<p>I'm inside the second div.  </p>",
 '<p id="unique_p"> Unique text outside both divs. </p>',
 '<p class="p_class other_class"> Example text </p>',
 '<p> Paragraph outside the div. </p>',
 '<p> Paragraph inside the div. </p>',
 '<p id="find_me"> Find me again </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [186]:
sel.xpath('//p').extract_first()

'<p class="p_class"> Paragraph right after H1 tag. </p>'

In [187]:
p_sel_list = sel.xpath('//p')

In [188]:
p_sel_list[-1].extract()

'<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>'

In [189]:
# Count every element in the parsed document; the wildcard //* matches any tag.
element_count = len(sel.xpath('//*'))
print("You have found: ", element_count)

You have found:  31


# Setting up Selector with Python requests library

In [190]:
import requests

In [191]:
# Download the raw bytes of the Wikipedia page at `url`.
url = "https://en.wikipedia.org/wiki/Python"
# timeout: without one, requests can wait indefinitely on an unresponsive server.
# raise_for_status(): stop here on a 4xx/5xx reply instead of parsing an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()
html = page.content

In [192]:
sel = Selector(text=html)

In [193]:
sel.xpath('//h2').extract()

['<h2 id="mw-toc-heading">Contents</h2>',
 '<h2><span class="mw-headline" id="Computing">Computing</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Python&amp;action=edit&amp;section=1" title="Edit section: Computing">edit</a><span class="mw-editsection-bracket">]</span></span></h2>',
 '<h2><span class="mw-headline" id="People">People</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Python&amp;action=edit&amp;section=2" title="Edit section: People">edit</a><span class="mw-editsection-bracket">]</span></span></h2>',
 '<h2><span class="mw-headline" id="Roller_coasters">Roller coasters</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Python&amp;action=edit&amp;section=3" title="Edit section: Roller coasters">edit</a><span class="mw-editsection-bracket">]</span></span></h2>',
 '<h2><span class="mw-headline" id="Vehi

In [194]:
# Count every element in the downloaded page; the wildcard //* matches any tag.
element_count = len(sel.xpath('//*'))
print("You have found: ", element_count)

You have found:  597


# CSS Locator

In [195]:
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="master.css">
    <link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=League+Gothic&display=swap" rel="stylesheet">
</head>
<body>

    <h1> This is a H1 heading. </h1>

    <p class="p_class"> Paragraph right after H1 tag. </p>

    <p id="list_paragraph"> Let's see a list </p>

    <ol>
        <li> Item One </li>
        <li> Item Two </li>
        <li> Item Three </li>
    </ol>

    <h3> H3 heading is here. </h3>
    <div class="firstDiv">
        I am inside the div.
        <p>I'm inside the first div.  </p>
        <p id="find_me"> Find me </p>
        <a  href="www.google.com" > Google </a>
    </div>

    <div class="secondDiv">
        <p>I'm inside the second div.  </p>

    </div>

    <p id="unique_p"> Unique text outside both divs. </p>
    <p class="p_class other_class"> Example text </p>



    <h4 class="other_class"> This is heading 4. </h4>

    <p> Paragraph outside the div. </p>
    <div>
        <p> Paragraph inside the div. </p>
        <p id="find_me"> Find me again </p>
        <p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>

    </div>

</body>
</html>
"""

## Rosetta CSStone

In [196]:
xpath = '/html/body/h1'

In [197]:
sel = Selector(text=html)

In [198]:
sel.xpath(xpath).extract()

['<h1> This is a H1 heading. </h1>']

In [199]:
css = 'html > body > h1'

In [200]:
sel.css(css).extract()

['<h1> This is a H1 heading. </h1>']

In [201]:
xpath = '//p'

In [202]:
sel.xpath(xpath).extract()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p id="list_paragraph"> Let\'s see a list </p>',
 "<p>I'm inside the first div.  </p>",
 '<p id="find_me"> Find me </p>',
 "<p>I'm inside the second div.  </p>",
 '<p id="unique_p"> Unique text outside both divs. </p>',
 '<p class="p_class other_class"> Example text </p>',
 '<p> Paragraph outside the div. </p>',
 '<p> Paragraph inside the div. </p>',
 '<p id="find_me"> Find me again </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [203]:
css = 'p'

In [204]:
sel.css(css).extract()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p id="list_paragraph"> Let\'s see a list </p>',
 "<p>I'm inside the first div.  </p>",
 '<p id="find_me"> Find me </p>',
 "<p>I'm inside the second div.  </p>",
 '<p id="unique_p"> Unique text outside both divs. </p>',
 '<p class="p_class other_class"> Example text </p>',
 '<p> Paragraph outside the div. </p>',
 '<p> Paragraph inside the div. </p>',
 '<p id="find_me"> Find me again </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [205]:
xpath = '//p[3]'

In [206]:
sel.xpath(xpath).extract()

['<p id="unique_p"> Unique text outside both divs. </p>',
 '<p>Second paragraph inside the div with some text <span> inside a span.</span>  </p>']

In [207]:
xpath = '/html/body/div[1]/p[2]'

In [208]:
sel.xpath(xpath).extract()

['<p id="find_me"> Find me </p>']

In [209]:
css = 'html > body > div:nth-of-type(1) > p:nth-of-type(2)'

In [210]:
sel.css(css).extract()

['<p id="find_me"> Find me </p>']

In [211]:
xpath = '/html/body//div/p[2]'
sel.xpath(xpath).extract()

['<p id="find_me"> Find me </p>', '<p id="find_me"> Find me again </p>']

In [212]:
css = 'html > body div > p:nth-of-type(2)'
sel.css(css).extract()

['<p id="find_me"> Find me </p>', '<p id="find_me"> Find me again </p>']

In [213]:
css = 'div.firstDiv > p#find_me'
sel.css(css).extract()

['<p id="find_me"> Find me </p>']

In [214]:
css = 'p#find_me'
sel.css(css).extract()

['<p id="find_me"> Find me </p>', '<p id="find_me"> Find me again </p>']

In [215]:
# Finding all elements with class = "p_class" even if other classes are present
css = '.p_class'
sel.css(css).extract()

['<p class="p_class"> Paragraph right after H1 tag. </p>',
 '<p class="p_class other_class"> Example text </p>']

In [216]:
# Find all direct child elements of elements with class "firstDiv"
css = '.firstDiv > *'
sel.css(css).extract()

["<p>I'm inside the first div.  </p>",
 '<p id="find_me"> Find me </p>',
 '<a href="www.google.com"> Google </a>']

## Attribute and Text Selection

### Attribute Selection

In [217]:
# XPath: get the href attribute of every <a> that is a direct child of a div with class "firstDiv"
xpath = '//div[@class="firstDiv"]/a/@href'
sel.xpath(xpath).extract()

['www.google.com']

In [218]:
# CSS: get the href attribute of every <a> that is a direct child of a div with class "firstDiv"
css = 'div.firstDiv > a::attr(href)'
sel.css(css).extract()

['www.google.com']

### Text Selection

In [219]:
# XPath: text nodes that are direct children of the div with class "firstDiv" (text of nested elements is not included)
xpath = '//div[@class="firstDiv"]/text()'
sel.xpath(xpath).extract()

['\n        I am inside the div.\n        ',
 '\n        ',
 '\n        ',
 '\n    ']

In [220]:
# XPath: all text nodes inside the div with class "firstDiv", including text of nested elements
xpath = '//div[@class="firstDiv"]//text()'
sel.xpath(xpath).extract()

['\n        I am inside the div.\n        ',
 "I'm inside the first div.  ",
 '\n        ',
 ' Find me ',
 '\n        ',
 ' Google ',
 '\n    ']

In [221]:
# Use XPath to get only the text of the elements with id = "find_me"
xpath = '//*[@id="find_me"]/text()'
sel.xpath(xpath).extract()

[' Find me ', ' Find me again ']

In [222]:
# Use XPath to get the full elements (tags included) with id = "find_me"
xpath = '//*[@id="find_me"]'
sel.xpath(xpath).extract()

['<p id="find_me"> Find me </p>', '<p id="find_me"> Find me again </p>']

In [223]:
# Using CSS to find text
css = 'p#find_me::text'
sel.css(css).extract()

[' Find me ', ' Find me again ']

In [224]:
# CSS: text nodes that are direct children of div.firstDiv (no space before ::text)
css = 'div.firstDiv::text'
sel.css(css).extract()

['\n        I am inside the div.\n        ',
 '\n        ',
 '\n        ',
 '\n    ']

In [225]:
# CSS: the space before ::text acts as a descendant combinator, so this returns the
# text nodes of div.firstDiv and of every element nested inside it
# (compare with 'div.firstDiv::text', which returns only the div's own text nodes).
css = 'div.firstDiv ::text'
sel.css(css).extract()

['\n        I am inside the div.\n        ',
 "I'm inside the first div.  ",
 '\n        ',
 ' Find me ',
 '\n        ',
 ' Google ',
 '\n    ']

# Response Objects

In [242]:
# Creates a scrapy Request for the Wikipedia URL and binds it to the name `response`.
# NOTE(review): the object is a Request, not a response; as far as this cell shows,
# nothing is downloaded here. Presumably an HtmlResponse (imported at the top) was
# intended for this section; confirm before calling response.xpath()/css() on it.
response  = Request(url='https://en.wikipedia.org/wiki/Python')