# Wykład 7 - Web Scraping

## Spiders
## Generators
## Selectors

---
## Lista obecności
## http://bit.ly/SKNSwyklad0403 

---

## Zadanie 1

Narysuj wykres o kształcie okręgu

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Build a circle patch centred at (0, 0) with radius 2.
circle=plt.Circle((0,0),2)
# Attach the patch to the current axes.
ax=plt.gca()
ax.add_patch(circle)

# 'scaled' gives both axes the same scale, so the circle is not drawn as an ellipse.
plt.axis('scaled')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# 100 evenly spaced x values covering [-1, 1].
x = np.linspace(-1, 1, 100)

# Upper and lower half of the unit circle: y = +/- sqrt(1 - x^2).
plt.plot(x,np.sqrt(1-x*x))
plt.plot(x,-np.sqrt(1-x*x))

# No aspect setting is applied here; the axes keep their default scaling.
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# 100 evenly spaced x values covering [-1, 1].
x = np.linspace(-1, 1, 100)

# Upper and lower half of the unit circle: y = +/- sqrt(1 - x^2).
plt.plot(x,np.sqrt(1-x*x))
plt.plot(x,-np.sqrt(1-x*x))
# Use the same scale on both axes so the two halves form a round circle.
plt.axis('scaled')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot circle of radius 3
# The same circle is drawn in the four corner cells of a 3x3 subplot grid,
# each time with a different axis configuration.

# Angles for one full turn, 100 samples.
an = np.linspace(0, 2*np.pi, 100)

# Top-left: default axis scaling.
plt.subplot(331)
plt.plot(3*np.cos(an), 3*np.sin(an))
plt.title('not equal, looks like ellipse', fontsize=10)

# Top-right: equal scaling on both axes.
plt.subplot(333)
plt.plot(3*np.cos(an), 3*np.sin(an))
plt.axis('equal')
plt.title('equal, looks like circle', fontsize=10)

# Bottom-left: equal scaling, then explicit limits [xmin, xmax, ymin, ymax].
plt.subplot(337)
plt.plot(3*np.cos(an), 3*np.sin(an))
plt.axis('equal')
plt.axis([-3, 3, -3, 3])
plt.title('looks like circle, even after changing limits', fontsize=10)

# Bottom-right: as above, plus an extra line from (0, 0) to (3, 3).
plt.subplot(339)
plt.plot(3*np.cos(an), 3*np.sin(an))
plt.axis('equal')
plt.axis([-3, 3, -3, 3])
plt.plot([0, 3], [0, 3])
plt.title('still equal after adding line', fontsize=10)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Angles for one full turn, 100 samples.
an = np.linspace(0, 2*np.pi, 100)

# Parametric circle: x = 3*cos(t), y = 3*sin(t); the extra factor 1 leaves the radius constant.
plt.plot(3*np.cos(an)*1, 3*np.sin(an)*1)
# The trailing semicolon suppresses the notebook's echo of the call's return value.
plt.axis('equal');

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Angles for five full turns, 500 samples.
an = np.linspace(0, 5*2*np.pi, 5*100)

# Scaling the radius by the angle itself (r = 3*t) turns the circle into a spiral.
plt.plot(3*np.cos(an)*an, 3*np.sin(an)*an)
# The trailing semicolon suppresses the notebook's echo of the call's return value.
plt.axis('equal');

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Angles for five full turns, 500 samples.
an = np.linspace(0, 5*2*np.pi, 5*100)

# Polar plot with theta = an and r = an, i.e. the radius grows linearly with the angle.
plt.polar(an, an);

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Angles for five full turns, 500 samples.
an = np.linspace(0, 5*2*np.pi, 5*100)

# Same polar curve (theta = an, r = an), drawn with the 'r+' format: red plus markers.
plt.polar(an, an, 'r+');

https://matplotlib.org/3.1.3/tutorials/introductory/sample_plots.html

---

## Web Scraping

### Skrypt scrapy1.py

```python
import scrapy
from scrapy.crawler import CrawlerProcess

class PythonEventsSpider(scrapy.Spider):
    """Spider that collects the events listed on the python.org events page.

    Every parsed event is stored as a dict with the keys ``name``,
    ``location`` and ``time`` in ``self.found_events``, so the calling
    script can read the results from the spider instance after the crawl.
    """

    name = 'pythoneventsspider'

    start_urls = ['https://www.python.org/events/python-events/',]

    def __init__(self, *args, **kwargs):
        """Create the spider with its own, initially empty, result list."""
        super().__init__(*args, **kwargs)
        # Per-instance list: a class-level list would be shared by every
        # instance, so repeated crawls would accumulate each other's results.
        self.found_events = []

    def parse(self, response):
        """Append one dict per event list item found in ``response``."""
        for event in response.xpath('//ul[contains(@class, "list-recent-events")]/li'):
            # The XPath expressions below are relative to the current <li>.
            event_details = {
                'name': event.xpath('h3[@class="event-title"]/a/text()').extract_first(),
                'location': event.xpath('p/span[@class="event-location"]/text()').extract_first(),
                'time': event.xpath('p/time/text()').extract_first(),
            }
            self.found_events.append(event_details)
            
# Run the spider inside this script and print the events it collected.
print("Scrapy Example 1")
# Restrict Scrapy's logging to errors.
process = CrawlerProcess({ 'LOG_LEVEL': 'ERROR'})
process.crawl(PythonEventsSpider)
# Take a handle on the spider instance before the crawl is started.
# NOTE(review): presumably required because finished crawlers are removed
# from process.crawlers -- confirm against the Scrapy version in use.
spider = next(iter(process.crawlers)).spider
# Start the crawl; the results are read from the spider afterwards.
process.start()

for event in spider.found_events: 
    print(event)
    
```

```python
import scrapy

class BlogSpider(scrapy.Spider):
    """Spider that yields post titles and follows the "next posts" links."""

    name = 'blogspider'
    start_urls = ['https://blog.scrapinghub.com']

    def parse(self, response):
        """Yield a ``{'title': ...}`` item per post heading, then one request per next-page link."""
        post_headings = response.css('.post-header>h2')
        for heading in post_headings:
            title_text = heading.css('a ::text').get()
            yield {'title': title_text}

        # Pagination: each followed page is handled by this same callback.
        pagination_links = response.css('a.next-posts-link')
        for link in pagination_links:
            yield response.follow(link, self.parse)
```

In [None]:
> scrapy runspider myspider.py

### *Generatory* są mechanizmem
* tworzenia iteratorów
* Zwracają dane przez *yield*
* Każde wywołanie `next()` wznawia działanie od miejsca, w którym zakończył się poprzedni krok
* Metoda `__next__()` tworzona jest automatycznie

In [None]:
range(5, -1, -1)

In [None]:
list(range(5, -1, -1))

In [None]:
for i in range(5, -1, -1):
    print(i)

In [None]:
def reverse(data):
    """Generate the items of ``data`` one at a time, from the last to the first.

    ``data`` must support ``len()`` and integer indexing.
    """
    position = len(data)
    while position > 0:
        position -= 1
        yield data[position]

In [None]:
for c in reverse('Python'):
    print (c)

In [None]:
def reverse(data):
    """Generate the items of ``data`` from last to first, printing each index first.

    The printed index shows when the generator body actually runs: once per
    item requested by the consumer.
    """
    for index in reversed(range(len(data))):
        print(index)
        yield data[index]

In [None]:
for c in reverse('Python'):
    print (c)

## Selectors

- Parsel https://parsel.readthedocs.io/en/latest/
  - Using XPath https://www.w3schools.com/xml/xpath_intro.asp
  - Using CSS https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors
- Beautiful Soup https://www.crummy.com/software/BeautifulSoup/bs4/doc/

![HTML Tree](img/pobrane.png "Html Tree")

In [None]:
from scrapy.selector import Selector

body = '<html><body><span>good</span></body></html>'
Selector(text=body).xpath('//span/text()').get()

In [None]:
from parsel import Selector

sel = Selector(text=u"""<html>
        <body>
            <h1>Hello, Parsel!</h1>
            <ul>
                <li><a href="http://example.com">Link 1</a></li>
                <li><a href="http://scrapy.org">Link 2</a></li>
            </ul>
        </body>
        </html>""")

In [None]:
sel.css('h1::text').get()


### Beautiful Soup

In [None]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [None]:
from bs4 import BeautifulSoup
# Parse the sample document with the 'html.parser' backend.
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the parsed tree as indented markup.
print(soup.prettify())

In [None]:
soup.p

In [None]:
soup.p['class']

In [None]:
soup.a

In [None]:
soup.find_all('a')

In [None]:
soup.find(id="link3")

In [None]:
[ link.get('href') for link in soup.find_all('a')]

In [None]:
soup.a

In [None]:
soup.a.find_next_sibling("a")

In [None]:
soup.p

In [None]:
soup.p.find_next_sibling("p")

In [None]:
# pn: the next <p> sibling after the first <p> tag in the document.
pn=soup.p.find_next_sibling("p")
# children: the direct children of pn.
# NOTE(review): this appears to be a one-shot iterator rather than a list -- confirm for the bs4 version in use.
children = pn.children

In [None]:
children

In [None]:
# Collect everything the child iterable produces into a list so it can be indexed.
lista = list(children)
lista

In [None]:
lista[1].get('href')

In [None]:
head_tag = soup.head
head_tag

In [None]:
for child in head_tag.children:
    print(child)

In [None]:
for child in head_tag.descendants:
    print(child)

In [None]:
last_a_tag = soup.find("a", id="link3")
last_a_tag


In [None]:
last_a_tag.next_sibling

In [None]:
last_a_tag.next_element

In [None]:
last_a_tag.parent

In [None]:
def has_class_but_no_id(tag):
    """Filter predicate: true for tags that have a ``class`` attribute but no ``id``."""
    has_class = tag.has_attr('class')
    if not has_class:
        # No class attribute: the id check is skipped entirely.
        return has_class
    return not tag.has_attr('id')

soup.find_all(has_class_but_no_id)

In [None]:
soup.find_all(id='link2')

In [None]:
soup.find_all("a", class_="sister")

In [None]:
soup.find_all("a")
soup("a")

---
## Zadanie 1
Wypisać tytuły ogłoszeń z:
https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/v1c9073p1

---
## Zadanie 2
Wypisać adresy www ogłoszeń z:
https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/v1c9073p1

---