# Requests in Python 
## Refer to https://requests.readthedocs.io

In [None]:
import requests

In [None]:
# Fetch the page with an HTTP GET and keep the returned response object in `r`.
r = requests.get('http://info.cern.ch/hypertext/WWW/TheProject.html')
# A bare name on the last line of a cell makes the notebook display it.
r

In [None]:
# URL that this response belongs to.
r.url

In [None]:
# Headers that came back with the response.
r.headers

In [None]:
# Cookies that came back with the response.
r.cookies

In [None]:
# r.content holds the same body as raw bytes; r.text is the string form.
r.text

# Parsing with BeautifulSoup
### https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the response body into a BeautifulSoup tree using Python's built-in
# "html.parser" backend (no extra dependency needed).
soup = BeautifulSoup(r.text,'html.parser')

In [None]:
# The <title> tag of the parsed document.
soup.title

In [None]:
# Print the parsed document as a formatted string, which is easier to read.
print(soup.prettify())

In [None]:
# soup.a is shorthand for the same lookup: a single <a> tag from the document.
soup.find('a')

In [None]:
# Collect every <a> tag in the document rather than just one.
soup.find_all('a')

### Extract all the text 

In [None]:
# Text content of the whole document, without the markup.
soup.get_text()

### Extract all the URLs

In [None]:
# Gather the anchors first, then print the href attribute of each one.
# The loop variable stays named `link` because later cells reuse it.
anchors = soup.find_all('a')
for link in anchors:
    href = link.get('href')
    print(href)

### Each BeautifulSoup object is itself a tree structure

In [None]:
# `link` is the loop variable left over from the earlier href loop, i.e. the
# last <a> tag that loop visited. Print each child node of that tag.
for child in link.children:
    print(child)

In [None]:
# Walk upward from `link` (the last <a> tag from the earlier href loop)
# through its enclosing elements, printing the tag name of each.
for parent in link.parents:
    print(parent.name)

### Attributes

In [None]:
# Pick the third <a> tag (index 2) as the example tag to inspect.
tag = soup.find_all('a')[2]

In [None]:
# All attributes carried by the tag.
tag.attrs

In [None]:
# The tag's own name; `tag` came from find_all('a'), so this is an anchor.
tag.name

In [None]:
# Index-style access to the tag's "name" attribute. tag.get('name') is the
# alternative spelling (presumably returning None instead of raising when the
# attribute is absent — confirm against the bs4 docs).
tag['name']

# Parsing with lxml
### https://lxml.de/tutorial.html
<h3>
    
* Python’s html.parser	`BeautifulSoup(markup, "html.parser")`
    * Batteries included
    * Decent speed
    * Not as fast as lxml

    
    
* lxml’s HTML parser	`BeautifulSoup(markup, "lxml")`
    * Very fast
    * External C dependency


# Advanced use of requests

### Pass parameters along with your GET requests. They take the form of query strings


In [None]:
# Parameters handed to `params` are sent as the query string of the GET.
payload = dict(user_name='admin', password='password')
r = requests.get('http://httpbin.org/get', params=payload)

# Show the request URL first, then the response body, one per line.
print(r.url, r.text, sep='\n')

## Making POST Requests
### HTTP POST requests are the opposite of GET requests: they are meant for sending data to a server rather than retrieving it. However, a POST request can still receive data in the response, just like a GET request.

In [None]:
import requests

# Fields handed to `data` are sent with the POST request itself.
endpoint = "http://httpbin.org/post"
payload = dict(user_name='admin', password='password')
r = requests.post(endpoint, data=payload)

# Show the request URL first, then the response body, one per line.
print(r.url, r.text, sep='\n')

## Handling Redirections
### Redirection in HTTP means forwarding the network request to a different URL. For example, if we make a request to "http://www.github.com", it will redirect to "https://github.com" using a 301 redirect.

In [None]:
import requests

# Use a plain GET for the redirect demo. The original issued an empty POST,
# which Requests rewrites to GET when it follows a redirect, so the verb was
# misleading for an example that only wants to show the redirect chain.
r = requests.get("http://www.github.com")
print(r.url)          # URL the request finally ended up at
print(r.history)      # responses produced along the way by the redirects
print(r.status_code)  # status code of the final response

### The redirection process is automatically handled by requests, so you don't need to deal with it yourself. The history property contains the list of all response objects created to complete the redirection. In our example, two Response objects were created with the 301 response code. HTTP 301 and 302 responses are used for permanent and temporary redirection, respectively.

### If you don't want the Requests library to automatically follow redirects, then you can disable it by passing the allow_redirects=False parameter along with the request.

## Using cookies
### Cookies are returned in a RequestsCookieJar, which acts like a dict but also offers a more complete interface, suitable for use over multiple domains or paths

In [None]:
# Send one cookie along with the request by passing a plain mapping.
url = 'https://httpbin.org/cookies'
cookies = {'cookies_are': 'working'}

r = requests.get(url, cookies=cookies)
# Print the response body.
print(r.text)

### Cookie jars can also be passed in to requests:

In [None]:
# Build a cookie jar by hand; each cookie is registered with its own domain
# and path.
jar = requests.cookies.RequestsCookieJar()
jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
url = 'https://httpbin.org/cookies'
# NOTE(review): presumably only the cookie whose path matches the requested
# URL is sent — confirm against the printed response.
r = requests.get(url, cookies=jar)
print(r.text)

## Timeouts
### You can tell Requests to stop waiting for a response after a given number of seconds with the timeout parameter. Nearly all production code should use this parameter in nearly all requests. Failure to do so can cause your program to hang indefinitely:

In [None]:
# A 1 ms timeout is far too short for a real round trip, so this call is
# presumably meant to fail with a timeout error and show what that looks like.
requests.get('https://github.com/', timeout=0.001)

In [None]:
# Same request with a 1-second timeout, for comparison with the 1 ms attempt.
requests.get('https://github.com/', timeout=1)