In [None]:
# importing the libraries used in this notebook:
# ElementTree (standard library) for parsing XML, requests for web scraping
import xml.etree.ElementTree as ET

import requests

In [None]:
# Send an HTTP GET request and keep the response object in a variable.
# The timeout makes the call fail fast instead of hanging forever if the
# server never answers.

response = requests.get("http://www.epicurious.com/search/Tofu+Chili", timeout=10)
# Check the response status code to see if everything went as planned
# status code 200: the request-response cycle was successful
# any other status code: it didn't work (e.g., 404 = page not found)

print(response.status_code)

# Get the content of the response, decoding the raw bytes as UTF-8.
# Only the first 500 characters are displayed so the cell output stays readable.

response.content.decode('utf-8')[:500]

In [None]:
# Try Problem: Get the contents of Wikipedia's main page and look for the string "Did you know" in it

url = "http://en.wikipedia.org/wiki/main_page"

# Fetch the page; the timeout keeps the cell from hanging if the server never answers.
# (requests is already imported in the first cell of the notebook.)
my_response = requests.get(url, timeout=10)
print(my_response)

# Decode the raw bytes as UTF-8. The complete text stays in my_response1 so it
# can be searched later; only a short preview is printed to keep the output readable.
my_response1 = my_response.content.decode('utf-8')
print(my_response1[:500])

In [None]:
# checking the status
# print() is required here: a bare expression is only displayed when it is the
# last line of a cell, so the status code would otherwise be silently dropped
print(my_response.status_code)

# now the word to find
# str.find returns the index of the first occurrence, or -1 if the text is absent

my_response1.find('Did you know')

In [None]:
# Extracting data from a string that follows the JSON
# (JavaScript Object Notation) standard, using the json library

import json

# JSON text: a one-element array holding a single object
data_string = '[{"b":[2, 4], "c": 3.0, "a": "A"}]'

# json.loads decodes the text into a Python list containing one dictionary
python_data = json.loads(data_string)

# show the decoded Python object first, then the raw JSON text
for value in (python_data, data_string):
    print(value)

In [None]:
# json.loads recursively decodes JSON text into the equivalent Python objects:
#   - the outermost JSON array becomes a Python list
#   - the object inside it becomes a dictionary whose keys are strings
#   - each value is decoded in turn (here "b" holds a list of two integers
#     and "a" holds a string)

first_record = python_data[0]
b_values = first_record['b']

print(type(data_string), type(python_data))
print(type(first_record), first_record)
print(type(b_values), b_values)
print(first_record['a'])

In [None]:
# Writing the correct format when decoding JSON text

# wrong format: a bare word is not valid JSON, so json.loads raises
# json.JSONDecodeError. The exception is caught and shown so that the rest of
# the cell (and a "Run All") can continue.
try:
    json.loads("Hello")
except json.JSONDecodeError as err:
    print("Invalid JSON:", err)

# correct format: a JSON string must be wrapped in double quotes
decoded = json.loads('"Hello"')
decoded

In [None]:
# REQUESTS LIBRARY AND JSON
# Response.json() decodes a JSON response body directly into Python objects,
# so there is no need to call json.loads on the text ourselves.
# Always check for exceptions when doing this for real.

address = "Columbia University, New York, NY"
geocode_endpoint = "https://maps.googleapis.com/maps/api/geocode/json"

# Passing the address through `params` lets requests URL-encode the spaces and
# commas rather than pasting the raw text into the query string; the timeout
# stops the cell from hanging forever if the server never answers.
geocode_response = requests.get(geocode_endpoint, params={"address": address}, timeout=10)

# the fully encoded URL that was actually requested
url = geocode_response.url

my_response = geocode_response.json()
print(type(my_response))

In [None]:
# Checking for exceptions around an HTTP request and its JSON decoding

address = "Columbia University, New York, NY"
url = "https://maps.googleapis.com/maps/api/geocode/json?address=%s" % (address) # %s inserts the address into the url

# Start from None so the final print works even when the request or the
# decoding fails (otherwise the name would be undefined -> NameError).
my_response_data = None

try:
    # timeout: fail fast instead of hanging forever on an unresponsive server
    my_response = requests.get(url, timeout=10)
except requests.exceptions.RequestException as err:
    # only network/HTTP-layer failures are caught; unrelated bugs still surface
    print("Something went wrong with requests.get", err)
else:
    if my_response.status_code != 200:
        print("HTTP error", my_response.status_code)
    else:
        try:
            my_response_data = my_response.json()
        except ValueError:
            # Response.json() signals an undecodable body with a ValueError subclass
            print("Response not in valid JSON format")

print(type(my_response_data))

In [None]:
# Display the request URL currently stored in `url`
print(url)

In [None]:
# XML (Extensible Markup Language). The data is organised as a tree, much like a family tree
data_string = """
<Bookstore>
   <Book ISBN="ISBN-13:978-1599620787" Price="15.23" Weight="1.5">
      <Title>New York Deco</Title>
      <Authors>
         <Author Residence="New York City">
            <First_Name>Richard</First_Name>
            <Last_Name>Berenholtz</Last_Name>
         </Author>
      </Authors>
   </Book>
   <Book ISBN="ISBN-13:978-1579128562" Price="15.80">
      <Remark>
      Five Hundred Buildings of New York and over one million other books are available for Amazon Kindle.
      </Remark>
      <Title>Five Hundred Buildings of New York</Title>
      <Authors>
         <Author Residence="Beijing">
            <First_Name>Bill</First_Name>
            <Last_Name>Harris</Last_Name>
         </Author>
         <Author Residence="New York City">
            <First_Name>Jorg</First_Name>
            <Last_Name>Brockmann</Last_Name>
         </Author>
      </Authors>
   </Book>
</Bookstore>
"""

# Parse the XML text into an element tree. `root` is the outermost element
# (<Bookstore>) and is the starting point for every tree query that follows.
root = ET.fromstring(data_string)

In [None]:
# Walking an XML tree with an iterator:
# root.iter() yields every element of the subtree that starts at root

for node in root.iter():
    print(node)

In [None]:
# Alternatively, loop over an element directly to visit only its immediate children

for book in root:
    print(book)

In [None]:
# Reading the tag name of each direct child of root

for book in root:
    tag_name = book.tag
    print(tag_name)

In [None]:
# Restricting the iterator to specific tags
    # Here only the Author elements are visited
    # For each author, find() fetches the First_Name and Last_Name child elements
    # (with a plain tag name, find() searches direct children only, not deeper descendants)
    # The .text attribute holds the text stored in a leaf node

for author in root.iter("Author"):
    first_name = author.find('First_Name')
    last_name = author.find('Last_Name')
    print(first_name.text, last_name.text)

In [None]:
# Problem: Print the first and last names of all authors in the tree "root" using xpath and findall function

# Select every Author element by its path, then read both name children from
# the same author so that the first and last name are printed together.
for author in root.findall("Book/Authors/Author"):
    print(author.find("First_Name").text, author.find("Last_Name").text)

In [None]:
# Searching by attribute value
    # Attribute values can be used as filters inside the path
    # Example: Find the first name of the author of a book that weighs 1.5kg

first_name_node = root.find('Book[@Weight = "1.5"]/Authors/Author/First_Name')
first_name_node.text

In [None]:
# Problem 3: Print the first and last name of all authors who live in New York City

# Filter the Author elements once, then read both names from the same element.
# Pairing two separate findall() results with zip() would silently mismatch
# names if any author were missing a First_Name or Last_Name child.
for author in root.findall("Book/Authors/Author[@Residence = 'New York City']"):
    print(author.find("First_Name").text + ' ' + author.find("Last_Name").text)