# Importing flat files from the web

In [2]:
import pandas as pd
from urllib.request import urlretrieve, urlopen, Request
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import tweepy

## The urllib package
* Provides interface for fetching data across the web
* urlopen() - accepts URL's instead of file names

## Importin files from web and saving locally

In [3]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
urlretrieve(url, 'winequality-white.csv')

('winequality-white.csv', <http.client.HTTPMessage at 0x7ff3375feaf0>)

In [4]:
df = pd.read_csv('winequality-white.csv', sep=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


## Importing files from web without saving locally

In [5]:
url  = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
df_not_saved_locally = pd.read_csv(url, sep= ';')
df_not_saved_locally.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Importing non-flat files from the web

In [6]:
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'
xls = pd.read_excel(url, sheet_name=None)
xls.keys()

dict_keys(['1700', '1900'])

In [7]:
xls['1700'].head()

Unnamed: 0,country,1700
0,Afghanistan,34.565
1,Akrotiri and Dhekelia,34.616667
2,Albania,41.312
3,Algeria,36.72
4,American Samoa,-14.307


# HTTP request to import files from the web

## GET requests using urllib

In [8]:
url = 'https://www.wikipedia.org/'
request = Request(url) # Package GET request using Request function
response = urlopen(request) # send GET request and catch response using urlopen(); returns HTTP response object which has a read method
html = response.read() # apply read() method to response to get html as string
response.close() # close response

## GET requests using Requests package

In [9]:
url = 'https://www.wikipedia.org/'
r = requests.get(url) # package request, send request and catch response with a single function
text = r.text # returns HTML as string by applying .text method

# Scraping the web in Python

## BeautifulSoup

In [10]:
url = 'https://www.crummy.com/software/BeautifulSoup/' # assign URL
r = requests.get(url) # package request to URL, send request and catch response
html_doc = r.text # use text attribute on object to return the HTML of the webpage as a string
soup = BeautifulSoup(html_doc) # create BeautifulSoup object
soup
#pretty_soup = soup.prettify()
#pretty_soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">

<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link href="mailto:leonardr@segfault.org" rev="made"/>
<link href="/nb/themes/Default/nb.css" rel="stylesheet" type="text/css"/>
<meta content="Beautiful Soup: a library designed for screen-scraping HTML and XML." name="Description"/>
<meta content="Markov Approximation 1.4 (module: leonardr)" name="generator"/>
<meta content="Leonard Richardson" name="author"/>
</head>
<body alink="red" bgcolor="white" link="blue" text="black" vlink="660066">
<style>
#tidelift { }

#tidelift a {
 border: 1px solid #666666;
 margin-left: auto;
 padding: 10px;
 text-decoration: none;
}

#tidelift .cta {
 background: url("tidelift.svg") no-repeat;
 padding-left: 30px;
}
</style>
<img align="right" src="10.1.jpg" width="250"/><br/>
<p>[

## Getting the title from webpage


In [11]:
webpage_title = soup.title
webpage_title

<title>Beautiful Soup: We called him Tortoise because he taught us.</title>

## Getting the text from webpage

In [12]:
webpage_text = soup.get_text()
webpage_text

'\n\n\n\nBeautiful Soup: We called him Tortoise because he taught us.\n\n\n\n\n\n\n\n\n\n[ Download | Documentation | Hall of Fame | For enterprise | Source | Changelog | Discussion group  | Zine ]\n\nBeautiful Soup\n\nYou didn\'t write that awful page. You\'re just trying to get some\ndata out of it. Beautiful Soup is here to help. Since 2004, it\'s been\nsaving programmers hours or days of work on quick-turnaround\nscreen scraping projects.\nBeautiful Soup is a Python library designed for quick turnaround\nprojects like screen-scraping. Three features make it powerful:\n\n\nBeautiful Soup provides a few simple methods and Pythonic idioms\nfor navigating, searching, and modifying a parse tree: a toolkit for\ndissecting a document and extracting what you need. It doesn\'t take\nmuch code to write an application\n\nBeautiful Soup automatically converts incoming documents to\nUnicode and outgoing documents to UTF-8. You don\'t have to think\nabout encodings, unless the document doesn\'t 

## Getting the hyperlinks

In [13]:
a_tags = soup.find_all('a') # passing the HTML tag we want to find to find_all() method
for link in a_tags:
    print(link.get('href')) # print the actual URL's for the hyperlinks

#Download
bs4/doc/
#HallOfFame
enterprise.html
https://code.launchpad.net/beautifulsoup
https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG
https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
zine/
bs4/download/
http://lxml.de/
http://code.google.com/p/html5lib/
bs4/doc/
https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=enterprise
https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
https://bugs.launchpad.net/beautifulsoup/
https://tidelift.com/security
https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=website
zine/
None
bs4/download/
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
download/3.x/BeautifulSoup-3.2.2.tar.gz
https://tidelift.com/subscription/pkg/pypi-beautifulsoup?utm_source=pypi-beautifulsoup&utm_medium=referral&utm_campaign=website
None
http://www.nytimes.co

# Intro to APIs and JSONs

* API: Application Programming Interface
* APIs are used for building and interacting with software applications

## Loading JSONs in Python

In [14]:
with open('/Users/joseservin/DataCamp/Courses/Intermediate_Importing_Data/json_ex.json', 'r') as json_file:
    json_data = json.load(json_file)

In [15]:
type(json_data)

dict

## Exploring JSONs in Python

In [16]:
for key, value in json_data.items():
    print(key + ":" + str(value))

name:John
age:30
car:None


## Connecting to an API in Python

* APIs allow softwares to communicate with each other

### Process

* define URL variable
* Package and send URL query and save to response variable
* use .json() method on response variable to return dictionary
* explore json

In [17]:
url = "http://www.omdbapi.com/?t=Joker&apikey=3628e5f1" #?t=hackers is called the Query string
r = requests.get(url)
json_data = r.json()
for key, value in json_data.items():
    print(key + ":" + value)

Title:Joker
Year:2019
Rated:R
Released:04 Oct 2019
Runtime:122 min
Genre:Crime, Drama, Thriller
Director:Todd Phillips
Writer:Todd Phillips, Scott Silver, Bob Kane
Actors:Joaquin Phoenix, Robert De Niro, Zazie Beetz
Plot:In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and mistreated by society. He then embarks on a downward spiral of revolution and bloody crime. This path brings him face-to-face with his alter-ego: the Joker.
Language:English
Country:United States, Canada
Awards:Won 2 Oscars. 122 wins & 240 nominations total
Poster:https://m.media-amazon.com/images/M/MV5BNGVjNWI4ZGUtNzE0MS00YTJmLWE0ZDctN2ZiYTk2YmI3NTYyXkEyXkFqcGdeQXVyMTkxNjUyNQ@@._V1_SX300.jpg


TypeError: can only concatenate str (not "list") to str

## Exploring the Wikipedia API

In [None]:
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=pizza'
r = requests.get(url)
json_data = r.json()
for key in json_data.keys():
    print(key)
json_data['query']['pages']['24768']['extract']

# The Twitter API and authentication

* 1640014718-ywUwnpfPENiOZLKjJkjRg1WbYISyIh13Vb6j38f (access token)
* 6iiqXDteEzBbl6LuYgJpQdNnv15KXdxOTcfoWfzY82MpO (access token secret)
* ej89OqxCl3vAYd7OPdCYKI45z (API consumer key)
* Yga9J9ODeXXLmIlMsmS3dqmQCMF3drABdI0nLmWbmfudn6Y9Z7 (API consumer secret)