# Data Loading, Storage, 

In [4]:
import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

## Reading and Writing Data in Text Format

pg 167 has a list of different files types that can be read.

## Mechanics for processing data

- Indexing: can treat one or more columns as the returned dataframe, and whether to get column names from the file, the user, or not at all
- Type inference and data conversion: includes user-defined value conversions and custom list of missing value markers
- Datetime parsing: can combine data from multiple colums in to one (like dates spread in to DD-MM-YYYY columns.
- Iterating: iterating over chunks of very large files
- Unclean data issues: skipping rows or a footer, comments, or other things like numbers with commas in them (1,000,000)


In [8]:
# we can preview a file using !cat
!cat examples/ex1.csv

#the below is clearly delimited by commas.

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

Reading csv files has become quite complicated over the years... or it can be. .read_csv has over 50 parameters that can be used for importing its data to a pd DataFrame. Most may never see use, but it's helpful to know how to find them.

read_csv also has a feature called type inference, it will import columns as the direct data types (provided there are no outlying values to force the column to be objects... like "no value" or "anyotherword" instead of NaN in a column of numbers.

In [9]:
df = pd.read_csv('examples/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [10]:
# using read_table also lets us set the delimiter. It won't always be a ','. It may sometimes be a ; or some other 
# character.

pd.read_table('examples/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [11]:
# not all files have header rows

!cat examples/ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [12]:
# if notice that in our first glance, we can read csv and tell it not to set a header.

pd.read_csv('examples/ex2.csv', header=None)

# or we can set the header column names by setting the 'names' parameter = []
pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [13]:
names = ['a', 'b', 'c', 'd', 'message']
# maybe we're using one column as our index. Great when the columns have names, no chance of confusing the column
# for an index #.

# we can set the index column when reading the file
pd.read_csv('examples/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [15]:
# message is now our list of index names!


# sometimes we want hierarchical indexes... say we want to split up individual years as indexes and then 
# show each month as the next index
# 1999. Jan
#       Feb
#       Mar....
# 2000  Jan
#       Feb...

# we can do that by passing a list of columns to be used as the indexes.

!cat examples/csv_mindex.csv
parsed = pd.read_csv('examples/csv_mindex.csv',
                     index_col=['key1', 'key2'])
parsed

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [16]:
# values aren't always delimited by commas, sometimes a mix of things are used like commas and white space.

list(open('examples/ex3.txt'))

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [17]:
# our friends regular expressions can be of use here. we see there is a variable amount of white space, so let's
# use regex to our advantage when parsing the data.

result = pd.read_table('examples/ex3.txt', sep='\s+')
result

# because there is one fewer column name than columns available, the parser used the first column as indexes.

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [19]:
!cat examples/ex4.csv


# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [20]:
# some of those rows are absolutely ridiculous. let's ignore them.

pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [22]:
# no more lines causing a ruckus. it's the data your parents hope you meet at church.

!cat examples/ex5.csv


# when reading in data, various values can represent null values. The default ones the parser checks for
# are called 'sentinel' values. They include: NA and NULL and others.

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [23]:
result = pd.read_csv('examples/ex5.csv')
result
pd.isnull(result)

# below we can see it catching the blank spot in index 'two' and the NA in the message column.

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [25]:
# our data won't always have things covered by sentinel values. For these situations we can use the
# na_values parameter. na_values=['list', 'of', 'what', 'our' 'data', 'calls', 'null']

result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [29]:
# maybe these values are different by column or there is some weird overlap.
# we can set these na_values as a dict as well. key(column): ['the', 'sentinels']

# looks for 'foo' and 'NA' in the message column and knows to consider both None
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


pg 172-173 has a list of parameters useful for importing data

### Reading Text Files in Pieces

In [30]:
# when reading large data sets we sometimes only want to view a piece of it.
# the below can set the default number of rows to view.

pd.options.display.max_rows = 10

In [32]:
result = pd.read_csv('examples/ex6.csv')
result.describe()

Unnamed: 0,one,two,three,four
count,10000.0,10000.0,10000.0,10000.0
mean,0.04575,0.000871,-0.026463,0.015985
std,0.948825,1.003829,1.037273,0.982409
min,-3.726864,-3.465356,-3.234391,-3.173509
25%,-0.618617,-0.706643,-0.727791,-0.676291
50%,0.041638,0.018972,-0.03234,-0.005338
75%,0.701536,0.708405,0.626904,0.659369
max,2.833891,2.946737,3.053345,3.412734


In [33]:
result = pd.read_csv('examples/ex6.csv')
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G


In [34]:
# we can set the number of rows to view using nrows as well

pd.read_csv('examples/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [37]:
# it can also be useful to read the data in chunks. chunksize helps us do this. 

chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)
chunker

<pandas.io.parsers.readers.TextFileReader at 0x1292bd990>

In [38]:
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)

tot = pd.Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)

  tot = pd.Series([])


In [39]:
tot[:10]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

### Writing Data to Text Format

In [40]:
# we can export data to a delimited format

data = pd.read_csv('examples/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [42]:
# .to_csv writes to a file with comma separated values

data.to_csv('examples/out.csv')
!cat examples/out.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [43]:
# we can also manually set a delimiter. sys.stdout just shows us the result.
import sys
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [44]:
# missing values will appear as empty strings. we can change the default
# representation of missing values when writing the file.

# this sets missing values to 'NULL'
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [46]:
# index & columns are auto-set. We can keep the writer from doing this by setting index and/or header = False
# this way it is only the data without the header & indexes.

data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [47]:
# it is also not mandatory to write ALL of the data. perhaps we only want to export columns 'a' 'b' & 'c'

data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


In [48]:
# writing to_csv.. not just for DataFrames
# we can write series to files, too!

dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('examples/tseries.csv')
!cat examples/tseries.csv

,0
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


### Working with Delimited Formats

In [49]:
# when files have a single character delimiter we can use the normal read_csv method.
# but life isn't always so simple....

!cat examples/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"


In [51]:
# Mr Frodo, do you remember the Shire? ;_;

import csv
f = open('examples/ex7.csv')

reader = csv.reader(f)

In [52]:
# iterating through the data like it were a file, line by line, helps remove some of the ''s.

for line in reader:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [53]:
# read the file as a list of lines

with open('examples/ex7.csv') as f:
    lines = list(csv.reader(f))

In [56]:
# let's save where the header & values are. [0] is the header, [1:] 1 to the end is the rest of our rows.

header, values = lines[0], lines[1:]

In [57]:
# oh hi neighbor list comprehension! what do you have for us today?

data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [60]:
# sometimes we want to set all these values ahead of time for repeated use.
# We can set a subclass of csv.Dialect to store our preferred lineterminator, delimiter, quotechar, and quoting.

class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL

In [62]:
# we don't have any file opened as 'f', but it helps to see this written as code.

reader = csv.reader(f, dialect=my_dialect)

ValueError: I/O operation on closed file.

In [63]:
reader = csv.reader(f, delimiter='|')

ValueError: I/O operation on closed file.

In [64]:
with open('mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))

pgs 177-178 have more on csv dialect options.

### JSON Data

In [67]:
# setting up data for JSON format.

obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [68]:
# JavaScript object notation
# one of the standards for sending information via HTTP.
# structurally it is close to Python dicts.

import json
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [69]:
asjson = json.dumps(result)

In [72]:
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings

# keys can become callable column names. here we took the obj data and let the results be siblings, looking at 
# their name's & age

Unnamed: 0,name,age
0,Scott,30
1,Katie,38


In [74]:
# By using pandas.read_json(), data can be converted to series or pd DataFrames.

!cat examples/example.json

[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]


In [77]:
# it is assumed that each object in the JSON file is a row.
# more of this will be seen in Ch. 7

data = pd.read_json('examples/example.json')
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [76]:
print(data.to_json())
print(data.to_json(orient='records'))

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


### XML and HTML: Web Scraping

In [108]:
conda install -c anaconda lxml
# pip install beautifulsoup4 html5lib

SyntaxError: invalid syntax (3883926208.py, line 1)

In [80]:
# several libraries are available for reading & writing HTML via Python

import sys
!{sys.executable} -m pip install beautifulsoup4 html5lib 

You should consider upgrading via the '/opt/homebrew/Cellar/jupyterlab/3.4.3/libexec/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

In [81]:
import sys
!{sys.executable} -m pip install lxml

You should consider upgrading via the '/opt/homebrew/Cellar/jupyterlab/3.4.3/libexec/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

In [79]:
# when reading html, .read_html tries to parse data between <Table> tags.

tables = pd.read_html('examples/fdic_failed_bank_list.html')
len(tables)
failures = tables[0]
failures.head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


In [82]:
# plenty of banks closed. let's search how many for each year. Because 'Closing Date' was parsed as date time, 
# this is easily doable.

close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()

2010    157
2009    140
2011     92
2012     51
2008     25
       ... 
2004      4
2001      4
2007      3
2003      3
2000      2
Name: Closing Date, Length: 15, dtype: int64

#### Parsing XML with lxml.objectify

<INDICATOR>
  <INDICATOR_SEQ>373889</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
  <DESCRIPTION>Percent of the time that escalators are operational
  systemwide. The availability rate is based on physical observations performed
  the morning of regular business days only. This is a new indicator the agency
  began reporting in 2009.</DESCRIPTION>
  <PERIOD_YEAR>2011</PERIOD_YEAR>
  <PERIOD_MONTH>12</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>97.00</YTD_TARGET>
  <YTD_ACTUAL></YTD_ACTUAL>
  <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
  <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>

In [84]:
# xml (eXtensible Markup Language); another common structured ata format supporting nested data w/ metadata
# yupparently the book we're working through is a collection of large XML docs.

from lxml import objectify

path = 'datasets/mta_perf/Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()

In [87]:
data = []

skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']
# for searching through tags. assigns a tag as a key and then its corresponding value.
# we skipped some fields, if those tags are found we skip adding them to our data set.

for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)

In [88]:
perf = pd.DataFrame(data)
perf.head()

Unnamed: 0,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,INDICATOR_UNIT,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,1,Service Indicators,M,%,95.0,96.9,95.0,96.9
1,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,2,Service Indicators,M,%,95.0,96.0,95.0,95.0
2,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,3,Service Indicators,M,%,95.0,96.3,95.0,96.9
3,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,4,Service Indicators,M,%,95.0,96.8,95.0,98.3
4,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,5,Service Indicators,M,%,95.0,96.6,95.0,95.8


In [89]:
# we got tags, sure, but each tag may also have metadata.

from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()

In [92]:
# we can now access other html fields. we've parsed the string and can get the root of the tag, the text of the tag
# or the whole thing at once

root

<Element a at 0x12f78fd00>

In [93]:
root.get('href')


'http://www.google.com'

In [94]:
root.text

'Google'

## Binary Data Formats

In [95]:
# pickles are great for storing data in binary
# pickle pickle pickle

# pickle

# all Pandas objects have a to_pickle method.

frame = pd.read_csv('examples/ex1.csv')
frame
frame.to_pickle('examples/frame_pickle')

In [96]:
pd.read_pickle('examples/frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [98]:
!rm examples/frame_pickle

rm: examples/frame_pickle: No such file or directory


### Using HDF5 Format

- Useful for storing large datasets as it does on-the-fly compression.
- Interfaces well with many libraries (such as; Java, Julia, MATLAB, and Python)

In [1]:
conda install lxml

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/nick/opt/anaconda3

  added / updated specs:
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.13.0               |   py39hecd8cb5_0         906 KB
    ------------------------------------------------------------
                                           Total:         906 KB

The following packages will be UPDATED:

  conda                               4.12.0-py39hecd8cb5_0 --> 4.13.0-py39hecd8cb5_0



Downloading and Extracting Packages
conda-4.13.0         | 906 KB    | ##################################### | 100% 
Preparing transaction: done
  environment location: /Users/nick/.conda/environments.txt

done
  environment location: /Users/nick/opt/anaconda3
  registry file: /Users/nick/.conda/environments.txt
done

Note: you may ne

In [19]:
import os

In [2]:
import sys
!{sys.executable} -m pip install tables



In [21]:
frame = pd.DataFrame({'a': np.random.randn(100)})
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

In [22]:
# objects in HDF5 files can retrieve objects in the same way as other methods. 
store['obj1']

Unnamed: 0,a
0,-0.761837
1,-0.331617
2,-1.751315
3,0.628894
4,0.282502
...,...
95,-0.126072
96,0.398205
97,0.141638
98,-0.264141


In [23]:
store.put('obj2', frame, format='table')
store.select('obj2', where=['index >= 10 and index <= 15'])
store.close()

In [24]:
frame.to_hdf('mydata.h5', 'obj3', format='table')
pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])

ValueError: The file 'mydata.h5' is already opened, but not in read-only mode (as requested).

In [25]:
os.remove('mydata.h5')

### Reading Microsoft Excel Files

In [27]:
# to begin working w/ an excel file we create an instance of the file

xlsx = pd.ExcelFile('examples/ex1.xlsx')

In [28]:
# data can then be read and parsed in to a DataFrame

pd.read_excel(xlsx, 'Sheet1')

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [29]:
frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
frame

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [None]:
writer = pd.ExcelWriter('examples/ex2.xlsx')
frame.to_excel(writer, 'Sheet1')
writer.save()

In [None]:
frame.to_excel('examples/ex2.xlsx')

In [None]:
!rm examples/ex2.xlsx

## Interacting with Web APIs

In [None]:
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url)
resp

In [None]:
data = resp.json()
data[0]['title']

In [None]:
issues = pd.DataFrame(data, columns=['number', 'title',
                                     'labels', 'state'])
issues

## Interacting with Databases

In [None]:
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""
con = sqlite3.connect('mydata.sqlite')
con.execute(query)
con.commit()

In [None]:
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

In [None]:
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

In [None]:
cursor.description
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

In [None]:
import sqlalchemy as sqla
db = sqla.create_engine('sqlite:///mydata.sqlite')
pd.read_sql('select * from test', db)

In [None]:
!rm mydata.sqlite

## Conclusion