In [5]:
# Binary Data Formats

# One of the easiest ways to store data efficiently in binary format is using Python’s built
# in "pickle" serialization. Conveniently, pandas objects all have a save method which
# writes the data to disk as a pickle

import pandas as pd
import numpy as np

frame=pd.read_csv('ch06ex1.csv')
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
frame.save('ch06frame_pickle') #i did not create the ch06frame file

#You read the data back into Python with pandas.load, another pickle convenience function

frame.load('ch06frame_pickle')

# o/p:
#        a   b   c   d message
#  0  1   2   3   4   hello
#  1  5   6   7   8   world
#  2  9  10  11  12     foo

AttributeError: 'DataFrame' object has no attribute 'save'

In [None]:
# Using HDF5 Format

# There are a number of tools that facilitate efficiently reading and writing large amounts
# of scientific data in binary format on disk. A popular industry-grade library for this is
# “HDF” in HDF5 stands for hierarchical data format

# pandas has a minimal dict-like HDFStore class, which uses PyTables to store pandas
# objects

store=pd.HDFStore('mydata.h5')
store['obj1']=frame
store['obj1_col']=frame['a']
store

In [None]:
store['obj1']

In [None]:
store['obj1_col']

# HDF5 is not a database. It is best suited for write-once, read-many da
# tasets. While data can be added to a file at any time, if multiple writers
# do so simultaneously, the file can become corrupted.

In [None]:
# Reading Microsoft Excel Files

# pandas also supports reading tabular data stored in Excel 2003 (and higher) files using
# the ExcelFile class. Interally ExcelFile uses the xlrd and openpyxl packages, so you
# may have to install them first. To use ExcelFile, create an instance by passing a path
# to an xls or xlsx file

xls_file=pd.ExcelFile('data.xlsx') # does not work need to create file

#  Data stored in a sheet can then be read into DataFrame using parse

t

In [None]:
# Interacting with HTML and Web APIs

import requests

url= 'http://search.twitter.com/search.json?q=python%20pandas'
r=requests.get(url)
print(r.text)

#  The Response object’s text attribute contains the content of the GET query.

In [None]:
# Many web APIs will return a JSON string that must be loaded into a Python object

import json
data=json.loads(r.text)

data.keys()

In [7]:
# The results field in the response contains a list of tweets, each of which is represented
#  as a Python dict that looks like:
#  {u'created_at': u'Mon, 25 Jun 2012 17:50:33 +0000',
#  u'from_user': u'wesmckinn',
#  u'from_user_id': 115494880,
#  u'from_user_id_str': u'115494880',
#  u'from_user_name': u'Wes McKinney',
#  u'geo': None,
#  u'id': 217313849177686018,
#  u'id_str': u'217313849177686018',
#  u'iso_language_code': u'pt',
#  u'metadata': {u'result_type': u'recent'},
#  u'source': u'<a href="http://twitter.com/">web</a>',
#  u'text': u'Lunchtime pandas-fu http://t.co/SI70xZZQ #pydata',
#  u'to_user': None,
#  u'to_user_id': 0,
#  u'to_user_id_str': u'0',
#  u'to_user_name': None}

In [8]:
# We can then make a list of the tweet fields of interest then pass the results list to DataFrame

tweet_files=['craeted_at','from_user','id','text']
tweets=pd.DataFrame(data['results'],columns=tweets_files)
print(tweets)

KeyError: 'results'

In [None]:
# Each row in the DataFrame now has the extracted data from each tweet
tweets.ix[7]

In [None]:
#  Interacting with Databases (Just revise fro book)

In [None]:
# Storing and Loading Data in MongoDB (Just revise fro book)