In [None]:
#  Reading and Writing Data in Text Format

# pandas features a number of functions for reading tabular data as a DataFrame object.
# Table 6-1 has a summary of all of them, though read_csv and read_table are likely the
# ones you’ll use the most.

In [1]:
# Type inference is one of the more important features of these functions; that means you
# don’t have to specify which columns are numeric, integer, boolean, or string. Handling
# dates and other custom types requires a bit more effort, though

import numpy as np
import pandas as pd

df = pd.read_csv('ch06ex1.csv') 
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [2]:
# We could also have used read_table and specifying the delimiter
pd.read_table('ch06ex1.csv',sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [3]:
pd.read_csv('ch06ex2.csv',header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
#  To read this in, you have a couple of options. You can allow pandas to assign default
#  column names, or you can specify names yourself

pd.read_csv('ch06ex2.csv',names=['a','b','c','d','message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
# Suppose you wanted the message column to be the index of the returned DataFrame.
#  You can either indicate you want the column at index 4 or named 'message' using the
#  index_col argument

names=['a','b','c','d','message']

pd.read_csv('ch06ex2.csv',names=names,index_col='message')


Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [6]:
#  In the event that you want to form a hierarchical index from multiple columns, just
#  pass a list of column numbers or names:
#  In [855]: !cat ch06/csv_mindex.csv
#  key1,key2,value1,value2
#  one,a,1,2
#  one,b,3,4
#  one,c,5,6
#  one,d,7,8
#  two,a,9,10
#  two,b,11,12
#  two,c,13,14
#  two,d,15,16

In [7]:
parse=pd.read_csv('ch06csv_mindex.csv',index_col=['key1','key2'])
parse

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [8]:
#  In some cases, a table might not have a fixed delimiter, using whitespace or some other
#  pattern to separate fields. In these cases, you can pass a regular expression as a delimiter
#  for read_table.

result=pd.read_csv('ch06ex3.csv',sep='\s+')
result

Unnamed: 0,[',A,B,"C\n',"
0,'aaa,-0.264438,-1.026059,"-0.619500\n',"
1,'bbb,0.927272,0.302904,"-0.032399\n',"
2,'ccc,-0.264273,-0.386314,"-0.217601\n',"
3,'ddd,-0.871858,-0.348382,1.100491\n']


In [9]:
# The parser functions have many additional arguments to help you handle the wide
#  variety of exception file formats that occur (see Table 6-2). For example, you can skip
#  the first, third, and fourth rows of a file with skiprows

print(pd.read_csv('ch06ex4.csv'))

pd.read_csv('ch06ex4.csv',skiprows=[0,2,3])

                                                                      # hey!
 a                                                 b        c   d    message
 # just wanted to make things more difficult fo... NaN      NaN NaN      NaN
 # who reads CSV files with computers               anyway? NaN NaN      NaN
 1                                                 2        3   4      hello
 5                                                 6        7   8      world
 9                                                 10       11  12       foo


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [10]:
result=pd.read_csv('ch06ex5.csv')
result

Unnamed: 0.1,Unnamed: 0,something,a,b,c,d,message
0,0,one,1,2,3.0,4,
1,1,two,5,6,,8,world
2,2,three,9,10,11.0,12,foo


In [11]:
pd.isnull(result)

Unnamed: 0.1,Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,False,True
1,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False


In [12]:
# The na_values option can take either a list or set of strings to consider missing values

result=pd.read_csv('ch06ex5.csv',na_values=['NULL'])
result

Unnamed: 0.1,Unnamed: 0,something,a,b,c,d,message
0,0,one,1,2,3.0,4,
1,1,two,5,6,,8,world
2,2,three,9,10,11.0,12,foo


In [13]:
# Different NA sentinels can be specified for each column in a dict

sentinels={
    'message':['foo','NA'],
    'something':['two']
}
pd.read_csv('ch06ex5.csv',na_values=sentinels)

Unnamed: 0.1,Unnamed: 0,something,a,b,c,d,message
0,0,one,1,2,3.0,4,
1,1,,5,6,,8,world
2,2,three,9,10,11.0,12,


In [14]:
# Table 6-2. read_csv /read_table function arguments

# page: 160 in python for data science book

In [15]:
# Reading Text Files in Pieces

# When processing very large files or figuring out the right set of arguments to correctly
#  process a large file, you may only want to read in a small piece of a file or iterate through
#  smaller chunks of the file.

In [16]:
# ! ! ! ! ! Read the topic in the book -> Reading Text Files in Pieces

In [17]:
#  Writing Data Out to Text Format
# Data can also be exported to delimited format. Let’s consider one of the CSV files read above

data=pd.read_csv('ch06ex5.csv')
data

Unnamed: 0.1,Unnamed: 0,something,a,b,c,d,message
0,0,one,1,2,3.0,4,
1,1,two,5,6,,8,world
2,2,three,9,10,11.0,12,foo


In [18]:
# Using DataFrame’s 'to_csv' method, we can write the data out to a comma-separated file

data.to_csv('ch06ex5.csv')

In [19]:
# Other delimiters can be used, of course (writing to sys.stdout so it just prints the text
#  result)
data.to_csv(sys.stdout,sep='|')

NameError: name 'sys' is not defined

In [None]:
# Missing values appear as empty strings in the output. You might want to denote them
#  by some other sentinel value:
data.to_csv(sys.stdout, na_rep='NULL')

In [None]:
# With no other options specified, both the row and column labels are written. Both of
#  these can be disabled

data.to_csv(sys.stdout, index=False, header=False)

In [None]:
#  You can also write only a subset of the columns
data.to_csv(sys.stdout,index=False,cols=['a', 'b', 'c'])

In [20]:
# Series also has a to_csv method
from pandas import Series,DataFrame
dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int32

In [21]:
# ts.to_csv('ch06/tseries.csv')
#  2000-01-01 00:00:00,0
#  2000-01-02 00:00:00,1
#  2000-01-03 00:00:00,2
#  2000-01-04 00:00:00,3
#  2000-01-05 00:00:00,4
#  2000-01-06 00:00:00,5
#  2000-01-07 00:00:00,6

In [22]:
# With a bit of wrangling (no header, first column as index), you can read a CSV version
#  of a Series with read_csv, but there is also a from_csv convenience method that makes
#  it a bit simpler

Series.from_csv('ch06/tseries.csv', parse_dates=True)

AttributeError: type object 'Series' has no attribute 'from_csv'

In [23]:
# Manually Working with Delimited Formats

#  Most forms of tabular data can be loaded from disk using functions like pan
#  das.read_table. In some cases, however, some manual processing may be necessary.
#  It’s not uncommon to receive a file with one or more malformed lines that trip up 
# read_table. To illustrate the basic tools, consider a small CSV file:

In [24]:
import csv
f=open('ch06ex7.csv')
reader=csv.reader(f)

for line in reader:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']


In [25]:
#  From there, it’s up to you to do the wrangling necessary to put the data in the form
#  that you need it. For example

lines=list(csv.reader(open('ch06ex7.csv')))

header,values=lines[0],lines[1:]

data_dict={h: v for h, v in zip(header, zip(*values))}
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [26]:
# CSV files come in many different flavors. Defining a new format with a different de
# limiter, string quoting convention, or line terminator is done by defining a simple sub
# class of csv.Dialect

class my_dialect(csv.Dialect):
    lineterminator='\n'
    delimiter=';'
    quotechar='""'
    
reader = csv.reader(f, dialect=my_dialect)

#  Individual CSV dialect parameters can also be given as keywords to csv.reader without
#  having to define a subclass

reader = csv.reader(f, delimiter='|')

#  The possible options (attributes of csv.Dialect) and what they do can be found in
#  Table 6-3. in page:164

TypeError: "quotechar" must be a 1-character string

In [27]:
# To write delimited files manually, you can use csv.writer. It accepts an open, writable
#  file object and the same dialect and format options as csv.reader

with open('mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))

TypeError: "quotechar" must be a 1-character string

In [28]:
#  JSON Data

obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
            {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""

In [29]:
# There are several Python libraries for reading and
#  writing JSON data. I’ll use 'json' here as it is built into the Python standard library. To
#  convert a JSON string to Python form, use json.loads

import json

result=json.loads(obj)
result


{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}

In [30]:
# json.dumps on the other hand converts a Python object back to JSON

asjson=json.dumps(result)
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Katie", "age": 33, "pet": "Cisco"}]}'

In [32]:
siblings=DataFrame(result['siblings'],columns=['name','age'])
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33
