# Data input and output

In [14]:
import pandas as pd

## View data

Command line

In [11]:
# For simple plain text files
!cat ../data/examples/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


## Read data

### Read CSV

#### Default settings

In [12]:
# View
!cat ../data/examples/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [15]:
# Read
df = pd.read_csv("../data/examples/ex1.csv")

In [16]:
# Explore
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### No header

In [17]:
# View, csv but no header
!cat ../data/examples/ex2.csv

3590,1545,3396,9448
2947,5631,0008,0767
1794,8582,3280,3639
1594,6027,6703,1537
8197,0106,8095,9417


In [19]:
# Read
df = pd.read_csv("../data/examples/ex2.csv", header=None)
df

Unnamed: 0,0,1,2,3
0,3590,1545,3396,9448
1,2947,5631,8,767
2,1794,8582,3280,3639
3,1594,6027,6703,1537
4,8197,106,8095,9417


In [22]:
# Read with customized header
col_names = ["a", "b", "c", "d", "message"]
df = pd.read_csv("../data/examples/ex2.csv", names=col_names)
df

Unnamed: 0,a,b,c,d
0,3590,1545,3396,9448
1,2947,5631,8,767
2,1794,8582,3280,3639
3,1594,6027,6703,1537
4,8197,106,8095,9417


#### With index

In [26]:
# Read and specify index
col_names = ["a", "b", "c", "d"]
index_col = "a"

df = pd.read_csv(
    "../data/examples/ex2.csv", 
    names=col_names, 
    index_col=index_col
)

df

Unnamed: 0_level_0,b,c,d
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3590,1545,3396,9448
2947,5631,8,767
1794,8582,3280,3639
1594,6027,6703,1537
8197,106,8095,9417


#### With multi-level index

In [31]:
!cat ../data/examples/csv_mindex.csv

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [33]:
# Read and specify index
index_col = ["key1", "key2"]
df = pd.read_csv("../data/examples/csv_mindex.csv", index_col=index_col)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [12]:
df.shape

(8, 2)

#### White space delim

In [35]:
# .csv but not csv
!cat ../data/examples/ex3.csv

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


In [38]:
df = pd.read_csv("../data/examples/ex3.csv", sep="\s+") # Regex
df

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


#### Skip rows

In [44]:
!cat ../data/examples/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [47]:
skip_rows = [0, 4]
df = pd.read_csv("../data/examples/ex1.csv", skiprows=skip_rows)
df

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


#### Missing values (auto)

In [49]:
!cat ../data/examples/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo


In [51]:
# Auto detect (empty, NA)
df = pd.read_csv("../data/examples/ex5.csv")
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [52]:
df.notnull() # NULL

Unnamed: 0,something,a,b,c,d,message
0,True,True,True,True,True,False
1,True,True,True,False,True,True
2,True,True,True,True,True,True


#### Missing values (manual)

In [54]:
!cat ../data/examples/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo


In [56]:
na_codes = ["NA", "foo"]
df = pd.read_csv("../data/examples/ex5.csv", 
                 na_values=na_codes)
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,


#### Missing values (different cols)

In [58]:
!cat ../data/examples/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo


In [60]:
na_codes = {
    "something": ["three"],
    "message": ["NA", "foo"]
}

df = pd.read_csv("../data/examples/ex5.csv", na_values=na_codes)
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,,9,10,11.0,12,


### Read JSON

In [62]:
!cat ../data/examples/example.json

[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]


In [64]:
df = pd.read_json('../data/examples/example.json')
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


### Read EXCEL

#### Read a sheet

In [67]:
df = pd.read_excel("../data/examples/ex1.xlsx", sheet_name="Sheet1")
df

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [68]:
df.drop("Unnamed: 0", axis=1)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [51]:
# Drop first columns
df = pd.read_excel("data/examples/ex1.xlsx", sheet_name="Sheet1")\
    .drop("Unnamed: 0", axis=1)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### Use specific columns

In [70]:
# Example 1
used_cols = [1, 2, 4] # starts from 0

df = pd.read_excel(
    "../data/examples/ex1.xlsx", 
    sheet_name="Sheet1", 
    usecols=used_cols)
df

Unnamed: 0,a,b,d
0,1,2,4
1,5,6,8
2,9,10,12


In [56]:
# Example 2
df = pd.read_excel("data/examples/ex1.xlsx", sheet_name="Sheet1", usecols="B,E:Z")
df

Unnamed: 0,a,d,message
0,1,4,hello
1,5,8,world
2,9,12,foo


### Read SQL

Học sau

### Read in chunks

#### Read first n rows

In [30]:
# Read first 5 rows to have some sense of data
df = pd.read_csv("data/examples/ex6.csv", nrows=5)
df

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


#### Read files in chunks

In [72]:
# Read by chunk
df = pd.read_csv("../data/examples/ex6.csv", chunksize=1000)

In [75]:
# A series to count appearences of key
key_counts = pd.Series([])

  


In [76]:
# Iterate through every chunk and combine the counts
for chunk in df:
    temp = chunk["key"].value_counts()
    key_counts = key_counts.add(temp, fill_value=0)

In [77]:
# View the results
print(dict(key_counts))

{'0': 151.0, '1': 146.0, '2': 152.0, '3': 162.0, '4': 171.0, '5': 157.0, '6': 166.0, '7': 164.0, '8': 162.0, '9': 150.0, 'A': 320.0, 'B': 302.0, 'C': 286.0, 'D': 320.0, 'E': 368.0, 'F': 335.0, 'G': 308.0, 'H': 330.0, 'I': 327.0, 'J': 337.0, 'K': 334.0, 'L': 346.0, 'M': 338.0, 'N': 306.0, 'O': 343.0, 'P': 324.0, 'Q': 340.0, 'R': 318.0, 'S': 308.0, 'T': 304.0, 'U': 326.0, 'V': 328.0, 'W': 305.0, 'X': 364.0, 'Y': 314.0, 'Z': 288.0}


## Write data

## Write data

### To console

In [79]:
# Read
df = pd.read_csv("../data/examples/ex5.csv")
df['a'] *= 1.1
df

Unnamed: 0,something,a,b,c,d,message
0,one,1.1,2,3.0,4,
1,two,5.5,6,,8,world
2,three,9.9,10,11.0,12,foo


#### To standard output

In [80]:
import sys

df.to_csv(sys.stdout)

,something,a,b,c,d,message
0,one,1.1,2,3.0,4,
1,two,5.5,6,,8,world
2,three,9.9,10,11.0,12,foo


#### Without index

In [81]:
df.to_csv(sys.stdout, index=False)

something,a,b,c,d,message
one,1.1,2,3.0,4,
two,5.5,6,,8,world
three,9.9,10,11.0,12,foo


#### Without header

In [82]:
df.to_csv(sys.stdout, header=None)

0,one,1.1,2,3.0,4,
1,two,5.5,6,,8,world
2,three,9.9,10,11.0,12,foo


#### With customized columns

In [39]:
df.to_csv(sys.stdout, index=False, columns=["a", "b", "c", "d"])

a,b,c,d
1,2,3.0,4
5,6,,8
9,10,11.0,12


#### With customized separator

In [84]:
df.to_csv(sys.stdout, index=False, sep=";")

something;a;b;c;d;message
one;1.1;2;3.0;4;
two;5.5;6;;8;world
three;9.9;10;11.0;12;foo


### To CSV

In [86]:
# Read
df = pd.read_csv("../data/examples/ex5.csv")
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [87]:
# Write
df.to_csv("out.csv", index=False, na_rep=".")

In [84]:
# Check
pd.read_csv("data/out.csv")

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,.
1,two,5,6,.,8,world
2,three,9,10,11.0,12,foo


### To EXCEL

In [88]:
# Read
df = pd.read_csv("../data/examples/ex5.csv")
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [90]:
# To excel, with customized NA value and sheet name
df.to_excel("out.xlsx", index=False, na_rep="N/A", sheet_name="first_sheet")

In [46]:
# Check
pd.read_excel("data/out.xlsx")

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3,4,NON_VALUE
1,two,5,6,NON_VALUE,8,world
2,three,9,10,11,12,foo


In [88]:
df = pd.read_excel("data/out.xlsx", sheet_name="first_sheet")
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3,4,NON_VALUE
1,two,5,6,NON_VALUE,8,world
2,three,9,10,11,12,foo


In [89]:
df.to_excel("data/out.xlsx", sheet_name="hello", index=False)

In [107]:
writer = pd.ExcelWriter("data/master.xlsx", engine="xlsxwriter")
df.to_excel(writer, sheet_name="sheet_1")
df.to_excel(writer, sheet_name="sheet_2")
df.to_excel(writer, sheet_name="sheet_3")
writer.save()