# Chapter6 資料載入、儲存和檔案格式

## Data Loading, Storage & File Formats Part I

### !cat：mac電腦用來開啟檔案的unix shell命令。

### !type: 如果是 Windows 的電腦，可使用 !type 來開啟檔案。

In [1]:
!cat examples/example_1.csv









two,d,15,16

In [2]:
path='examples/example_1.csv'

In [3]:
# Open the CSV at `path` with the default mode and encoding.
# NOTE(review): this handle is not closed in the cells shown here; a
# `with open(path) as f:` block would release it deterministically.
f = open(path)

In [4]:
import pandas as pd

In [5]:
import numpy as np

### pd.read_csv: 可以把資料讀入 DataFrame

In [6]:
df= pd.read_csv('examples/example_1.csv')

In [7]:
df

Unnamed: 0,key1,key2,value1,value2
0,one,a,1,2
1,one,b,3,4
2,one,c,5,6
3,one,d,7,8
4,two,a,9,10
5,two,b,11,12
6,two,c,13,14
7,two,d,15,16


### pd.read_table: 可以指定分隔資料的為哪個符號。

In [8]:
df2=pd.read_table('examples/example_1.csv', sep=',')

In [9]:
df2

Unnamed: 0,key1,key2,value1,value2
0,one,a,1,2
1,one,b,3,4
2,one,c,5,6
3,one,d,7,8
4,two,a,9,10
5,two,b,11,12
6,two,c,13,14
7,two,d,15,16


### 對沒有標題的檔案，可以透過read_csv自定義標題。

In [10]:
!cat examples/example_2.csv









two,d,15,16,14

### header=None: 寫出來，系統就不會擷取第一行作為標題。

In [11]:
pd.read_csv('examples/example_2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,key1,key2,value1,value2,value3
1,one,a,1,2,4
2,one,b,3,4,8
3,one,c,5,6,9
4,one,d,7,8,10
5,two,a,9,10,11
6,two,b,11,12,12
7,two,c,13,14,13
8,two,d,15,16,14


### names=[]，自己定義標題為何。

In [12]:
pd.read_csv('examples/example_2.csv', names=['a','b','c','d','message'])

Unnamed: 0,a,b,c,d,message
0,key1,key2,value1,value2,value3
1,one,a,1,2,4
2,one,b,3,4,8
3,one,c,5,6,9
4,one,d,7,8,10
5,two,a,9,10,11
6,two,b,11,12,12
7,two,c,13,14,13
8,two,d,15,16,14


In [13]:
names=['a','b','c','d','message']

### index_col: 可以指定某一欄位為index。

In [14]:
pd.read_csv('examples/example_2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
value3,key1,key2,value1,value2
4,one,a,1,2
8,one,b,3,4
9,one,c,5,6
10,one,d,7,8
11,two,a,9,10
12,two,b,11,12
13,two,c,13,14
14,two,d,15,16


In [15]:
!cat 'examples/example_3.csv'









two,d,15,16

### 如果希望多個欄位可以成為index，可以使用[x,y]

In [16]:
parsed= pd.read_csv('examples/example_3.csv',index_col=['key1','key2'])

In [17]:
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


### 用list(open(path))開啟txt資料。

In [18]:
list(open('examples/example_4.csv'))

['\ufeff,A,B,C,D\\n\n',
 'aaa,-20,50,-90,12\\n\n',
 'bbb,40,-100,180,24\\n\n',
 'ccc,-60,150,-270,36\\n\n',
 'ddd,80,-200,360,48\\n']

In [19]:
!cat desktop/python_practice/example_5.csv

cat: desktop/python_practice/example_5.csv: No such file or directory


### skiprows: 可以用來跳過特定行。

In [20]:
pd.read_csv('examples/example_5.csv', skiprows=[0,2,4])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,good
1,9,10,11,12,happy
2,13,14,15,16,mood


### 處理遺失值

### 遇到 NA 或空白欄位時，pandas 會自動補上 NaN。

In [21]:
!cat desktop/python_practice/example_6.csv

cat: desktop/python_practice/example_6.csv: No such file or directory


In [22]:
result= pd.read_csv('examples/example_6.csv')

In [23]:
result

Unnamed: 0.1,Unnamed: 0,one,a,b,c,d,message
0,0,two,1,2,3.0,4,
1,1,three,5,6,,8,Monday
2,2,four,9,10,11.0,12,Tuesday
3,3,five,13,14,15.0,16,Wednesday


In [24]:
pd.isnull(result)

Unnamed: 0.1,Unnamed: 0,one,a,b,c,d,message
0,False,False,False,False,False,False,True
1,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False


### na_values=[] 可以自定義遺失值。

In [25]:
result= pd.read_csv('examples/example_6.csv',na_values=['two',5,10,15])

In [26]:
result

Unnamed: 0.1,Unnamed: 0,one,a,b,c,d,message
0,0,,1.0,2.0,3.0,4,
1,1,three,,6.0,,8,Monday
2,2,four,9.0,,11.0,12,Tuesday
3,3,five,13.0,14.0,,16,Wednesday


### na_values 除了放入list外，也可以放入dict。

In [27]:
sentinels={'message':['Tuesday','NA'], 'one':['two']}

In [28]:
pd.read_csv('examples/example_6.csv', na_values=sentinels)

Unnamed: 0.1,Unnamed: 0,one,a,b,c,d,message
0,0,,1,2,3.0,4,
1,1,three,5,6,,8,Monday
2,2,four,9,10,11.0,12,
3,3,five,13,14,15.0,16,Wednesday


### 分段讀取文字檔

### 在閱讀較大的檔案時，可以先縮小閱讀範圍。

In [29]:
pd.options.display.max_rows=8

In [30]:
result=pd.read_csv('examples/example_7.csv')

In [31]:
result

Unnamed: 0,one,two,three,four,key
0,1,27,53,79,A
1,2,28,54,80,B
2,3,29,55,81,C
3,4,30,56,82,D
...,...,...,...,...,...
22,23,49,75,101,W
23,24,50,76,102,X
24,25,51,77,103,Y
25,26,52,78,104,Z


### 如果一開始只決定閱讀前幾行，可以使用nrows。

In [32]:
pd.read_csv('examples/example_7.csv', nrows=7)

Unnamed: 0,one,two,three,four,key
0,1,27,53,79,A
1,2,28,54,80,B
2,3,29,55,81,C
3,4,30,56,82,D
4,5,31,57,83,E
5,6,32,58,84,F
6,7,33,59,85,G


### chunksize: 設定一次讀幾行。

In [33]:
chunker= pd.read_csv('examples/example_7.csv', chunksize=20)

In [34]:
chunker

<pandas.io.parsers.readers.TextFileReader at 0x7fe994a88580>

In [35]:
# Running total of how often each value of the 'key' column appears.
# The dtype is given explicitly: an empty Series created without one emits a
# deprecation warning on pandas 1.x and becomes object dtype on pandas 2.x.
tot = pd.Series([], dtype='float64')

for piece in chunker:
    # fill_value=0 counts a key that is missing on either side as zero
    # instead of producing NaN.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

# Most frequent keys first.
tot = tot.sort_values(ascending=False)

  tot= pd.Series([])


In [36]:
tot[:8]

A    1.0
B    1.0
Y    1.0
X    1.0
W    1.0
V    1.0
U    1.0
T    1.0
dtype: float64

### 寫出文字資料

### to_csv: 可以將資料寫成逗號分隔的檔案。

In [37]:
data=pd.read_csv('examples/example_6.csv')

In [38]:
data

Unnamed: 0.1,Unnamed: 0,one,a,b,c,d,message
0,0,two,1,2,3.0,4,
1,1,three,5,6,,8,Monday
2,2,four,9,10,11.0,12,Tuesday
3,3,five,13,14,15.0,16,Wednesday


In [39]:
data.to_csv('examples/example_out.csv') #給予新的檔案名稱，會出現新檔案在檔案夾內。

In [40]:
!cat desktop/python_practice/example_out.csv

cat: desktop/python_practice/example_out.csv: No such file or directory


### 也可以改用其他的分隔符號。

### sys.stdout 會把文字顯示在終端機上。

In [41]:
import sys

In [42]:
data.to_csv(sys.stdout, sep='|')

|Unnamed: 0|one|a|b|c|d|message
0|0|two|1|2|3.0|4|
1|1|three|5|6||8|Monday
2|2|four|9|10|11.0|12|Tuesday
3|3|five|13|14|15.0|16|Wednesday


### na_rep='' 遺失值可以用其他符號取代。

In [43]:
data.to_csv(sys.stdout, na_rep='Null')

,Unnamed: 0,one,a,b,c,d,message
0,0,two,1,2,3.0,4,Null
1,1,three,5,6,Null,8,Monday
2,2,four,9,10,11.0,12,Tuesday
3,3,five,13,14,15.0,16,Wednesday


### 不要顯示欄標籤與列標籤

In [44]:
data.to_csv(sys.stdout, index=False, header=False)

0,two,1,2,3.0,4,
1,three,5,6,,8,Monday
2,four,9,10,11.0,12,Tuesday
3,five,13,14,15.0,16,Wednesday


### 也可以只顯示部份欄，欄的順序由你決定

In [45]:
data.to_csv(sys.stdout, index=False, columns=['a','b','c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0
13,14,15.0


### Series也可以儲存為csv。

In [46]:
dates= pd.date_range('10/1/2022', periods=7)

In [47]:
ts= pd.Series(np.arange(7), index=dates)

In [48]:
ts.to_csv('examples/example_series.csv')

In [49]:
!cat desktop/python_practice/example_series.csv

cat: desktop/python_practice/example_series.csv: No such file or directory


### 分隔符號的使用

In [50]:
!cat desktop/python_practice/example_8.csv

cat: desktop/python_practice/example_8.csv: No such file or directory


### open(): 用來打開檔案，csv.reader(): 用來逐列讀取檔案內容

In [51]:
import csv

In [52]:
f=open('examples/example_8.csv')

In [53]:
reader=csv.reader(f)

### print(line): 可以顯示每一行的資料。

In [54]:
for line in reader:
    print(line)

['\ufeffa', 'b', 'c', 'd']
['e', 'f', 'g', 'h']
['i', 'j', 'k', 'l']
['m', 'n', 'o', 'p']


### 將資料轉換爲想要的格式list

In [55]:
# Read every row of the CSV into a list; the with-block closes the file
# once the rows have been collected.
# NOTE(review): the first header cell shows up as '\ufeffa' in the recorded
# output, so the file appears to begin with a UTF-8 BOM - opening it with
# encoding='utf-8-sig' would presumably strip it; confirm before changing.
with open ('examples/example_8.csv') as f:
    lines=list(csv.reader(f))

### 第0列的資料為header，第1列之後的資料為實際的資料值。

In [56]:
header, values= lines[0], lines[1:]

In [57]:
# zip(*values) transposes the rows into columns; pairing each column with its
# header name yields a {column_name: column_values} mapping.
columns = zip(*values)
data_dict = dict(zip(header, columns))

In [58]:
data_dict

{'\ufeffa': ('e', 'i', 'm'),
 'b': ('f', 'j', 'n'),
 'c': ('g', 'k', 'o'),
 'd': ('h', 'l', 'p')}

In [59]:
f=open('examples/example_8.csv')

In [60]:
reader=csv.reader(f)

In [61]:
for line in reader:
    print(line)

['\ufeffa', 'b', 'c', 'd']
['e', 'f', 'g', 'h']
['i', 'j', 'k', 'l']
['m', 'n', 'o', 'p']


In [62]:
class my_dialect(csv.Dialect):
    """CSV dialect using ';' as the field separator and minimal quoting."""

    # How fields are separated and how rows are terminated.
    delimiter = ';'
    lineterminator = '\n'

    # Quoting rules: use '"' and apply the csv module's QUOTE_MINIMAL policy.
    quoting = csv.QUOTE_MINIMAL
    quotechar = '"'
    
reader=csv.reader(f, dialect=my_dialect)

In [63]:
reader=csv.reader(f,delimiter='|')

In [64]:
for line in reader:
    print(line)

In [65]:
# Write a small table to mydata.csv using the custom dialect.
# newline='' hands line-ending control to the csv module; without it, text
# mode may translate the dialect's '\n' terminator on some platforms.
with open('mydata.csv', 'w', newline='') as f:
    writer = csv.writer(f, dialect=my_dialect)
    # Header row first, then the data rows, written in one call.
    writer.writerows([
        ('one', 'two', 'three'),
        ('1', '2', '3'),
        ('4', '5', '6'),
        ('7', '8', '9'),
    ])

In [66]:
for line in reader:
    print(line)

In [67]:
!cat desktop/python_practice/example_8.csv

cat: desktop/python_practice/example_8.csv: No such file or directory
