# Ch03 Pandas dataframe 創建

# 安裝 pandas
若有語法不了解的地方，可以參考以下連結:
- [API reference](https://pandas.pydata.org/docs/reference/index.html)
- [Pandas Tutorial](https://www.w3schools.com/python/pandas/default.asp)
- [Pandas 教程](https://www.runoob.com/pandas/pandas-tutorial.html)

In [13]:
# 安裝套件
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import numpy as np
import pandas as pd

In [15]:
# Display the version string of the imported pandas package.
pd.__version__

'1.5.0'

## 建立 DataFrame 物件
_DataFrame：由多個 Series 合併而成_

建立方式：
* dictionary（以 keys 作為欄位名稱 header）
* 2D list
* 依定義手動建立（將多個 Series 合併）

In [3]:
# Dictionary approach: each key becomes a column header and each value is
# the list of entries for that column.
data = {
    'item': [1, 2, 3],
    'type': ['apple', 'banana', 'cherry'],
    'price': [0.5, 0.25, 1.0],
    'quantity': [3, 2, 5],
}

# pd.DataFrame.from_dict(<dict>) builds a DataFrame from the dictionary.
df = pd.DataFrame.from_dict(data)

# Leave the frame as the cell's last expression so the notebook renders it;
# a cell ending on an assignment displays nothing.
df

Unnamed: 0,item,type,price,quantity
0,1,apple,0.5,3
1,2,banana,0.25,2
2,3,cherry,1.0,5


In [17]:
# Build a DataFrame from a two-dimensional list (one inner list per row).
# This is the most commonly used approach.
list_students = [['Alex', 19], ['Bill', 22], ['Carl', 14], ['Darren', 18]]

# Convert the list to a DataFrame and name the columns in the same call
# (equivalent to assigning `df.columns` after construction).
df = pd.DataFrame(list_students, columns=['name', 'age'])
df


Unnamed: 0,name,age
0,Alex,19
1,Bill,22
2,Carl,14
3,Darren,18


In [4]:
# Build a DataFrame from a dict (column-based):
# every key is a column name and its list holds that column's values.
df = pd.DataFrame(
    data={
        'name': ['Sunny', 'Bill', 'Leo', 'Gino'],
        'age': [19, 22, 14, 18],
    }
)
df

Unnamed: 0,name,age
0,Sunny,19
1,Bill,22
2,Leo,14
3,Gino,18


In [5]:
# Build a DataFrame from a list of dicts (row-based):
# every dict describes one row, keyed by column name.
data_dict = [
    {'name': person, 'age': years_old}
    for person, years_old in (
        ('Sunny', 19),
        ('Bill', 22),
        ('Leo', 14),
        ('Darren', 18),
    )
]

df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,name,age
0,Sunny,19
1,Bill,22
2,Leo,14
3,Darren,18


In [10]:
'''Build a DataFrame of one month's average temperature for Taipei, Taichung and Kaohsiung over 2020-2023.'''

# Shared index: the years 2020 through 2023.
years = range(2020, 2024)

# One Series per city holding that month's average temperature;
# all three share the same year index.
taipei = pd.Series([20, 21, 19, 21], index=years)
taichung = pd.Series([25, 26, 27, 28], index=years)
kaohsiung = pd.Series([30, 29, 31, 32], index=years)

# Join the Series side by side (axis=0 stacks vertically, axis=1 joins
# horizontally); `keys` supplies the column name for each Series.
df = pd.concat(
    [taipei, taichung, kaohsiung],
    axis=1,
    keys=['taipei', 'taichung', 'kaohsiung'],
)
df


Unnamed: 0,taipei,taichung,kaohsiung
2020,20,25,30
2021,21,26,29
2022,19,27,31
2023,21,28,32


In [11]:
# Same as the previous cell.
# Shared index for every city: the years 2020 through 2023.
years = range(2020, 2024)

# That month's average temperature for Taipei, Taichung and Kaohsiung.
taipei = pd.Series([20, 21, 19, 21], index=years)
taichung = pd.Series([25, 26, 27, 28], index=years)
kaohsiung = pd.Series([30, 29, 31, 32], index=years)

# Series placed side by side form a DataFrame
# (axis=0 stacks vertically, axis=1 joins horizontally).
# The column labels are then applied with set_axis.
df = (
    pd.concat([taipei, taichung, kaohsiung], axis=1)
    .set_axis(['taipei', 'taichung', 'kaohsiung'], axis=1)
)
df

Unnamed: 0,taipei,taichung,kaohsiung
2020,20,25,30
2021,21,26,29
2022,19,27,31
2023,21,28,32


## Read/Write CSV File

In [18]:
# Load the CSV file directly from a URL.
csv_url = 'https://raw.githubusercontent.com/zachmayer/kaggleNCAA/refs/heads/master/inst/kaggle_data/RegularSeasonCompactResults.csv'
df = pd.read_csv(csv_url)
# Alternatively, download the file locally and read it from disk:
# df = pd.read_csv('RegularSeasonCompactResults.csv')
# df = pd.read_csv('RegularSeasonCompactResults.csv', header=None)  # header=None: the first line is read as data, not as column names

# Print the first five rows.
first_rows = df.head()
print(first_rows)

# Save the DataFrame to a CSV file, leaving out the index column.
df.to_csv('dfnew.csv', index=False)

# Variant that also omits the header row:
# df.to_csv('dfnew.csv', index=False, header=False)
# sep=','

   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT
0    1985      20     1228      81     1328      64    N      0
1    1985      25     1106      77     1354      70    H      0
2    1985      25     1112      63     1223      56    H      0
3    1985      25     1165      70     1432      54    H      0
4    1985      25     1192      86     1447      74    H      0
