# Pandas 基礎

## Series の作成

In [128]:
# pandas is referenced as `pd` by the cells in this notebook, so import it
# before the first use.
import pandas as pd

# Build a Series from a plain Python list; with no index given, the labels
# default to 0, 1, 2, ...
a_list = [7, 8, 9]
s1 = pd.Series(a_list)
print(s1)

0    7
1    8
2    9
dtype: int64


In [3]:
a_list = [7, 8, 9]
s1 = pd.Series(a_list, dtype=float)
print(s1)

0    7.0
1    8.0
2    9.0
dtype: float64


### 文字列を要素にする
- pandas は文字列の要素を object 型 (dtype: object) として扱う

In [7]:
a_arr = ['apple', 'banana', 'orange']
s2 = pd.Series(a_arr)
print(s2)

0     apple
1    banana
2    orange
dtype: object


### index を文字列にする
- index の長さは data の長さ (要素数) と同じでないといけない

In [9]:
var = [7, 8, 9]
name = ['apple', 'banana', 'orange']
s4 = pd.Series(var, index=name)
print(s4)

apple     7
banana    8
orange    9
dtype: int64


### dictionary 型
key : value の組が格納されている形

Series に変換すると key = index / value = 要素 になる

In [10]:
a_dic = {'apple': 7, 'banana':8, 'orange':9}

In [11]:
a_dic['apple']

7

In [12]:
a_dic = {'apple': 7, 'banana':8, 'orange':9}
s5 = pd.Series(a_dic)
print(s5)

apple     7
banana    8
orange    9
dtype: int64


### 欠損値
- index に dictionary の key に存在しない label を指定すると、その要素には欠損値が入る
- 欠損値は NaN (not a number) と表示され、NaN を含む Series の dtype は float64 になる

In [14]:
# The index asks for one label ('lemon') that is not a key of the dict,
# so that position is filled with NaN in the resulting Series.
a_dic2 = dict(apple=7, banana=8, orange=9)
name = list(a_dic2) + ['lemon']
s6 = pd.Series(data=a_dic2, index=name)
print(s6)

apple     7.0
banana    8.0
orange    9.0
lemon     NaN
dtype: float64


## Series の参照

In [15]:
s5

apple     7
banana    8
orange    9
dtype: int64

In [16]:
print(s5)

apple     7
banana    8
orange    9
dtype: int64


In [17]:
s5['apple']

7

In [18]:
# Positional access to the first element. s5 is labelled with strings, and
# passing an integer to [] as a position is deprecated in recent pandas,
# so use .iloc for position-based lookup.
s5.iloc[0]

7

In [20]:
# スライス -> apple ~ orange
s5['apple': 'orange']

apple     7
banana    8
orange    9
dtype: int64

In [21]:
s5['apple'] = 100
print(s5)

apple     100
banana      8
orange      9
dtype: int64


In [22]:
s5[1:3] = 200
print(s5)

apple     100
banana    200
orange    200
dtype: int64


In [23]:
a_list = [7, 8, 9]
b_list = [10, 11]
s7 = pd.Series(a_list)
s8 = pd.Series(b_list)

In [30]:
s7

0    7
1    8
2    9
dtype: int64

In [31]:
s8

0    10
1    11
dtype: int64

In [33]:
# Series.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True renumbers the result 0..4 instead of keeping each
# input's own labels.
pd.concat([s7, s8], ignore_index=True)

0     7
1     8
2     9
3    10
4    11
dtype: int64

## Series の要素削除

### drop
drop は指定した要素を除いた新しい Series を返すため、元の Series は変更されない

In [38]:
s5

apple     100
banana    200
orange    200
dtype: int64

In [39]:
s5.drop(index='banana')

apple     100
orange    200
dtype: int64

In [40]:
s5

apple     100
banana    200
orange    200
dtype: int64

### drop 関数は default では元の Series を変更しない
- 元の Series から直接削除したい場合は、引数 inplace=True を指定する

In [41]:
s5.drop(index='banana', inplace=True)

In [42]:
s5

apple     100
orange    200
dtype: int64

# DataFrame

### DataFrame 概要
- 行と列に label を持つ 2 次元の data 構造で、data を扱う際に頻繁に用いられる
- 2 次元とは、行 (row: 横方向) と列 (column: 縦方向) で表現される表形式のこと (Excel の sheet など)
- index (行名) と columns (列名) を指定して data を取得する

In [43]:
# 2次元配列
val = [[1, 2, 3], [4, 5, 6]]

In [45]:
df = pd.DataFrame(val)
print(df)

   0  1  2
0  1  2  3
1  4  5  6


In [46]:
# The dtype argument sets the data type of the elements (float here),
# so the integers in val are shown as 1.0, 2.0, ...
df = pd.DataFrame(val, dtype=float)
print(df)

     0    1    2
0  1.0  2.0  3.0
1  4.0  5.0  6.0


In [48]:
# Check the data type of each column
df.dtypes

0    float64
1    float64
2    float64
dtype: object

### 行名と列名を指定

In [51]:
# 2-D nested list: each inner list becomes one row.
val = [[1, 2, 3], [4, 5, 6]]
# Label the rows r0, r1 and the columns c0, c1, c2.
df = pd.DataFrame(
    data=val,
    index=[f'r{i}' for i in range(len(val))],
    columns=[f'c{j}' for j in range(len(val[0]))],
)
print(df)

    c0  c1  c2
r0   1   2   3
r1   4   5   6


## Series を使用した DataFrame

In [58]:
sr_age = pd.Series([23, 31, 49, 60])
sr_gender = pd.Series(['M', 'F', 'F', 'M'])
sr_height = pd.Series([175, 160, 156, 180])
sr_weight = pd.Series([65, 40, 48, 85])

In [59]:
df_info = pd.DataFrame({
    'age': sr_age,
    'gender': sr_gender,
    'height': sr_height,
    'weight': sr_weight
})
print(df_info)

   age gender  height  weight
0   23      M     175      65
1   31      F     160      40
2   49      F     156      48
3   60      M     180      85


### 欠損値

In [62]:
sr_age = pd.Series([23, 31, 49, 60], index=['sato', 'yamada', 'suzuki', 'tanaka'])
sr_gender = pd.Series(['M', 'F', 'F'], index=['sato', 'yamada', 'suzuki'])
sr_height = pd.Series([160, 156, 180], index=['yamada', 'suzuki', 'tanaka'])
sr_weight = pd.Series([65, 85], index=['sato', 'tanaka'])

In [64]:
df_info = pd.DataFrame({
    'age': sr_age,
    'gender': sr_gender,
    'height': sr_height,
    'weight': sr_weight
})
print(df_info)

        age gender  height  weight
sato     23      M     NaN    65.0
suzuki   49      F   156.0     NaN
tanaka   60    NaN   180.0    85.0
yamada   31      F   160.0     NaN


## DataFrame の参照

In [65]:
# One Series per column, keyed by person name. The Series do not all carry
# the same labels, so the combined frame holds NaN wherever a column has no
# value for a given person.
sr_age = pd.Series({'sato': 23, 'yamada': 31, 'suzuki': 49, 'tanaka': 60})
sr_gender = pd.Series({'sato': 'M', 'yamada': 'F', 'suzuki': 'F'})
sr_height = pd.Series({'yamada': 160, 'suzuki': 156, 'tanaka': 180})
sr_weight = pd.Series({'sato': 65, 'tanaka': 85})

# The dict keys become the column names, in this order.
df_info = pd.DataFrame(
    dict(age=sr_age, gender=sr_gender, height=sr_height, weight=sr_weight)
)
print(df_info)

        age gender  height  weight
sato     23      M     NaN    65.0
suzuki   49      F   156.0     NaN
tanaka   60    NaN   180.0    85.0
yamada   31      F   160.0     NaN


In [66]:
df_info

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0
suzuki,49,F,156.0,
tanaka,60,,180.0,85.0
yamada,31,F,160.0,


### 行数と列数 (shape は tuple 型で返される)

In [68]:
# dataの形状確認
df_info.shape

(4, 4)

In [69]:
print(df_info.shape[0]) # 行数
print(df_info.shape[1]) # 列数

4
4


In [70]:
# 全要素数　4x4
df_info.size

16

In [71]:
df_info

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0
suzuki,49,F,156.0,
tanaka,60,,180.0,85.0
yamada,31,F,160.0,


### 列指定で要素を選択

In [72]:
df_info['age']

sato      23
suzuki    49
tanaka    60
yamada    31
Name: age, dtype: int64

In [73]:
df_info[['age', 'gender']]

Unnamed: 0,age,gender
sato,23,M
suzuki,49,F
tanaka,60,
yamada,31,F


### loc アトリビュートを使用した data 取得
- 特定の行を参照したい時に使用すると便利

In [74]:
df_info

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0
suzuki,49,F,156.0,
tanaka,60,,180.0,85.0
yamada,31,F,160.0,


In [75]:
df_info.loc['suzuki']

age          49
gender        F
height    156.0
weight      NaN
Name: suzuki, dtype: object

In [76]:
df_info.loc['suzuki', 'height']

156.0

#### スライスを使用することで、複数要素も取得できる

In [77]:
df_info.loc['suzuki': 'yamada', 'height']

suzuki    156.0
tanaka    180.0
yamada    160.0
Name: height, dtype: float64

In [78]:
df_info.loc['suzuki',:]

age          49
gender        F
height    156.0
weight      NaN
Name: suzuki, dtype: object

### iloc アトリビュート
- loc との違いは、label 名で指定するか、位置 (行番号・列番号) で指定するか
- iloc を使用する場合は label 名ではなく、0 始まりの**番号指定**になる
- スライスの終端は、loc では含まれるが iloc では含まれない (例: `iloc[1:4]` は 1〜3 番目の行)

In [79]:
df_info

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0
suzuki,49,F,156.0,
tanaka,60,,180.0,85.0
yamada,31,F,160.0,


In [80]:
df_info.iloc[1, 2]

156.0

In [81]:
# スライスを使用した抽出
df_info.iloc[1:4, 2]

suzuki    156.0
tanaka    180.0
yamada    160.0
Name: height, dtype: float64

In [83]:
# Fetch one whole row: row position 1 (suzuki) across all columns
df_info.iloc[1, :]

age          49
gender        F
height    156.0
weight      NaN
Name: suzuki, dtype: object

### 条件抽出

In [84]:
df_info

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0
suzuki,49,F,156.0,
tanaka,60,,180.0,85.0
yamada,31,F,160.0,


In [85]:
# Select rows whose height is strictly greater than 170
# (a height of exactly 170 would not match; use >= to include it)
df_info[df_info['height'] > 170]

Unnamed: 0,age,gender,height,weight
tanaka,60,,180.0,85.0


In [86]:
# gender が男性を抽出
df_info[df_info['gender'] == 'M']

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0


### head と tail method
- data を読み込んで、内容をざっと確認したい時に使用する
- head() : 先頭から表示する行数を指定 (省略時は 5 行)
- tail() : 末尾から表示する行数を指定 (省略時は 5 行)

In [87]:
df_info.head(2)

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0
suzuki,49,F,156.0,


In [89]:
df_info.head()

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0
suzuki,49,F,156.0,
tanaka,60,,180.0,85.0
yamada,31,F,160.0,


In [90]:
df_info.tail(2)

Unnamed: 0,age,gender,height,weight
tanaka,60,,180.0,85.0
yamada,31,F,160.0,


In [91]:
df_info.tail()

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0
suzuki,49,F,156.0,
tanaka,60,,180.0,85.0
yamada,31,F,160.0,


## DataFrame の要素の変更
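- 存在しない列名を指定して代入すると、既存の列の変更ではなく新しい列が追加される (以下は `df_info['hegiht'] = 175` のように列名を誤記して実行したと思われる状態。誤って追加した列は drop で削除している)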

In [95]:
df_info

Unnamed: 0,age,gender,height,weight,hegiht
sato,23,M,,65.0,175
suzuki,49,F,156.0,,175
tanaka,60,,180.0,85.0,175
yamada,31,F,160.0,,175


In [96]:
df_info

Unnamed: 0,age,gender,height,weight,hegiht
sato,23,M,,65.0,175
suzuki,49,F,156.0,,175
tanaka,60,,180.0,85.0,175
yamada,31,F,160.0,,175


In [97]:
df_info.drop(columns='hegiht', inplace=True)

In [98]:
df_info

Unnamed: 0,age,gender,height,weight
sato,23,M,,65.0
suzuki,49,F,156.0,
tanaka,60,,180.0,85.0
yamada,31,F,160.0,


In [99]:
df_info['height'] = 175

In [100]:
df_info

Unnamed: 0,age,gender,height,weight
sato,23,M,175,65.0
suzuki,49,F,175,
tanaka,60,,175,85.0
yamada,31,F,175,


In [101]:
df_info.loc['sato', 'weight'] = 100

In [102]:
df_info

Unnamed: 0,age,gender,height,weight
sato,23,M,175,100.0
suzuki,49,F,175,
tanaka,60,,175,85.0
yamada,31,F,175,


In [103]:
df_info.iloc[0] = 30

In [104]:
df_info

Unnamed: 0,age,gender,height,weight
sato,30,30,30,30.0
suzuki,49,F,175,
tanaka,60,,175,85.0
yamada,31,F,175,


In [105]:
df_info.iloc[1, 1] = 'M'

In [106]:
df_info

Unnamed: 0,age,gender,height,weight
sato,30,30,30,30.0
suzuki,49,M,175,
tanaka,60,,175,85.0
yamada,31,F,175,


In [107]:
df_info.iloc[0, 1] = 'M'

In [108]:
df_info

Unnamed: 0,age,gender,height,weight
sato,30,M,30,30.0
suzuki,49,M,175,
tanaka,60,,175,85.0
yamada,31,F,175,


### DataFrame による列と行の追加

In [109]:
df_info

Unnamed: 0,age,gender,height,weight
sato,30,M,30,30.0
suzuki,49,M,175,
tanaka,60,,175,85.0
yamada,31,F,175,


In [110]:
df_info['new_colum'] = 1 # add a new column at the end; every existing row gets 1
df_info.loc['new_index'] = 0 # add a new row at the end; every column (including new_colum) gets 0

In [112]:
df_info

Unnamed: 0,age,gender,height,weight,new_colum
sato,30,M,30,30.0,1
suzuki,49,M,175,,1
tanaka,60,,175,85.0,1
yamada,31,F,175,,1
new_index,0,0,0,0.0,0


new_index 行の new_colum が 1 ではなく 0 になっているのは、列の追加 (全行に 1 を代入) の後に行の追加 (全列に 0 を代入) を行ったため

## DataFrame の行と列の削除

In [113]:
df_info

Unnamed: 0,age,gender,height,weight,new_colum
sato,30,M,30,30.0,1
suzuki,49,M,175,,1
tanaka,60,,175,85.0,1
yamada,31,F,175,,1
new_index,0,0,0,0.0,0


In [115]:
# Without inplace=True, drop returns a new DataFrame and leaves df_info itself unchanged
df_info.drop(labels='age', axis=1) # axis=1 targets columns

Unnamed: 0,gender,height,weight,new_colum
sato,M,30,30.0,1
suzuki,M,175,,1
tanaka,,175,85.0,1
yamada,F,175,,1
new_index,0,0,0.0,0


In [116]:
df_info

Unnamed: 0,age,gender,height,weight,new_colum
sato,30,M,30,30.0,1
suzuki,49,M,175,,1
tanaka,60,,175,85.0,1
yamada,31,F,175,,1
new_index,0,0,0,0.0,0


In [117]:
df_info.drop(labels='age', axis=1, inplace=True)

In [119]:
df_info

Unnamed: 0,gender,height,weight,new_colum
sato,M,30,30.0,1
suzuki,M,175,,1
tanaka,,175,85.0,1
yamada,F,175,,1
new_index,0,0,0.0,0


### 複数列の削除

In [121]:
df_info.drop(labels=['height', 'weight'], axis=1, inplace=True)

In [122]:
df_info

Unnamed: 0,gender,new_colum
sato,M,1
suzuki,M,1
tanaka,,1
yamada,F,1
new_index,0,0


In [123]:
df_info.drop(labels='sato', axis=0, inplace=True)

In [124]:
df_info

Unnamed: 0,gender,new_colum
suzuki,M,1
tanaka,,1
yamada,F,1
new_index,0,0


In [126]:
df_info.drop(labels=['new_index', 'yamada'], axis=0, inplace=True)

In [127]:
df_info

Unnamed: 0,gender,new_colum
suzuki,M,1
tanaka,,1
