# 8章 Pandasの基礎

## 8.1 Pandaとは

#### 8.1.2 Series と DataFrame のデータの確認

In [1]:
import pandas as pd

fruits = {
    "orange" : 2, 
    "banana" : 3,
}

print( pd.Series(fruits) )

orange    2
banana    3
dtype: int64


In [2]:
import pandas as pd

data = {
    "fruits" : ["apple", "orange", "banana", "strawberry", "kiwifruit"],
    "year" : [2001, 2002, 2001, 2008, 2006],
    "time" : [1, 4, 5, 6, 3],
}

df = pd.DataFrame(data)

print( df )

       fruits  year  time
0       apple  2001     1
1      orange  2002     4
2      banana  2001     5
3  strawberry  2008     6
4   kiwifruit  2006     3


In [3]:
import pandas as pd

index  = ["apple", "orange", "banana", "strawberry", "kiwifruit"]

data = [10, 5, 8, 12, 3]

#
# Pandas.Series
#
series = pd.Series(data, index=index)

print( series )


#
# Pandas.DataFrame
#
data = {
    "fruits" : ["apple", "orange", "banana", "strawberry", "kiwifruit"],
    "year" : [2001, 2002, 2001, 2008, 2006],
    "time" : [1, 4, 5, 6, 3]
}

df = pd.DataFrame(data)

print( df )

apple         10
orange         5
banana         8
strawberry    12
kiwifruit      3
dtype: int64
       fruits  year  time
0       apple  2001     1
1      orange  2002     4
2      banana  2001     5
3  strawberry  2008     6
4   kiwifruit  2006     3


## 8.2 Series

### 8.2.1 Series を生成する

In [4]:
import pandas as pd

fruits = {
    "banana" : 3,
    "orange" : 2,
}

print( pd.Series(fruits) )

banana    3
orange    2
dtype: int64


In [5]:
import pandas as pd

index = ["apple", "orange", "banana", "strawberry", "kiwifruit"]
data = [10, 5, 8, 12, 3]

series = pd.Series(data, index=index)

print( series )

apple         10
orange         5
banana         8
strawberry    12
kiwifruit      3
dtype: int64


### 8.2.2 参照

In [6]:
import pandas as pd

fruits = {
    "banana" : 3,
    "orange" : 4,
    "grape" : 1,
    "peach" : 5,
}

series = pd.Series(fruits)

print( series[:2] )

banana    3
orange    4
dtype: int64


In [7]:
print( series[["orange", "peach"]] )

orange    4
peach     5
dtype: int64


In [8]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']
data = [10, 5, 8, 12, 3]

series = pd.Series(data, index=index)

print( series[1:4] )

print( series[ ['apple', 'banana', 'kiwifruit'] ] )

orange         5
banana         8
strawberry    12
dtype: int64
apple        10
banana        8
kiwifruit     3
dtype: int64


### 8.2.3 データ、インデックスを取り出す

In [9]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']
data = [10, 5, 8, 12, 3]

series = pd.Series(data, index=index)

series_values = series.values
series_index = series.index

print( series_values )
print( series_index )


[10  5  8 12  3]
Index(['apple', 'orange', 'banana', 'strawberry', 'kiwifruit'], dtype='object')


### 8.2.4 要素を追加する

In [10]:
import pandas as pd

fruits = {
    'banana' : 3, 
    'orange' : 2,
}

series = pd.Series(fruits)

print( series )

series = pd.concat([series, pd.Series([3], index=['grape'])])

print( series )

banana    3
orange    2
dtype: int64
banana    3
orange    2
grape     3
dtype: int64


In [11]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']
data = [10, 5, 8, 12, 3]

series = pd.Series(data, index=index)

print( series )

series = pd.concat([series, pd.Series([12], index=['pineapple'])])

print( series )

apple         10
orange         5
banana         8
strawberry    12
kiwifruit      3
dtype: int64
apple         10
orange         5
banana         8
strawberry    12
kiwifruit      3
pineapple     12
dtype: int64


### 8.2.5 要素を削除する

In [12]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']
data = [10, 5, 8,12, 3]

series = pd.Series(data, index=index)

series = series.drop('strawberry')

print( series )

apple        10
orange        5
banana        8
kiwifruit     3
dtype: int64


### 8.2.6 フィルタリング

In [13]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']
data = [10, 5, 8,12, 3]

series = pd.Series(data, index=index)

conditions = [True, True, False, False, False]

print( series[conditions] )

apple     10
orange     5
dtype: int64


In [14]:
print( series[series >= 5] )

apple         10
orange         5
banana         8
strawberry    12
dtype: int64


In [15]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']
data = [10, 5, 8,12, 3]

series = pd.Series(data, index=index)

print( series[5 <= series][series < 10] )

orange    5
banana    8
dtype: int64


### 8.2.7 ソート

In [16]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']
data = [10, 5, 8,12, 3]

series = pd.Series(data, index=index)

item1 = series.sort_index()
item2 = series.sort_values()

print( item1 )
print( item2 )

apple         10
banana         8
kiwifruit      3
orange         5
strawberry    12
dtype: int64
kiwifruit      3
orange         5
banana         8
apple         10
strawberry    12
dtype: int64


## 8.3 DataFrame

### 8.3.1 DataFrame の生成

In [17]:
data = {
    'fruits' :  ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit'],
    'year' : [2001, 2002, 2001, 2008, 2006],
    'time' : [1, 4, 5, 6, 3],
}

df = pd.DataFrame(data)

print( df )

       fruits  year  time
0       apple  2001     1
1      orange  2002     4
2      banana  2001     5
3  strawberry  2008     6
4   kiwifruit  2006     3


In [18]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']

data1 = [10, 5, 8, 12, 3]
data2 = [30, 25, 12, 10, 8]

# 1次元データ
series1 = pd.Series(data1, index=index)
series2 = pd.Series(data2, index=index)

# 2次元データ
df = pd.DataFrame([series1, series2])

print(df)

   apple  orange  banana  strawberry  kiwifruit
0     10       5       8          12          3
1     30      25      12          10          8


### 8.3.2 インデックスとカラムを設定する

In [19]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']

data1 = [10, 5, 8, 12, 3]
data2 = [30, 25, 12, 10, 8]

# 1次元データ
series1 = pd.Series(data1, index=index)
series2 = pd.Series(data2, index=index)

# 2次元データ
df = pd.DataFrame([series1, series2])
df.index = [1, 2]

print(df)

   apple  orange  banana  strawberry  kiwifruit
1     10       5       8          12          3
2     30      25      12          10          8


### 8.3.3 行を追加する

In [20]:
import pandas as pd

data = {
    'fruits' : ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit'],
    'time' : [1, 4, 5, 6, 3]
}

df = pd.DataFrame(data)

series = pd.Series(['mango', 2008, 7], index=['fruits', 'year', 'time'])

df = df.append(series, ignore_index=True)

print(df)

       fruits  time    year
0       apple     1     NaN
1      orange     4     NaN
2      banana     5     NaN
3  strawberry     6     NaN
4   kiwifruit     3     NaN
5       mango     7  2008.0


  df = df.append(series, ignore_index=True)


In [24]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']

data1 = [10, 5, 8, 12, 3]
data2 = [30, 25, 12, 10, 8]
data3 = [30, 12, 10, 8, 25, 3]

series1 = pd.Series(data1, index=index)
series2 = pd.Series(data2, index=index)

df = pd.DataFrame([series1, series2])


index.append('pineapple')
series3 = pd.Series(data3, index=index)

df = df.append(series3, ignore_index=True)

print(df)

   apple  orange  banana  strawberry  kiwifruit  pineapple
0     10       5       8          12          3        NaN
1     30      25      12          10          8        NaN
2     30      12      10           8         25        3.0


  df = df.append(series3, ignore_index=True)


### 8.3.4 列を追加する

In [26]:
import pandas as pd

data = {
    'fruits': ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit'],
    'year': [2001, 2002, 2001, 2008, 2006],
    'time': [1, 4, 5 , 6, 3]
}

df = pd.DataFrame(data)

df['price'] = [150, 120, 100, 300, 150]

print(df)

       fruits  year  time  price
0       apple  2001     1    150
1      orange  2002     4    120
2      banana  2001     5    100
3  strawberry  2008     6    300
4   kiwifruit  2006     3    150


In [29]:
import pandas as pd

index = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']

data1 = [10, 5, 8, 12, 3]
data2 = [30, 25, 12, 10, 8]

series1 = pd.Series(data1, index=index)
series2 = pd.Series(data2, index=index)

new_colomn = pd.Series([15, 7], index=[0, 1])

df = pd.DataFrame([series1, series2])

df['mango'] = new_colomn

print(df)

   apple  orange  banana  strawberry  kiwifruit  mango
0     10       5       8          12          3     15
1     30      25      12          10          8      7


### 8.3.5 データの参照

### 8.3.6 名前による参照

In [30]:
import pandas as pd

data = {
    'fruits': ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit'],
    'year': [2001, 2002, 2001, 2008, 2006],
    'time': [1, 4, 5, 6, 3],
}

df = pd.DataFrame(data)

print(df)

       fruits  year  time
0       apple  2001     1
1      orange  2002     4
2      banana  2001     5
3  strawberry  2008     6
4   kiwifruit  2006     3


In [31]:
df = df.loc[[1,2], ['time', 'year']]

print(df)

   time  year
1     4  2002
2     5  2001


In [42]:
import numpy as np
import pandas as pd

np.random.seed(0)

columns = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']

df = pd.DataFrame()

for column in columns:
    df[column] = np.random.choice(range(1, 11), 10)
    
df.index = range(1, 11)

df = df.loc[range(2, 6), ['banana', 'kiwifruit']]

print(df)



   banana  kiwifruit
2      10         10
3       9          1
4      10          5
5       5          8


### 8.3.7 番号による参照

In [43]:
import pandas as pd

data = {
    'fruits': ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit'],
    'year': [2001, 2002, 2001, 2008, 2006],
    'time': [1, 4, 5, 6, 3],
}

df = pd.DataFrame(data)

print(df)

       fruits  year  time
0       apple  2001     1
1      orange  2002     4
2      banana  2001     5
3  strawberry  2008     6
4   kiwifruit  2006     3


In [44]:
df = df.iloc[[1,3], [0, 2]]

print(df)

       fruits  time
1      orange     4
3  strawberry     6


In [48]:
import numpy as np
import pandas as pd

np.random.seed(0)

columns = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']

df = pd.DataFrame()

for column in columns:
    df[column] = np.random.choice(range(1, 11), 10)

df.index = range(1, 11)

df = df.iloc[range(1, 5), [2, 4]]
 
print(df)

   banana  kiwifruit
2      10         10
3       9          1
4      10          5
5       5          8


### 8.3.8 行 または 列を削除

In [56]:
import pandas as pd

data = {
    'fruits': ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit'],
    'time': [1, 4, 5, 6, 3],
    'year': [2001, 2002, 2001, 2008, 2006],
}

df = pd.DataFrame(data)

print(df)

print()

df_1 = df.drop(range(0, 2))

print(df_1)

print()

df_2 = df.drop('year', axis=1)

print(df_2)

       fruits  time  year
0       apple     1  2001
1      orange     4  2002
2      banana     5  2001
3  strawberry     6  2008
4   kiwifruit     3  2006

       fruits  time  year
2      banana     5  2001
3  strawberry     6  2008
4   kiwifruit     3  2006

       fruits  time
0       apple     1
1      orange     4
2      banana     5
3  strawberry     6
4   kiwifruit     3


In [60]:
import numpy as np
import pandas as pd

columns = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']

df = pd.DataFrame()

for column in columns:
    df[column] = np.random.choice(range(1, 11), 10)
    
df.index = range(1, 11)

df_1 = df.drop(range(2, 11, 2))

print(df_1)
print()

df_2 = df.drop('strawberry', axis=1)

print(df_2)

   apple  orange  banana  strawberry  kiwifruit
1     10      10       2           6          5
3      6       8       3           1          1
5      5       3       2           2          4
7      4      10       9           1          4
9      8       8       1           9          2

    apple  orange  banana  kiwifruit
1      10      10       2          5
2       8      10       3          5
3       6       8       3          1
4       4       4       9         10
5       5       3       2          4
6       6       4       6          8
7       4      10       9          4
8       4       8       5          3
9       8       8       1          2
10     10       6       3          2


### 8.3.9 ソート

In [64]:
import pandas as pd

data = {
    'fruits': ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit'],
    'time': [1, 4, 5, 6, 3],
    'year': [2001, 2002, 2001, 2000, 2006],
}

df = pd.DataFrame(data)

print(df)
print()

df = df.sort_values(by=['year'], ascending=True)

print(df)
print()

df = df.sort_values(by=['time', 'year'], ascending=True)

print(df)
print()

       fruits  time  year
0       apple     1  2001
1      orange     4  2002
2      banana     5  2001
3  strawberry     6  2000
4   kiwifruit     3  2006

       fruits  time  year
3  strawberry     6  2000
0       apple     1  2001
2      banana     5  2001
1      orange     4  2002
4   kiwifruit     3  2006

       fruits  time  year
0       apple     1  2001
4   kiwifruit     3  2006
1      orange     4  2002
2      banana     5  2001
3  strawberry     6  2000



In [68]:
import numpy as np
import pandas as pd

np.random.seed(0)

columns = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']

df = pd.DataFrame()

for column in columns:
    df[column] = np.random.choice(range(1, 11), 10)
    
df.index = range(1, 11)

print(df)
print()

df = df.sort_values(by=columns, ascending=True)

print(df)




    apple  orange  banana  strawberry  kiwifruit
1       6       8       6           3         10
2       1       7      10           4         10
3       4       9       9           9          1
4       4       9      10           2          5
5       8       2       5           4          8
6      10       7       4           4          4
7       4       8       1           4          3
8       6       8       4           8          8
9       3       9       6           1          3
10      5       2       1           2          1

    apple  orange  banana  strawberry  kiwifruit
2       1       7      10           4         10
9       3       9       6           1          3
7       4       8       1           4          3
3       4       9       9           9          1
4       4       9      10           2          5
10      5       2       1           2          1
8       6       8       4           8          8
1       6       8       6           3         10
5       8       2  

### 8.3.10 フィルタリング

In [70]:
import pandas as pd

data = {
    'fruits': ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit'],
    'year': [2001, 2002, 2001, 2008, 2006],
    'time': [1, 4, 5, 6, 3],
}

df = pd.DataFrame(data)

print(df.index % 2 == 0)
print()
print(df[df.index % 2 == 0])

[ True False  True False  True]

      fruits  year  time
0      apple  2001     1
2     banana  2001     5
4  kiwifruit  2006     3


In [75]:
import numpy as np
import pandas as pd

np.random.seed(0)

columns = ['apple', 'orange', 'banana', 'strawberry', 'kiwifruit']

df = pd.DataFrame()

for column in columns:
    df[column] = np.random.choice(range(1, 11), 10)

df.index = range(1, 11)

df = df.loc[df['apple'] >= 5][df['kiwifruit'] >= 5]

print(df)


   apple  orange  banana  strawberry  kiwifruit
1      6       8       6           3         10
5      8       2       5           4          8
8      6       8       4           8          8


  df = df.loc[df['apple'] >= 5][df['kiwifruit'] >= 5]


## 添削問題

In [91]:
import numpy as np
import pandas as pd

index = ['growth', 'mission', 'ishikawa', 'pro']

data = [50, 7, 26, 1]

series = pd.Series(data, index=index)

print(series)
print()

# インデックスについてアルファベット順にソートした series を aidemy に代入する
aidemy = series.sort_index()

print(aidemy)
print()

# series にインデックスが 'tutor'、データが 30 の要素を追加してください
aidemy1 = pd.Series([30], index=['tutor'])
aidemy2 = series.append(aidemy1)

print(aidemy)
print()
print(aidemy2)
print()

# DataFrameを生成し、列を追加します
df = pd.DataFrame()

for i in index:
    df[i] = np.random.choice(range(1, 11), 10)

df.index = range(1, 11)

print(df)
print()

# loc[]を使って df の2行目から5行目までの4行と、'ishikawa' を含むDataFrameをaidemy3に代入する

aidemy3 = df.loc[range(2, 6), ['ishikawa']]

print(aidemy3)



growth      50
mission      7
ishikawa    26
pro          1
dtype: int64

growth      50
ishikawa    26
mission      7
pro          1
dtype: int64

growth      50
ishikawa    26
mission      7
pro          1
dtype: int64

growth      50
mission      7
ishikawa    26
pro          1
tutor       30
dtype: int64

    growth  mission  ishikawa  pro
1        3        8        10    5
2        2        7        10    9
3        5        2         3    5
4        3        7         1    4
5        6        8        10    4
6        6        3         2    9
7        6        4        10    9
8        3        2         1    8
9        6       10         7    1
10       8        6         1    4

   ishikawa
2        10
3         3
4         1
5        10


  aidemy2 = series.append(aidemy1)
