# Pandas数据结构

## Series

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Build a Series from a NumPy array. No index is passed, so the labels
# default to the positions 0..9 (see the output below).
pd.Series(np.arange(10))

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [5]:
# Build a Series with explicit index labels 1..5. The values mix ints and
# floats and are shown as float64 in the output below.
pd.Series([6.7,5.6,3,10,2], index=[1,2,3,4,5])

1     6.7
2     5.6
3     3.0
4    10.0
5     2.0
dtype: float64

In [7]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data, in insertion order.
color_count = pd.Series(dict(red=100, blue=200, green=500, yellow=1000))
color_count

red        100
blue       200
green      500
yellow    1000
dtype: int64

## Series的属性

### index

In [8]:
color_count.index

Index(['red', 'blue', 'green', 'yellow'], dtype='object')

### values

In [9]:
color_count.values

array([ 100,  200,  500, 1000])

### 索引

In [10]:
# Select the third element by position. The index holds string labels, so a
# bare integer key is not a label; `.iloc` makes the positional lookup explicit
# instead of relying on the deprecated integer fallback of `Series[...]`.
color_count.iloc[2]

500

# DataFrame

## DataFrame的创建

In [12]:
# 2x3 DataFrame filled from np.random.randn, with default integer row and
# column labels. No seed is set in the visible cells, so the values differ
# on every run.
pd.DataFrame(np.random.randn(2, 3))

Unnamed: 0,0,1,2
0,1.423124,0.356507,-0.077492
1,2.048017,-0.851773,0.897801


In [13]:
# Random integer scores in [40, 100) for 10 students across 5 subjects.
score = np.random.randint(low=40, high=100, size=(10, 5))

In [16]:
score

array([[47, 58, 79, 42, 53],
       [55, 82, 98, 78, 42],
       [54, 43, 66, 48, 96],
       [82, 47, 60, 47, 45],
       [86, 58, 51, 46, 60],
       [74, 85, 48, 82, 91],
       [52, 78, 45, 79, 40],
       [71, 53, 87, 99, 82],
       [61, 42, 73, 85, 58],
       [80, 98, 60, 56, 56]])

In [18]:
# Wrap the raw score array in a DataFrame; no row or column labels are passed.
score_df = pd.DataFrame(data=score)

In [27]:
subjects = ["语文", "数学", "英语", "政治", "体育"]

# Build the row labels: one student name per row of the score array.
stu = [f'同学{i}' for i in range(score.shape[0])]

# Attach both sets of labels: subjects as the column names and the student
# names as the row index.
data = pd.DataFrame(score, columns=subjects, index=stu)

In [28]:
data

Unnamed: 0,语文,数学,英语,政治,体育
同学0,47,58,79,42,53
同学1,55,82,98,78,42
同学2,54,43,66,48,96
同学3,82,47,60,47,45
同学4,86,58,51,46,60
同学5,74,85,48,82,91
同学6,52,78,45,79,40
同学7,71,53,87,99,82
同学8,61,42,73,85,58
同学9,80,98,60,56,56


## DataFrame的属性

In [31]:
data.shape

(10, 5)

In [34]:
data.index

Index(['同学0', '同学1', '同学2', '同学3', '同学4', '同学5', '同学6', '同学7', '同学8', '同学9'], dtype='object')

In [35]:
data.columns

Index(['语文', '数学', '英语', '政治', '体育'], dtype='object')

In [37]:
data.values

array([[47, 58, 79, 42, 53],
       [55, 82, 98, 78, 42],
       [54, 43, 66, 48, 96],
       [82, 47, 60, 47, 45],
       [86, 58, 51, 46, 60],
       [74, 85, 48, 82, 91],
       [52, 78, 45, 79, 40],
       [71, 53, 87, 99, 82],
       [61, 42, 73, 85, 58],
       [80, 98, 60, 56, 56]])

In [38]:
# Transpose: subjects become the rows and students the columns (see output).
data.T

Unnamed: 0,同学0,同学1,同学2,同学3,同学4,同学5,同学6,同学7,同学8,同学9
语文,47,55,54,82,86,74,52,71,61,80
数学,58,82,43,47,58,85,78,53,42,98
英语,79,98,66,60,51,48,45,87,73,60
政治,42,78,48,47,46,82,79,99,85,56
体育,53,42,96,45,60,91,40,82,58,56


In [39]:
# With no argument, head() shows the first five rows (see output).
data.head()

Unnamed: 0,语文,数学,英语,政治,体育
同学0,47,58,79,42,53
同学1,55,82,98,78,42
同学2,54,43,66,48,96
同学3,82,47,60,47,45
同学4,86,58,51,46,60


In [40]:
data.head(3)

Unnamed: 0,语文,数学,英语,政治,体育
同学0,47,58,79,42,53
同学1,55,82,98,78,42
同学2,54,43,66,48,96


In [41]:
# With no argument, tail() shows the last five rows (see output).
data.tail()

Unnamed: 0,语文,数学,英语,政治,体育
同学5,74,85,48,82,91
同学6,52,78,45,79,40
同学7,71,53,87,99,82
同学8,61,42,73,85,58
同学9,80,98,60,56,56


In [43]:
data.tail(3)

Unnamed: 0,语文,数学,英语,政治,体育
同学7,71,53,87,99,82
同学8,61,42,73,85,58
同学9,80,98,60,56,56


## DataFrame索引的设置

### 修改行列索引值

In [44]:
# Build one new label per row of the frame that is being relabelled.
stu = [f"学生_{i}" for i in range(data.shape[0])]

# The index has to be replaced as a whole; individual labels cannot be
# assigned one at a time.
data.index = stu

In [45]:
data

Unnamed: 0,语文,数学,英语,政治,体育
学生_0,47,58,79,42,53
学生_1,55,82,98,78,42
学生_2,54,43,66,48,96
学生_3,82,47,60,47,45
学生_4,86,58,51,46,60
学生_5,74,85,48,82,91
学生_6,52,78,45,79,40
学生_7,71,53,87,99,82
学生_8,61,42,73,85,58
学生_9,80,98,60,56,56


In [47]:
# Incorrect approach: a single index label cannot be reassigned in place.
# data.index[3] = '学生_3'

### 重设索引

In [48]:
# Reset to a default integer index. drop=False (the default) keeps the old
# labels as a regular column named 'index' (see output).
data.reset_index(drop=False)

Unnamed: 0,index,语文,数学,英语,政治,体育
0,学生_0,47,58,79,42,53
1,学生_1,55,82,98,78,42
2,学生_2,54,43,66,48,96
3,学生_3,82,47,60,47,45
4,学生_4,86,58,51,46,60
5,学生_5,74,85,48,82,91
6,学生_6,52,78,45,79,40
7,学生_7,71,53,87,99,82
8,学生_8,61,42,73,85,58
9,学生_9,80,98,60,56,56


In [49]:
data

Unnamed: 0,语文,数学,英语,政治,体育
学生_0,47,58,79,42,53
学生_1,55,82,98,78,42
学生_2,54,43,66,48,96
学生_3,82,47,60,47,45
学生_4,86,58,51,46,60
学生_5,74,85,48,82,91
学生_6,52,78,45,79,40
学生_7,71,53,87,99,82
学生_8,61,42,73,85,58
学生_9,80,98,60,56,56


In [50]:
# Reset the index with drop=True: the old labels are discarded instead of
# being kept as a column (compare with the drop=False output above).
data.reset_index(drop=True)

Unnamed: 0,语文,数学,英语,政治,体育
0,47,58,79,42,53
1,55,82,98,78,42
2,54,43,66,48,96
3,82,47,60,47,45
4,86,58,51,46,60
5,74,85,48,82,91
6,52,78,45,79,40
7,71,53,87,99,82
8,61,42,73,85,58
9,80,98,60,56,56


### 以某列值设置为新的索引

In [53]:
# Small sales table: one row per (month, year) observation.
df = pd.DataFrame(
    data=dict(
        month=[1, 4, 7, 10],
        year=[2012, 2014, 2013, 2014],
        sale=[55, 40, 84, 31],
    )
)

In [54]:
df

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


In [55]:
# Use the 'month' column as the row index. The result is only displayed;
# df is not reassigned here, so it still has 'month' as a column afterwards.
df.set_index('month')

Unnamed: 0_level_0,year,sale
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2012,55
4,2014,40
7,2013,84
10,2014,31


In [56]:
# Passing a list of columns turns the DataFrame into one with a MultiIndex.
# NOTE(review): this rebinds df, so re-running this cell without re-creating
# df first will presumably fail, because 'year' and 'month' are no longer
# columns after the first run.
df = df.set_index(['year', 'month'])

In [58]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2012,1,55
2014,4,40
2013,7,84
2014,10,31


# MultiIndex与Panel

## MultiIndex

### MultiIndex的特性

In [60]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2012,1,55
2014,4,40
2013,7,84
2014,10,31


In [61]:
df.index

MultiIndex([(2012,  1),
            (2014,  4),
            (2013,  7),
            (2014, 10)],
           names=['year', 'month'])

In [62]:
df.index.names

FrozenList(['year', 'month'])

In [63]:
# levels lists the distinct values of each index level, sorted and
# de-duplicated (2014 appears twice in the data but once here; see output).
df.index.levels

FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])

### MultiIndex的创建

In [66]:
# Parallel label lists: position k of each list forms the k-th index tuple.
arrays = [
    [1, 1, 2, 2],
    ["red", "blue", "red", "blue"],
]
d = pd.MultiIndex.from_arrays(arrays=arrays, names=("number", "color"))

In [67]:
d

MultiIndex([(1,  'red'),
            (1, 'blue'),
            (2,  'red'),
            (2, 'blue')],
           names=['number', 'color'])