In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# DataFrame

## DataFrame的创建

In [2]:
# pd.DataFrame parameters:
#     data:    the values to store
#     index:   row labels; when omitted, an integer index starting at 0 is created automatically
#     columns: column labels; when omitted, integer labels starting at 0 are created automatically
# Only `data` is passed here (a 2x3 array of random floats), so both axes get default integer labels.
pd.DataFrame(np.random.rand(2, 3))

Unnamed: 0,0,1,2
0,0.275681,0.641578,0.748512
1,0.718747,0.380215,0.510813


In [3]:
# Scores for 10 students in 5 subjects: random integers from 40 up to (not including) 100,
# arranged as one row per student and one column per subject.
score = np.random.randint(low=40, high=100, size=(10, 5))
score

array([[40, 44, 51, 58, 92],
       [41, 59, 88, 57, 93],
       [80, 81, 75, 59, 67],
       [51, 52, 70, 41, 48],
       [90, 54, 68, 87, 47],
       [84, 87, 67, 71, 80],
       [49, 42, 78, 99, 87],
       [41, 41, 68, 98, 94],
       [42, 93, 87, 52, 72],
       [69, 45, 51, 41, 86]])

In [4]:
# Wrap the raw score array in a pandas DataFrame, which displays as a labelled table
# and is easier to read than the bare ndarray.
score_df = pd.DataFrame(score)
score_df

Unnamed: 0,0,1,2,3,4
0,40,44,51,58,92
1,41,59,88,57,93
2,80,81,75,59,67
3,51,52,70,41,48
4,90,54,68,87,47
5,84,87,67,71,80
6,49,42,78,99,87
7,41,41,68,98,94
8,42,93,87,52,72
9,69,45,51,41,86


## 基本属性

In [5]:
score_df.shape

(10, 5)

In [6]:
score_df.index

RangeIndex(start=0, stop=10, step=1)

In [7]:
score_df.columns

RangeIndex(start=0, stop=5, step=1)

In [8]:
score_df.dtypes

0    int32
1    int32
2    int32
3    int32
4    int32
dtype: object

In [9]:
score_df.attrs

{}

In [10]:
score_df.axes

[RangeIndex(start=0, stop=10, step=1), RangeIndex(start=0, stop=5, step=1)]

#### 直接创建

In [18]:
# Column labels: one subject name per score column kept below
subjects = ["Chinese", "Math", "English", "Politics"]
# Row labels: one name per row of score_df
stu = ['同学' + str(k) for k in range(len(score_df.index))]
# Build the frame from every score column except the last, attaching both label sets
data = pd.DataFrame(data=score[:, :-1], columns=subjects, index=stu)
data

Unnamed: 0,Chinese,Math,English,Politics
同学0,40,44,51,58
同学1,41,59,88,57
同学2,80,81,75,59
同学3,51,52,70,41
同学4,90,54,68,87
同学5,84,87,67,71
同学6,49,42,78,99
同学7,41,41,68,98
同学8,42,93,87,52
同学9,69,45,51,41


### df["列名"] 与 df.列名 获取列

In [19]:
data["Chinese"]

同学0    40
同学1    41
同学2    80
同学3    51
同学4    90
同学5    84
同学6    49
同学7    41
同学8    42
同学9    69
Name: Chinese, dtype: int32

In [20]:
data.Chinese

同学0    40
同学1    41
同学2    80
同学3    51
同学4    90
同学5    84
同学6    49
同学7    41
同学8    42
同学9    69
Name: Chinese, dtype: int32

### df["列名"] = 新数据 直接添加新列

In [21]:
# Assigning to a column label that does not exist yet appends a new column;
# here the last column of the score array becomes the "Sport" column.
data["Sport"] = score[:, -1]
data

Unnamed: 0,Chinese,Math,English,Politics,Sport
同学0,40,44,51,58,92
同学1,41,59,88,57,93
同学2,80,81,75,59,67
同学3,51,52,70,41,48
同学4,90,54,68,87,47
同学5,84,87,67,71,80
同学6,49,42,78,99,87
同学7,41,41,68,98,94
同学8,42,93,87,52,72
同学9,69,45,51,41,86


## 通过字典创建

In [22]:
# Dict-based construction: each dictionary key becomes a column name and
# its value supplies that column's data (500 standard-normal samples each).
df = pd.DataFrame(data=dict(x=np.random.randn(500), y=np.random.randn(500)))
df.head()

Unnamed: 0,x,y
0,0.963628,0.023356
1,-0.695791,0.628499
2,-1.822422,-0.625681
3,0.455577,1.724668
4,0.147746,-0.124848


## DataFrame属性 

### a.shape 形状

In [23]:
data.shape

(10, 5)

### a.columns 列索引

In [24]:
data.columns

Index(['Chinese', 'Math', 'English', 'Politics', 'Sport'], dtype='object')

#### columns可以直接 = 赋值

In [25]:
# Column labels can be replaced by assigning a full list to .columns
# (one new label per existing column, in order).
data.columns = ["语文", "数学", "英语", "政治", "体育"]
data

Unnamed: 0,语文,数学,英语,政治,体育
同学0,40,44,51,58,92
同学1,41,59,88,57,93
同学2,80,81,75,59,67
同学3,51,52,70,41,48
同学4,90,54,68,87,47
同学5,84,87,67,71,80
同学6,49,42,78,99,87
同学7,41,41,68,98,94
同学8,42,93,87,52,72
同学9,69,45,51,41,86


In [26]:
# The column Index supports slicing: take the first four labels
data.columns[:4]

Index(['语文', '数学', '英语', '政治'], dtype='object')

### a.index 行索引

In [27]:
data.index

Index(['同学0', '同学1', '同学2', '同学3', '同学4', '同学5', '同学6', '同学7', '同学8', '同学9'], dtype='object')

In [28]:
# The row Index supports slicing: labels at positions 3, 4 and 5
data.index[3:6]

Index(['同学3', '同学4', '同学5'], dtype='object')

### a.values 数据

In [29]:
data.values

array([[40, 44, 51, 58, 92],
       [41, 59, 88, 57, 93],
       [80, 81, 75, 59, 67],
       [51, 52, 70, 41, 48],
       [90, 54, 68, 87, 47],
       [84, 87, 67, 71, 80],
       [49, 42, 78, 99, 87],
       [41, 41, 68, 98, 94],
       [42, 93, 87, 52, 72],
       [69, 45, 51, 41, 86]])

In [30]:
# .values is a 2-D array, so it supports array slicing: first 4 rows, first 4 columns
data.values[:4, :4]

array([[40, 44, 51, 58],
       [41, 59, 88, 57],
       [80, 81, 75, 59],
       [51, 52, 70, 41]])

### a.T 转置

In [31]:
data.T

Unnamed: 0,同学0,同学1,同学2,同学3,同学4,同学5,同学6,同学7,同学8,同学9
语文,40,41,80,51,90,84,49,41,42,69
数学,44,59,81,52,54,87,42,41,93,45
英语,51,88,75,70,68,67,78,68,87,51
政治,58,57,59,41,87,71,99,98,52,41
体育,92,93,67,48,47,80,87,94,72,86


### a.head/tail() 首尾

In [32]:
data.head()

Unnamed: 0,语文,数学,英语,政治,体育
同学0,40,44,51,58,92
同学1,41,59,88,57,93
同学2,80,81,75,59,67
同学3,51,52,70,41,48
同学4,90,54,68,87,47


In [33]:
# Same result as data.head(): slicing the frame with [:5] selects the first five rows
data[:5]

Unnamed: 0,语文,数学,英语,政治,体育
同学0,40,44,51,58,92
同学1,41,59,88,57,93
同学2,80,81,75,59,67
同学3,51,52,70,41,48
同学4,90,54,68,87,47


In [34]:
data.tail(4)

Unnamed: 0,语文,数学,英语,政治,体育
同学6,49,42,78,99,87
同学7,41,41,68,98,94
同学8,42,93,87,52,72
同学9,69,45,51,41,86


## DataFrame索引的设置

### 修改行列索引值 index 必须整体全部修改

In [35]:
# Row labels can only be replaced as a whole: build one new label per row of
# score_df, then assign the complete list to .index.
stu = ["学生_" + str(k) for k in range(len(score_df.index))]
data.index = stu
data

Unnamed: 0,语文,数学,英语,政治,体育
学生_0,40,44,51,58,92
学生_1,41,59,88,57,93
学生_2,80,81,75,59,67
学生_3,51,52,70,41,48
学生_4,90,54,68,87,47
学生_5,84,87,67,71,80
学生_6,49,42,78,99,87
学生_7,41,41,68,98,94
学生_8,42,93,87,52,72
学生_9,69,45,51,41,86


In [36]:
# An Index is immutable: assigning to a single position raises TypeError.
# The error is the point of this cell, so catch and display it instead of
# letting it abort a "Restart Kernel -> Run All" of the notebook.
try:
    data.index[0] = 'a'
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Index does not support mutable operations

### 重设索引 reset_index()
reset_index(drop=False)

- 默认把原来的索引作为新的一列添加进来,并生成从0开始的新整数索引
- drop : boolean, 
    - default False 保留原索引,作为新的一列添加
    - True 直接丢弃原来的索引,不保留为列

In [37]:
data.reset_index()

Unnamed: 0,index,语文,数学,英语,政治,体育
0,学生_0,40,44,51,58,92
1,学生_1,41,59,88,57,93
2,学生_2,80,81,75,59,67
3,学生_3,51,52,70,41,48
4,学生_4,90,54,68,87,47
5,学生_5,84,87,67,71,80
6,学生_6,49,42,78,99,87
7,学生_7,41,41,68,98,94
8,学生_8,42,93,87,52,72
9,学生_9,69,45,51,41,86


In [38]:
data.reset_index(drop=True)

Unnamed: 0,语文,数学,英语,政治,体育
0,40,44,51,58,92
1,41,59,88,57,93
2,80,81,75,59,67
3,51,52,70,41,48
4,90,54,68,87,47
5,84,87,67,71,80
6,49,42,78,99,87
7,41,41,68,98,94
8,42,93,87,52,72
9,69,45,51,41,86


### 以某列值设置为新的索引 set_index()
set_index(keys, drop=True)

- 默认返回一个以指定列为索引的新DataFrame,原数据不变
- keys : 列索引名称或者列索引名称的列表
- drop : boolean,
    - default True.当做新的索引，删除原来的列

In [39]:
# Small sales table for the set_index examples: one (month, year, sale) tuple per row
df = pd.DataFrame(
    [(1, 2012, 55), (4, 2014, 40), (7, 2013, 84), (10, 2014, 31)],
    columns=['month', 'year', 'sale'],
)
df

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


In [40]:
# Use the month values as the new row index.
# 'month' is the name of a column of df (not a list name); the result is only
# displayed here, df itself is not reassigned.
df.set_index('month')

Unnamed: 0_level_0,year,sale
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2012,55
4,2014,40
7,2013,84
10,2014,31


In [41]:
# Rebuild the sales table, then promote two of its columns to a two-level row index.
# 'year' and 'month' are column names; 'sale' is the only remaining data column.
df = pd.DataFrame(
    [(1, 2012, 55), (4, 2014, 40), (7, 2013, 84), (10, 2014, 31)],
    columns=['month', 'year', 'sale'],
).set_index(['year', 'month'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2012,1,55
2014,4,40
2013,7,84
2014,10,31


# DataFrame添加,删除列

In [42]:
# 100 random floats; they become the values of column "a" in the next example
a = np.random.random(size=100)
a

array([6.21164276e-01, 2.06950353e-01, 9.22663596e-01, 3.61705680e-01,
       6.24308353e-01, 9.33083298e-01, 5.81129649e-01, 2.63505299e-01,
       2.50191412e-01, 9.53590831e-01, 1.78705267e-01, 7.17756182e-01,
       6.65984986e-01, 1.32715655e-01, 3.65808050e-01, 2.92350434e-01,
       1.49241711e-01, 4.19999481e-01, 1.09025972e-01, 5.92591130e-01,
       6.75859489e-01, 2.37245484e-01, 5.39955239e-01, 6.14506071e-01,
       1.44219606e-02, 3.17670909e-01, 8.42275208e-02, 7.18274381e-01,
       3.56567118e-01, 5.05537721e-01, 2.52019642e-01, 1.65109822e-01,
       5.09614241e-01, 5.30540359e-01, 1.45461083e-01, 6.75506622e-01,
       1.24446966e-01, 1.75112981e-01, 9.46267128e-01, 1.84996055e-01,
       8.20167808e-01, 8.79235577e-01, 1.62877711e-01, 4.84582093e-01,
       8.15105447e-01, 3.29625502e-02, 5.25797622e-01, 4.12571851e-01,
       8.06165430e-01, 9.53947805e-01, 2.43010161e-01, 1.68513194e-01,
       1.96770896e-01, 5.98255121e-01, 1.76488780e-01, 6.40190665e-01,
      

In [43]:
# Single-column frame holding the random values under the column label "a"
pd1 = pd.DataFrame(a, columns=["a"])
pd1

Unnamed: 0,a
0,0.621164
1,0.206950
2,0.922664
3,0.361706
4,0.624308
...,...
95,0.745028
96,0.312955
97,0.520824
98,0.108035


In [44]:
# Consecutive integer ids 1..100 (the stop value 101 is excluded; the step defaults to 1).
# NOTE(review): the name `id` shadows Python's built-in id(); it is kept because
# later cells refer to it.
id = np.arange(1, 101)
id

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

## 插入列 df.insert(loc=第几列, column="列名", value=值)

In [45]:
# Insert the ids as a new column named "id" at position 0 (the leftmost column).
# insert() changes pd1 itself; nothing is assigned back.
pd1.insert(loc=0, column="id", value=id)
pd1

Unnamed: 0,id,a
0,1,0.621164
1,2,0.206950
2,3,0.922664
3,4,0.361706
4,5,0.624308
...,...,...
95,96,0.745028
96,97,0.312955
97,98,0.520824
98,99,0.108035


In [46]:
# Insert the same values again as column "id1" at position 1,
# i.e. between the existing "id" and "a" columns.
pd1.insert(loc=1, column="id1", value=id)
pd1

Unnamed: 0,id,id1,a
0,1,1,0.621164
1,2,2,0.206950
2,3,3,0.922664
3,4,4,0.361706
4,5,5,0.624308
...,...,...,...
95,96,96,0.745028
96,97,97,0.312955
97,98,98,0.520824
98,99,99,0.108035


## 删除列或者行 df.drop
- labels 单个标签或者标签列表
- axis=0 默认 删除index
- axis=1 指定删除列
- inplace=True 修改原数据
- level 针对多重索引 指定级别
- index 指定索引
- columns 指定列名

### 删除行 df.drop(axis=0, index=[行], inplace=True)

In [47]:
# Drop the first two rows, i.e. the rows labelled 0 and 1.
# inplace=True modifies pd1 itself instead of producing a new frame.
pd1.drop(axis=0, index=[0, 1], inplace=True)
pd1

Unnamed: 0,id,id1,a
2,3,3,0.922664
3,4,4,0.361706
4,5,5,0.624308
5,6,6,0.933083
6,7,7,0.581130
...,...,...,...
95,96,96,0.745028
96,97,97,0.312955
97,98,98,0.520824
98,99,99,0.108035


### 删除列 df.drop(axis=1, columns=列名, inplace=True)

In [48]:
# Drop the "id1" column.
# inplace=True modifies pd1 itself instead of producing a new frame.
pd1.drop(axis=1, columns=["id1"], inplace=True)
pd1

Unnamed: 0,id,a
2,3,0.922664
3,4,0.361706
4,5,0.624308
5,6,0.933083
6,7,0.581130
...,...,...
95,96,0.745028
96,97,0.312955
97,98,0.520824
98,99,0.108035
