In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# DataFrame创建

参数：
- data:  行列数据
- index：行标签。如果没有传入该参数，则默认自动创建一个从 0 到 N-1 的整数索引（N 为行数）
- columns：列标签。如果没有传入该参数，则默认自动创建一个从 0 到 N-1 的整数索引（N 为列数）
- dtype：数据的类型

## 指定内容创建

In [2]:
# Build a 2x3 DataFrame of random floats; no index/columns are passed,
# so both axes get default integer labels (see the output below).
pd.DataFrame(np.random.rand(2, 3))

Unnamed: 0,0,1,2
0,0.848998,0.865933,0.713458
1,0.089094,0.157259,0.451848


## 通过已有数据创建

In [3]:
# Simulated scores for 10 students across 5 subjects:
# random integers drawn from [40, 100).
score = np.random.randint(low=40, high=100, size=(10, 5))
score

array([[67, 91, 84, 45, 96],
       [41, 46, 60, 97, 70],
       [51, 49, 60, 97, 71],
       [49, 95, 96, 53, 52],
       [75, 97, 95, 75, 48],
       [60, 87, 45, 81, 64],
       [46, 55, 40, 56, 73],
       [50, 69, 88, 87, 89],
       [48, 71, 67, 68, 55],
       [62, 48, 68, 61, 65]])

In [4]:
# Wrap the score array in a DataFrame without explicit labels;
# rows and columns fall back to default integer labels.
pd.DataFrame(score)

Unnamed: 0,0,1,2,3,4
0,67,91,84,45,96
1,41,46,60,97,70
2,51,49,60,97,71
3,49,95,96,53,52
4,75,97,95,75,48
5,60,87,45,81,64
6,46,55,40,56,73
7,50,69,88,87,89
8,48,71,67,68,55
9,62,48,68,61,65


## 指定column和index

In [5]:
# Column labels: one per subject.
subjects = ["Chinese", "Math", "English", "Sciencs", "Sports"]
# Row labels: "stu_1" ... "stu_N", one per row of the score array.
students = [f"stu_{n}" for n in range(1, score.shape[0] + 1)]
# Build the DataFrame with both row and column labels attached.
pd.DataFrame(data=score, index=students, columns=subjects)

Unnamed: 0,Chinese,Math,English,Sciencs,Sports
stu_1,67,91,84,45,96
stu_2,41,46,60,97,70
stu_3,51,49,60,97,71
stu_4,49,95,96,53,52
stu_5,75,97,95,75,48
stu_6,60,87,45,81,64
stu_7,46,55,40,56,73
stu_8,50,69,88,87,89
stu_9,48,71,67,68,55
stu_10,62,48,68,61,65


### index和column可以重复

In [9]:
# Labels may repeat on either axis: column "a" is used twice
# and row label 1 is used twice.
repeatd_df = pd.DataFrame(
    data=np.random.randint(1, 10, size=(4, 2)),
    index=[1, 1, 2, 3],
    columns=["a", "a"],
)
repeatd_df

Unnamed: 0,a,a.1
1,2,4
1,1,5
2,8,9
3,8,6


#### df.columns.is_unique 判断column是否重复

In [11]:
repeatd_df.columns.is_unique
# False means the column labels contain duplicates

False

#### df.index.is_unique 判断index是否重复

In [12]:
repeatd_df.index.is_unique
# False means the row labels contain duplicates

False

## 通过字典创建

In [56]:
# Build a DataFrame from a dict: each key becomes a column label and
# its array supplies that column's values (see the output below).
pd.DataFrame({"x": np.random.rand(5), "y": np.random.randint(1, 10, 5)})

Unnamed: 0,x,y
0,0.995877,6
1,0.514333,5
2,0.102707,8
3,0.892692,5
4,0.308904,7


# 基本属性

In [57]:
# Column labels: one per subject.
subjects = ["Chinese", "Math", "English", "Sciencs", "Sports"]
# Row labels: "stu_1" ... "stu_N", one per row of the score array.
students = [f"stu_{n}" for n in range(1, score.shape[0] + 1)]
# Labelled score table (row and column labels) used by the cells that follow.
score_df = pd.DataFrame(data=score, index=students, columns=subjects)
score_df

Unnamed: 0,Chinese,Math,English,Sciencs,Sports
stu_1,67,76,45,54,72
stu_2,87,51,50,72,87
stu_3,81,71,52,57,56
stu_4,50,62,84,74,79
stu_5,41,59,87,94,94
stu_6,71,97,96,76,67
stu_7,54,40,58,65,63
stu_8,85,81,57,43,60
stu_9,48,74,44,87,58
stu_10,66,92,64,82,54


## df.values 返回数据数组

In [58]:
# The underlying data as a NumPy array, without the row/column labels.
score_df.values

array([[67, 76, 45, 54, 72],
       [87, 51, 50, 72, 87],
       [81, 71, 52, 57, 56],
       [50, 62, 84, 74, 79],
       [41, 59, 87, 94, 94],
       [71, 97, 96, 76, 67],
       [54, 40, 58, 65, 63],
       [85, 81, 57, 43, 60],
       [48, 74, 44, 87, 58],
       [66, 92, 64, 82, 54]])

## df.shape

In [59]:
# (number of rows, number of columns)
score_df.shape

(10, 5)

## df.columns

In [60]:
# Column labels, returned as a pandas Index.
score_df.columns

Index(['Chinese', 'Math', 'English', 'Sciencs', 'Sports'], dtype='object')

In [61]:
# An Index can be sliced by position: labels at positions 2 through 4.
score_df.columns[2:5]

Index(['English', 'Sciencs', 'Sports'], dtype='object')

### df.columns.get_indexer(["column_name"]) 返回column的数字索引

In [71]:
# Look up the integer positions of the given column labels.
score_df.columns.get_indexer(["Chinese", "Sciencs"])

array([0, 3], dtype=int64)

In [74]:
# A column label that does not exist yields -1
score_df.columns.get_indexer(["Chinesesss"])

array([-1], dtype=int64)

## df.index

In [62]:
# Row labels, returned as a pandas Index.
score_df.index

Index(['stu_1', 'stu_2', 'stu_3', 'stu_4', 'stu_5', 'stu_6', 'stu_7', 'stu_8',
       'stu_9', 'stu_10'],
      dtype='object')

In [63]:
# Positional slice of the row labels: positions 3 through 5.
score_df.index[3:6]

Index(['stu_4', 'stu_5', 'stu_6'], dtype='object')

### df.index.get_indexer(["index_name"]) 返回index的数字索引

In [73]:
# Look up the integer positions of the given row labels.
score_df.index.get_indexer(["stu_3", "stu_6"])

array([2, 5], dtype=int64)

In [75]:
# A row label that does not exist yields -1
score_df.index.get_indexer(["stu_31"])

array([-1], dtype=int64)

## df.dtypes

In [64]:
# The dtype of each column, keyed by column label.
score_df.dtypes

Chinese    int32
Math       int32
English    int32
Sciencs    int32
Sports     int32
dtype: object

## df.attrs

In [65]:
# attrs is a dict attached to the DataFrame; it is empty for this frame
# (see the output below) since nothing has been stored in it.
score_df.attrs

{}

## df.axes

In [66]:
# A list holding both axis Index objects: row labels first, then column labels.
score_df.axes

[Index(['stu_1', 'stu_2', 'stu_3', 'stu_4', 'stu_5', 'stu_6', 'stu_7', 'stu_8',
        'stu_9', 'stu_10'],
       dtype='object'),
 Index(['Chinese', 'Math', 'English', 'Sciencs', 'Sports'], dtype='object')]

## df.T 转置数据

In [67]:
# Transpose: subjects become the rows and students become the columns.
score_df.T

Unnamed: 0,stu_1,stu_2,stu_3,stu_4,stu_5,stu_6,stu_7,stu_8,stu_9,stu_10
Chinese,67,87,81,50,41,71,54,85,48,66
Math,76,51,71,62,59,97,40,81,74,92
English,45,50,52,84,87,96,58,57,44,64
Sciencs,54,72,57,74,94,76,65,43,87,82
Sports,72,87,56,79,94,67,63,60,58,54


# 基本方法

## df.info() 字段描述

In [68]:
# Print a summary of the frame: index range, per-column non-null counts
# and dtypes, and memory usage.
score_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, stu_1 to stu_10
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Chinese  10 non-null     int32
 1   Math     10 non-null     int32
 2   English  10 non-null     int32
 3   Sciencs  10 non-null     int32
 4   Sports   10 non-null     int32
dtypes: int32(5)
memory usage: 580.0+ bytes


## df.describe() 字段数据统计

In [69]:
# Per-column summary statistics: count, mean, std, min, quartiles, and max.
score_df.describe()

Unnamed: 0,Chinese,Math,English,Sciencs,Sports
count,10.0,10.0,10.0,10.0,10.0
mean,65.0,70.3,63.7,70.4,69.0
std,16.302692,17.739159,18.672916,15.784662,13.71941
min,41.0,40.0,44.0,43.0,54.0
25%,51.0,59.75,50.5,59.0,58.5
50%,66.5,72.5,57.5,73.0,65.0
75%,78.5,79.75,79.0,80.5,77.25
max,87.0,97.0,96.0,94.0,94.0
