## 2. DataFrame
### 2.1 DataFrame的创建

In [71]:
import pandas as pd
import numpy as np

# Build a DataFrame from two Series: each Series becomes one column,
# keyed by the dict key used for it.
s1 = pd.Series(range(1, 6))
s2 = pd.Series(range(6, 11))
df = pd.DataFrame(dict(c1=s1, c2=s2))
print(df, type(df))
# Selecting a single column by label gives back a Series.
c1 = df['c1']
print(c1, type(c1))

   c1  c2
0   1   6
1   2   7
2   3   8
3   4   9
4   5  10 <class 'pandas.core.frame.DataFrame'>
0    1
1    2
2    3
3    4
4    5
Name: c1, dtype: int64 <class 'pandas.core.series.Series'>


In [72]:
# Build a DataFrame from a dict of equal-length lists (one list per column).
records = {
    # 'id': [1, 2, 3, 4, 5],  # example column, intentionally left disabled
    'name': ['Jeremy', 'Sean', 'Jason', 'Amanda', 'Taylor'],
    'age': [28, 30, 10, 15, 30],
    'score': [100.0, 40.5, 10.7, 80.4, 90.2],
}
df = pd.DataFrame(
    records,
    index=list(range(1, 6)),           # explicit row labels 1..5
    columns=['name', 'score', 'age'],  # which columns to keep, and in what order
)
df

Unnamed: 0,name,score,age
1,Jeremy,100.0,28
2,Sean,40.5,30
3,Jason,10.7,10
4,Amanda,80.4,15
5,Taylor,90.2,30


### 2.2 DataFrame的属性

In [73]:
# Show the frame, then its three building blocks: the row index, the column
# labels, and the cell values as an array.
print(df)
row_labels, col_labels, cell_values = df.index, df.columns, df.values
print('行索引:', row_labels)
print('列标签:', col_labels)
print('值:', cell_values)

     name  score  age
1  Jeremy  100.0   28
2    Sean   40.5   30
3   Jason   10.7   10
4  Amanda   80.4   15
5  Taylor   90.2   30
行索引: Int64Index([1, 2, 3, 4, 5], dtype='int64')
列标签: Index(['name', 'score', 'age'], dtype='object')
值: [['Jeremy' 100.0 28]
 ['Sean' 40.5 30]
 ['Jason' 10.7 10]
 ['Amanda' 80.4 15]
 ['Taylor' 90.2 30]]


In [74]:
# Structural metadata of the frame.
n_dims = df.ndim        # number of axes
frame_shape = df.shape  # (rows, columns)
n_cells = df.size       # total number of cells
col_types = df.dtypes   # dtype of every column, returned as a Series

print('维度:', n_dims)
print('形状:', frame_shape)
print('元素个数:', n_cells)
print('元素类型:', col_types)

维度: 2
形状: (5, 3)
元素个数: 15
元素类型: name      object
score    float64
age        int64
dtype: object


In [75]:
# Transpose the frame: row labels become column labels and vice versa.
# The transpose is computed once and reused; a bare `df.T` placed before other
# statements would be evaluated and thrown away, because a notebook cell only
# echoes its last expression.
df_t = df.T
print(df_t.index)    # the original column labels
print(df_t.columns)  # the original row labels
print(df_t.values)   # one row per original column

Index(['name', 'score', 'age'], dtype='object')
Int64Index([1, 2, 3, 4, 5], dtype='int64')
[['Jeremy' 'Sean' 'Jason' 'Amanda' 'Taylor']
 [100.0 40.5 10.7 80.4 90.2]
 [28 30 10 15 30]]


### 2.3 DataFrame元素获取

In [76]:
# Label-based access (.loc / .at) next to position-based access (.iloc / .iat).
# Each pair below addresses the same data in the two ways.
first_label, first_pos = 1, 0

print(df.loc[first_label])         # one row, by index label
print(df.iloc[first_pos])          # one row, by integer position
print(df.loc[:, 'name'])           # one column, by label
print(df.iloc[:, 0])               # one column, by integer position
print(df.at[first_label, 'name'])  # a single cell, by labels
print(df.iat[first_pos, 0])        # a single cell, by integer positions

name     Jeremy
score     100.0
age          28
Name: 1, dtype: object
name     Jeremy
score     100.0
age          28
Name: 1, dtype: object
1    Jeremy
2      Sean
3     Jason
4    Amanda
5    Taylor
Name: name, dtype: object
1    Jeremy
2      Sean
3     Jason
4    Amanda
5    Taylor
Name: name, dtype: object
Jeremy
Jeremy


In [77]:
# Two ways to pull a single column out as a Series.
by_key = df['name']  # bracket access
print(by_key, type(by_key))
by_attr = df.name    # attribute access
print(by_attr, type(by_attr))
# Passing a list of labels selects several columns and returns a DataFrame.
df[['name', 'score']]

1    Jeremy
2      Sean
3     Jason
4    Amanda
5    Taylor
Name: name, dtype: object <class 'pandas.core.series.Series'>
1    Jeremy
2      Sean
3     Jason
4    Amanda
5    Taylor
Name: name, dtype: object <class 'pandas.core.series.Series'>


Unnamed: 0,name,score
1,Jeremy,100.0
2,Sean,40.5
3,Jason,10.7
4,Amanda,80.4
5,Taylor,90.2


In [78]:
# Peek at both ends of the frame.
n_rows = 2
print(df.head(n_rows))  # first n_rows rows
print(df.tail(n_rows))  # last n_rows rows

     name  score  age
1  Jeremy  100.0   28
2    Sean   40.5   30
     name  score  age
4  Amanda   80.4   15
5  Taylor   90.2   30


In [79]:
# Boolean-mask filtering: build one mask per condition, combine them with `&`.
high_score = df.score > 80
under_30 = df.age < 30
print(df[high_score])             # rows with score above 80
print(df[high_score & under_30])  # ...that are also younger than 30

     name  score  age
1  Jeremy  100.0   28
4  Amanda   80.4   15
5  Taylor   90.2   30
     name  score  age
1  Jeremy  100.0   28
4  Amanda   80.4   15


In [80]:
# Draw n rows at random. `random_state` pins the random draw so the cell
# produces the same sample on every re-run; drop it to get a fresh sample
# each time.
print(df.sample(n=3, random_state=42))

     name  score  age
5  Taylor   90.2   30
4  Amanda   80.4   15
3   Jason   10.7   10


In [81]:
# Rebuild `df` with a repeated row ('Sean') and repeated score values so the
# membership / duplicate helpers in the following cells have something to find.
df = pd.DataFrame(
    {
        'name': ['Jeremy', 'Sean', 'Sean', 'Jason', 'Amanda', 'Taylor'],
        'age': [28, 30, 30, 10, 15, 30],
        'score': [100.0, 40.5, 40.5, 10.7, 90.2, 90.2]
    },
    index=list(range(1, 7)),  # explicit row labels 1..6
    columns=['name', 'score', 'age']  # column selection and order
)
# No bare `df` here: an expression that is not the last statement of a cell
# is evaluated and discarded, so it would display nothing.
print(df.isin(['Jeremy', 90.2]))  # element-wise: is the cell one of the given values?
print(df.isna())  # element-wise: is the cell a missing value?

    name  score    age
1   True  False  False
2  False  False  False
3  False  False  False
4  False  False  False
5  False   True  False
6  False   True  False
    name  score    age
1  False  False  False
2  False  False  False
3  False  False  False
4  False  False  False
5  False  False  False
6  False  False  False


In [88]:
# Descriptive statistics of the score column.
score = df['score']
print(score.sum())     # total
print(score.max())     # largest value
print(score.min())     # smallest value
print(score.mean())    # arithmetic mean
print(score.median())  # median
print(score.mode())    # most frequent value(s), returned as a Series
print(score.var())     # variance
print(score.std())     # standard deviation

372.09999999999997
100.0
10.7
62.01666666666666
65.35
0    40.5
1    90.2
dtype: float64
1318.1336666666668
36.306110596794404


In [90]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,score,age
count,6.0,6.0
mean,62.016667,23.833333
std,36.306111,8.953584
min,10.7,10.0
25%,40.5,18.25
50%,65.35,29.0
75%,90.2,30.0
max,100.0,30.0


In [95]:
# Counting and de-duplication helpers; none of them modifies `df`.
non_missing_per_column = df.count()  # non-missing cells in each column
row_frequencies = df.value_counts()  # how many times each distinct row occurs
unique_rows = df.drop_duplicates()   # frame with repeated rows removed
is_repeat = df.duplicated()          # True for rows that repeat an earlier row

print(non_missing_per_column)
print(row_frequencies)
print(unique_rows)
print(is_repeat)

name     6
score    6
age      6
dtype: int64
name    score  age
Sean    40.5   30     2
Amanda  90.2   15     1
Jason   10.7   10     1
Jeremy  100.0  28     1
Taylor  90.2   30     1
dtype: int64
     name  score  age
1  Jeremy  100.0   28
2    Sean   40.5   30
4   Jason   10.7   10
5  Amanda   90.2   15
6  Taylor   90.2   30
1    False
2    False
3     True
4    False
5    False
6    False
dtype: bool


In [101]:
# Running aggregates, accumulated down each column.
running_total = df.cumsum()             # note: the string column `name` gets concatenated
running_max = df.cummax(axis='index')   # axis='index' is the same as axis=0
print(running_total)
print(running_max)

                              name  score  age
1                           Jeremy  100.0   28
2                       JeremySean  140.5   58
3                   JeremySeanSean  181.0   88
4              JeremySeanSeanJason  191.7   98
5        JeremySeanSeanJasonAmanda  281.9  113
6  JeremySeanSeanJasonAmandaTaylor  372.1  143
     name  score  age
1  Jeremy  100.0   28
2    Sean  100.0   30
3    Sean  100.0   30
4    Sean  100.0   30
5    Sean  100.0   30
6  Taylor  100.0   30


In [103]:
# Sort by row label, descending.
by_label_desc = df.sort_index(ascending=False)
print(by_label_desc)

# Sort by column values: score ascending, ties broken by age descending.
sort_keys = ['score', 'age']
by_score_then_age = df.sort_values(sort_keys, ascending=[True, False])
print(by_score_then_age)

     name  score  age
6  Taylor   90.2   30
5  Amanda   90.2   15
4   Jason   10.7   10
3    Sean   40.5   30
2    Sean   40.5   30
1  Jeremy  100.0   28
     name  score  age
4   Jason   10.7   10
2    Sean   40.5   30
3    Sean   40.5   30
6  Taylor   90.2   30
5  Amanda   90.2   15
1  Jeremy  100.0   28


In [107]:
# Top / bottom n rows ranked by score, with age as the tie-breaker column.
rank_cols = ['score', 'age']
top_two = df.nlargest(2, rank_cols)
bottom_two = df.nsmallest(2, rank_cols)
print(top_two)
print(bottom_two)

     name  score  age
1  Jeremy  100.0   28
6  Taylor   90.2   30
    name  score  age
4  Jason   10.7   10
2   Sean   40.5   30


### 2.4 DataFrame案例

In [126]:
"""
已知班级学生成绩. 计算每位学生的总分和平均分、找出数学成绩高于90分或英语成绩
高于85分的学生、按总分从高到低排序，并输出前三名学生
"""
data = {
    'name': ['z3', 'l4', 'w5', 'z6', 'q7'],
    'math': [85, 92, 78, 88, 95],
    'english': [90, 88, 85, 92, 80],
    'physics': [75, 80, 88, 85, 90]
}
scores = pd.DataFrame(data)
print(scores)
scores['total'] = scores[['math', 'english', 'physics']].sum(axis=1)
scores['mean'] = scores[['math', 'english', 'physics']].mean(axis=1)
print(scores)
print(scores[(scores.math > 90) | (scores.english > 85)])
# print(scores.sort_values('total', ascending=False)[['name', 'total']].head(3))
print(scores.nlargest(3, columns=['total'])[['name', 'total']])

  name  math  english  physics
0   z3    85       90       75
1   l4    92       88       80
2   w5    78       85       88
3   z6    88       92       85
4   q7    95       80       90
  name  math  english  physics  total       mean
0   z3    85       90       75    250  83.333333
1   l4    92       88       80    260  86.666667
2   w5    78       85       88    251  83.666667
3   z6    88       92       85    265  88.333333
4   q7    95       80       90    265  88.333333
  name  math  english  physics  total       mean
0   z3    85       90       75    250  83.333333
1   l4    92       88       80    260  86.666667
3   z6    88       92       85    265  88.333333
4   q7    95       80       90    265  88.333333
  name  total
3   z6    265
4   q7    265
1   l4    260


In [132]:
"""
已知公司销售数据. 计算每种产品的总销售额(销售额 = 单价 * 销量)、
找出销售额最高的产品、按销售额从高到低排序，并输出所有产品信息
"""
data = {
    'p_name': ['A', 'B', 'C', 'D'],
    'price': [100, 150, 200, 120],
    's_num': [50, 30, 20, 40]
}
df = pd.DataFrame(data)
print(df)
df['total'] = df['price'] * df['s_num']
print(df)
print(df.nlargest(1, columns=['total']))
print(df.sort_values(by=['total'], ascending=False))

  p_name  price  s_num
0      A    100     50
1      B    150     30
2      C    200     20
3      D    120     40
  p_name  price  s_num  total
0      A    100     50   5000
1      B    150     30   4500
2      C    200     20   4000
3      D    120     40   4800
  p_name  price  s_num  total
0      A    100     50   5000
  p_name  price  s_num  total
0      A    100     50   5000
3      D    120     40   4800
1      B    150     30   4500
2      C    200     20   4000


In [140]:
"""
已知电商平台用户行为数据. 计算每位用户总消费金额(消费金额 = 商品单价 * 购买数量)、
找出消费金额最高的用户，并输出其所有信息、计算所有用户的平均消费金额(保留两位小数)、
统计电子产品的总购买数量
"""
data = {
    'uid': [101, 102, 103, 104, 105],
    'uname': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'category': ['e_device', 'dress', 'e_device', 'furniture', 'dress'],
    'price': [1200, 300, 800, 150, 200],
    'num': [1, 3, 2, 5, 4]
}
df = pd.DataFrame(data)
print(df)
df['total'] = df['price'] * df['num']
print(df)
df.nlargest(1, columns=['total'])
print(df.total.mean())
print(df[df['category'] == 'e_device'].num.sum())

   uid    uname   category  price  num
0  101    Alice   e_device   1200    1
1  102      Bob      dress    300    3
2  103  Charlie   e_device    800    2
3  104    David  furniture    150    5
4  105      Eve      dress    200    4
   uid    uname   category  price  num  total
0  101    Alice   e_device   1200    1   1200
1  102      Bob      dress    300    3    900
2  103  Charlie   e_device    800    2   1600
3  104    David  furniture    150    5    750
4  105      Eve      dress    200    4    800
1050.0
3
