# DataFrame

## 1. 基本操作

In [None]:
# 构造方法
import pandas as pd

# Series ctor : 字典+Series， {列1：Series_of_列1}
l_id = pd.Series(["01", "02", "03", "04", "05"], index=['xiaoming', 'xiaohong', 'xiaojie', 'xiaohua', 'xiaoli'])
l_class =  pd.Series(["c2", "c1", "c2", "c3", "c1"], index=['xiaoming', 'xiaohong', 'xiaojie', 'xiaohua', 'xiaoli'])
l_grade =  pd.Series([92, 67, 70, 88, 76], index=['xiaoming', 'xiaohong', 'xiaojie', 'xiaohua', 'xiaoli'])

df = pd.DataFrame({'id':l_id, 'class':l_class, 'grade':l_grade})
df

# nested dict {列名1：{行名1：值}，列名2：{行名2：值}}
df2 = pd.DataFrame({'id': {'xiaoming':1, 'xiaohong':2},
                    'class': {'xiaoming':2, 'xiaohong':1}}) 
df2

Unnamed: 0,id,class
xiaoming,1,2
xiaohong,2,1


In [None]:
# index, columns, values, T
df.index  # index 为属性, 不是函数,因此没有.index()
df.columns
df.values
df.T # 转置
df.head(10)

Unnamed: 0,id,class,grade
xiaoming,1,c2,92
xiaohong,2,c1,67
xiaojie,3,c2,70
xiaohua,4,c3,88
xiaoli,5,c1,76


In [100]:
# 提取列
df['id']
df.id
df[['id', 'grade']] # 多列，
# 切片对象只能放在 .loc / .iloc 或者 Series/单层 [] 中使用。

Unnamed: 0,id,grade
xiaoming,1,92
xiaohong,2,67
xiaojie,3,70
xiaohua,4,88
xiaoli,5,76


In [None]:
# 提取行
df.loc['xiaoming']
df.iloc[3]

# 提取多行
df.loc[['xiaoming', 'xiaoli']] 
df.iloc[[3, 1]]

# slice
df.loc['xiaoming':'xiaoli']
df.iloc[1:3]

# 提取某块值 iloc or loc
df.iloc[2, 0] # 某个值: 行，列
df.iloc[1:3, 1:3] 
df.iloc[1, :]  # 某行全列
print(df.iloc[[1,3], 0:2])

df.loc[:, 'id':'class']

          id class
xiaohong  02    c1
xiaohua   04    c3


Unnamed: 0,id,class
xiaoming,1,c2
xiaohong,2,c1
xiaojie,3,c2
xiaohua,4,c3
xiaoli,5,c1


In [26]:
df

Unnamed: 0,id,class,grade
xiaoming,1,c2,92
xiaohong,2,c1,67
xiaojie,3,c2,70
xiaohua,4,c3,88
xiaoli,5,c1,76


In [62]:
# 筛选符合条件的行 : df[df[列名] > 40]
df[df['grade'] > 40]  

Unnamed: 0,id,class,grade
xiaoming,1,c2,92
xiaohong,2,c1,67
xiaojie,3,c2,70
xiaohua,4,c3,88
xiaoli,5,c1,76


In [None]:
df["city"].unique() #returns the unique values in city
df['city'].value_counts() #counts the records for each city
df['category_0'].nunique() #counts the non-null unique values in category_0

str 方法: `.str.cat`, `.str.contains`, `.isin`

In [None]:
# Add a new “categories” column that combines “category_0” and “category_1” as a commaseparated list
df["categories"] = df["category_0"].str.cat(df["category_1"], sep=',’) 
#concatenates the string value of category_0 with category_1, separated by a ‘,’

# Now we can look up businesses based on the single “categories” column! 
df[df["categories"].str.contains("Pizza")]


bar_rest = df["category_0"].isin(["Bars", "Restaurants"])

### Excersice

In [31]:
import pandas as pd 
lst = ["001", "002", "003", "004", "005", '006']
name = pd.Series(['小陈', '小李', "小王", "小张", "小赵", "小周"], index=["001", "002", "003", "004", "005", '006'])
gender = pd.Series(['女', '女', "男", "男", "女", "男"], index=lst[::-1])
height = pd.Series([172.5, 168, 178.2, 181.3, 161.7], index=lst[:-1])

In [39]:
students = pd.DataFrame({"姓名":name, "性别":gender, "身高":height})
print(students.index, students.values)
students.T

Index(['001', '002', '003', '004', '005', '006'], dtype='object') [['小陈' '男' 172.5]
 ['小李' '女' 168.0]
 ['小王' '男' 178.2]
 ['小张' '男' 181.3]
 ['小赵' '女' 161.7]
 ['小周' '女' nan]]


Unnamed: 0,001,002,003,004,005,006
姓名,小陈,小李,小王,小张,小赵,小周
性别,男,女,男,男,女,女
身高,172.5,168.0,178.2,181.3,161.7,


In [45]:
students

Unnamed: 0,姓名,性别,身高
1,小陈,男,172.5
2,小李,女,168.0
3,小王,男,178.2
4,小张,男,181.3
5,小赵,女,161.7
6,小周,女,


In [59]:
students.loc[['003', '005']]

Unnamed: 0,姓名,性别,身高
3,小王,男,178.2
5,小赵,女,161.7


In [56]:
students.身高
students[['身高', '性别']]
students.loc["003":"005"]
students.loc["005", '身高']
students.loc[["003", "005"], ["姓名", '身高']]


Unnamed: 0,姓名,身高
3,小王,178.2
5,小赵,161.7


In [60]:
students[(students['身高'] > 165) & (students['性别'] == "女") ]

Unnamed: 0,姓名,性别,身高
2,小李,女,168.0


In [61]:
students.head(5)

Unnamed: 0,姓名,性别,身高
1,小陈,男,172.5
2,小李,女,168.0
3,小王,男,178.2
4,小张,男,181.3
5,小赵,女,161.7


## 2. 更多操作  
替换，更新，删除

In [None]:
# rename
# 只改几个列名，使用 dict 映射旧 → 新
df = df.rename(columns={'旧列名1': '新列名1',
                        '旧列名2': '新列名2'})
# -------------------------------------------------
# 若想原地修改
df.rename(columns={'旧列名1': '新列名1'}, inplace=True)

# -------------------------------------------------
# 也可以传入函数，对所有列名统一处理
df = df.rename(columns=lambda x: x.strip().lower().replace(' ', '_'))


# 生成映射字典
mapping = dict(zip(['A','B','C'], ['a','b','c']))   

In [63]:
students

Unnamed: 0,姓名,性别,身高
1,小陈,男,172.5
2,小李,女,168.0
3,小王,男,178.2
4,小张,男,181.3
5,小赵,女,161.7
6,小周,女,


In [71]:
'''
更新列
'''
# 替换
students["身高"] = [0,0,0,0,0,0] # 赋值列表则为默认顺序
# 或者
students["身高"] = pd.Series([1,0,0,0,0,1], index=["001", "002", "003", "004", "005", '006'])

# 新增列
students["班级"] = [1,2,3,4,2,1]

Unnamed: 0,姓名,性别,身高,班级
1,小陈,男,1,1
2,小李,女,0,2
3,小王,男,0,3
4,小张,男,0,4
5,小赵,女,0,2
6,小周,女,1,1


In [73]:
'''
更新列
'''
# list按顺序
students.loc['005'] = ["小赵", '女', 162.7, '1'] 
# or
students.loc['005'] =pd.Series(["小赵", '女', 162.7, '1'] , index=['姓名', '性别', '身高', '班级'])

# 添加行只能用loc

Unnamed: 0,姓名,性别,身高,班级
1,小陈,男,1.0,1
2,小李,女,0.0,2
3,小王,男,0.0,3
4,小张,男,0.0,4
5,小赵,女,162.7,1
6,小周,女,1.0,1


In [None]:
# 删除列，行(不改变原df)
new_students = students.drop('003') # 默认删除单行
new_students = students.drop(['身高', '性别'], axis=1) # 删除多列

### df之间操作  
列，行对齐操作


In [None]:
df.sub(df2, fill_value=0)
df.div(df2, fill_value=0)
df.mul(df2, fill_value=0)

# df and Series：series索引与df列名操作

### Exercise

In [140]:
students_data = pd.DataFrame({'001': {"姓名":'小陈', '考试1':85, '考试2':95, '考试3':92}, '002': {"姓名":'小李', '考试1':91, '考试2':92, '考试3':94}, '003': {"姓名":'小王', '考试1':86, '考试2':81, '考试3':89}, '004': {"姓名":'小张', '考试1':79, '考试2':89, '考试3':95}, '005': {"姓名":'小赵', '考试1':96, '考试2':91, '考试3':91}, '006': {"姓名":'小周', '考试1':81, '考试2':89, '考试3':92} } )
students = pd.DataFrame(students_data).T
students

Unnamed: 0,姓名,考试1,考试2,考试3
1,小陈,85,95,92
2,小李,91,92,94
3,小王,86,81,89
4,小张,79,89,95
5,小赵,96,91,91
6,小周,81,89,92


In [141]:
# add a column
students['考试4'] = [72, 69, 79, 83, 82, 76]
students.loc['007'] = ['小杨', 79, 82, 81, 69]
students

Unnamed: 0,姓名,考试1,考试2,考试3,考试4
1,小陈,85,95,92,72
2,小李,91,92,94,69
3,小王,86,81,89,79
4,小张,79,89,95,83
5,小赵,96,91,91,82
6,小周,81,89,92,76
7,小杨,79,82,81,69


In [133]:
students_new = students.drop(['006', '007'])
students_new2 = students.drop(['考试2', '考试2'], axis = 1)
students_new2

Unnamed: 0,姓名,考试1,考试3,考试4
1,小陈,85,92,72
2,小李,91,94,69
3,小王,86,89,79
4,小张,79,95,83
5,小赵,96,91,82
6,小周,81,92,76
7,小杨,79,81,69


In [134]:
bonus = pd.Series({'考试1':2, '考试2':3, '考试3':2, "考试4": 5})
new_students = students[['考试1','考试2', '考试3', '考试4']] + bonus
new_students

Unnamed: 0,考试1,考试2,考试3,考试4
1,87,98,94,77
2,93,95,96,74
3,88,84,91,84
4,81,92,97,88
5,98,94,93,87
6,83,92,94,81
7,81,85,83,74


In [142]:
students['考试4'] += 10
students

Unnamed: 0,姓名,考试1,考试2,考试3,考试4
1,小陈,85,95,92,82
2,小李,91,92,94,79
3,小王,86,81,89,89
4,小张,79,89,95,93
5,小赵,96,91,91,92
6,小周,81,89,92,86
7,小杨,79,82,81,79


## 3. apply, applymap, describe, max, mean
- 默认axis = 0  
    axis 0 means alone vertical  
    axis 1 means horizontal  
- apply(func): func处理每列
- applymap(func): func处理每个元素
- **列类型转换:** astype('type')


In [143]:
students.mean() # 列均值

  students.mean()


考试1    85.285714
考试2    88.428571
考试3    90.571429
考试4    85.714286
dtype: float64

## 4. tip

以列表插入一条数据

one_piece = pd.DataFrame([5600, 4, 2, 2, 'no', 'no', 'yes', 'yes', 'no', 2, 'yes', 'semi-furnished'], df.drop('price', axis=1).iloc[1].index).T
one_piece

### Exercise

In [1]:
students_data = pd.DataFrame({'001': {"姓名":'小陈', '考试1':85, '考试2':95, '考试3':92}, '002': {"姓名":'小李', '考试1':91, '考试2':92, '考试3':94}, '003': {"姓名":'小王', '考试1':86, '考试2':81, '考试3':89}, '004': {"姓名":'小张', '考试1':79, '考试2':89, '考试3':95}, '005': {"姓名":'小赵', '考试1':96, '考试2':91, '考试3':91}, '006': {"姓名":'小周', '考试1':81, '考试2':89, '考试3':92} } )
students = pd.DataFrame(students_data).T

NameError: name 'pd' is not defined

In [None]:
students

Unnamed: 0,姓名,考试1,考试2,考试3
1,小陈,85,95,92
2,小李,91,92,94
3,小王,86,81,89
4,小张,79,89,95
5,小赵,96,91,91
6,小周,81,89,92


In [None]:
# 返回每个学生平均值
avg_s = students.loc[:, '考试1':'考试3'].mean(axis=1)
name_s = students['姓名']
students_avg = pd.DataFrame({"姓名": name, "平均分":avg_s})
students_avg

Unnamed: 0,姓名,平均分
1,小陈,90.666667
2,小李,92.333333
3,小王,85.333333
4,小张,87.666667
5,小赵,92.666667
6,小周,87.333333


In [None]:
# apply lambda: 输出第二高组成的Series
import numpy as np # np.sort
second_scores = students.loc[:, '考试1':'考试3'].apply(lambda x:np.sort(x)[-2])
second_scores

考试1    91
考试2    92
考试3    94
dtype: int64

In [None]:
def letter_grade(score):        
    if score >= 95:
        return 'A'
    elif score >= 90:
        return 'B'
    return 'C'

letter_df = students.loc[:, '考试1':'考试3'].applymap(letter_grade)
letter_df

Unnamed: 0,考试1,考试2,考试3
1,C,A,B
2,B,B,B
3,C,C,C
4,C,C,A
5,A,B,B
6,C,C,B


In [None]:
students.describe()
students['考试1']

001    85
002    91
003    86
004    79
005    96
006    81
Name: 考试1, dtype: object

In [None]:
students['考试1'] = students['考试1'].astype('int')
students['考试2'] = students['考试2'].astype('int')
students['考试3'] = students['考试3'].astype('int')
students.describe()

Unnamed: 0,考试1,考试2,考试3
count,6.0,6.0,6.0
mean,86.333333,89.5,92.166667
std,6.314006,4.722288,2.136976
min,79.0,81.0,89.0
25%,82.0,89.0,91.25
50%,85.5,90.0,92.0
75%,89.75,91.75,93.5
max,96.0,95.0,95.0
