In [1]:
import numpy as np
import pandas as pd

# Series 详解

## 创建 Series

In [2]:
# A Series built from a list of mixed types (str and int); pandas stores it with dtype object.
s = pd.Series(data=['banana', 42])
print(s, type(s), sep='\n')

0    banana
1        42
dtype: object
<class 'pandas.core.series.Series'>


In [3]:
# A Series whose elements are all strings.
s = pd.Series(data=['banana', 'apple'])
print(s, type(s), sep='\n')

0    banana
1     apple
dtype: object
<class 'pandas.core.series.Series'>


In [4]:
# A Series whose elements are all integers gets an integer dtype.
s = pd.Series(data=[50, 42])
print(s, type(s), sep='\n')

0    50
1    42
dtype: int64
<class 'pandas.core.series.Series'>


In [5]:
# Supply explicit row labels through `index` instead of the default 0..n-1 labels.
s = pd.Series(data=['smart', 18], index=['name', 'age'])
print(s, type(s), sep='\n')

name    smart
age        18
dtype: object
<class 'pandas.core.series.Series'>


## Series 常用操作

In [6]:
# Load the scientists data set from a CSV file into a DataFrame and display it.
scientists = pd.read_csv('./data/scientists.csv')
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920/7/25,1958/4/16,37,Chemist
1,William Gosset,1876-06-13,1937/10/16,61,Statistician
2,Florence Nightingale,1820-05-12,1910/8/13,90,Nurse
3,Marie Curie,1867-11-07,1934/7/4,66,Chemist
4,Rachel Carson,1907/5/27,1964/4/14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,,1954/6/7,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


##### 常用属性和方法

In [7]:
# Pull out the Age column; selecting a single column yields a Series.
age_series = scientists['Age']
print(age_series, type(age_series), sep='\n')

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


In [9]:
# Shape of the Series as a one-element tuple: (number of elements,).
age_series.shape

(8,)

In [10]:
# Total number of elements in the Series.
age_series.size

8

In [11]:
# Row labels of the Series; here a RangeIndex running from 0 to 7.
age_series.index

RangeIndex(start=0, stop=8, step=1)

In [12]:
# The data of the Series as a NumPy array, without the index.
age_series.values

array([37, 61, 90, 66, 56, 45, 41, 77])

In [13]:
# keys() returns the index of the Series, i.e. the same labels as age_series.index.
age_series.keys()

RangeIndex(start=0, stop=8, step=1)

In [14]:
# Label-based lookup: the element whose index label is 1.
age_series.loc[1]

61

In [15]:
# Position-based lookup: the element at integer position 1 (the second element).
# With the default 0..n-1 index this is the same element that .loc[1] returns.
age_series.iloc[1]

61

In [16]:
# Data type of the elements stored in the Series.
age_series.dtypes

dtype('int64')

##### 常用统计方法

In [17]:
# Mean of the ages.
age_series.mean()

59.125

In [18]:
# Largest age.
age_series.max()

90

In [19]:
# Smallest age.
age_series.min()

37

In [20]:
# Standard deviation of the ages (pandas computes the sample standard deviation, ddof=1, by default).
age_series.std()

18.325918413937288

In [21]:
# Take the Occupation column, print it, then tally how often each distinct value occurs.
occupation_series = scientists['Occupation']
print(occupation_series)
occupation_series.value_counts()

0               Chemist
1          Statistician
2                 Nurse
3               Chemist
4             Biologist
5             Physician
6    Computer Scientist
7         Mathematician
Name: Occupation, dtype: object


Chemist               2
Physician             1
Biologist             1
Nurse                 1
Computer Scientist    1
Mathematician         1
Statistician          1
Name: Occupation, dtype: int64

In [22]:
# Count the non-missing elements in the Born column (missing values are not counted).
scientists['Born'].count()

7

In [23]:
# age_series holds numeric data, so describe() reports count, mean, std, min, quartiles and max.
age_series.describe()

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64

In [24]:
# occupation_series holds non-numeric data, so describe() reports count, the number of
# unique values, the most frequent value (top) and how often it occurs (freq).
occupation_series.describe()

count           8
unique          7
top       Chemist
freq            2
Name: Occupation, dtype: object

## bool 索引

In [25]:
# Display the Series that the boolean-indexing examples filter.
age_series

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [26]:
# A list of booleans, one per element, used as a mask: only the elements whose
# corresponding entry is True are kept.
bool_values = [False, True, True, True, False, False, False, True]
age_series[bool_values]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [27]:
# Application: select the ages in age_series that are greater than the mean age.
age_series[age_series>age_series.mean()]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [28]:
# The comparison on its own yields a boolean Series: the mask used for the filtering.
age_series > age_series.mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

## Series 运算

##### Series 和 数值型数据运算

In [29]:
# Display the Series that the arithmetic examples operate on.
age_series

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [30]:
# Adding a scalar applies the addition to every element.
age_series + 100

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64

In [31]:
# Multiplying by a scalar applies the multiplication to every element.
age_series * 2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

##### Series 和 另一 Series 运算

In [32]:
# Addition of a Series with itself: elements with the same index label are added.
age_series + age_series

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [33]:
# Multiplication of a Series with itself: elements with the same index label are multiplied.
age_series * age_series

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64

In [34]:
# A two-element Series (index labels 0 and 1) used to demonstrate index alignment.
new_series = pd.Series([1, 100])

In [35]:
# Display the short Series.
new_series

0      1
1    100
dtype: int64

In [36]:
# Add two Series of different length: values are matched by index label, labels that
# exist in only one operand yield NaN, and the result is converted to float64.
age_series + new_series

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64

In [37]:
# Multiply two Series of different length: values are matched by index label, labels that
# exist in only one operand yield NaN, and the result is converted to float64.
age_series * new_series

0      37.0
1    6100.0
2       NaN
3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
dtype: float64

# DataFrame 详解

## 创建 DataFrame

##### 利用字典创建 DataFrame

In [38]:
# Build a DataFrame from a dict: each key becomes a column name and each list holds
# that column's values; rows receive the default 0..n-1 labels.
peoples = pd.DataFrame(data=dict(
    Name=['Smart', 'David'],
    Occupation=['Teacher', 'IT Engineer'],
    Age=[18, 30],
))
peoples

Unnamed: 0,Name,Occupation,Age
0,Smart,Teacher,18
1,David,IT Engineer,30


In [39]:
# Same dict-based construction, but `columns` fixes the column order and `index`
# supplies the row labels.
peoples = pd.DataFrame(
    data={'Occupation': ['Teacher', 'IT Engineer'], 'Age': [18, 30]},
    index=['Smart', 'David'],
    columns=['Age', 'Occupation'],
)
peoples

Unnamed: 0,Age,Occupation
Smart,18,Teacher
David,30,IT Engineer


##### 利用嵌套列表创建 DataFrame

In [40]:
# Build a DataFrame from a nested list: every inner list is one row; column names and
# row labels are passed separately.
peoples = pd.DataFrame(
    data=[['Teacher', 18], ['IT Engineer', 30]],
    index=['Smart', 'David'],
    columns=['Occupation', 'Age'],
)
peoples

Unnamed: 0,Occupation,Age
Smart,Teacher,18
David,IT Engineer,30


## DataFrame 常用操作

##### 常用属性和方法

In [41]:
# Display the DataFrame whose attributes are inspected in this section.
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920/7/25,1958/4/16,37,Chemist
1,William Gosset,1876-06-13,1937/10/16,61,Statistician
2,Florence Nightingale,1820-05-12,1910/8/13,90,Nurse
3,Marie Curie,1867-11-07,1934/7/4,66,Chemist
4,Rachel Carson,1907/5/27,1964/4/14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,,1954/6/7,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [42]:
# Confirm that the loaded object is a pandas DataFrame.
type(scientists)

pandas.core.frame.DataFrame

In [43]:
# Shape as a tuple: (number of rows, number of columns).
scientists.shape

(8, 5)

In [44]:
# Total number of cells: rows multiplied by columns.
scientists.size

40

In [45]:
# Number of dimensions; a DataFrame is two-dimensional.
scientists.ndim

2

In [46]:
# len() of a DataFrame is its number of rows.
len(scientists)

8

In [47]:
# Row labels of the DataFrame.
scientists.index

RangeIndex(start=0, stop=8, step=1)

In [48]:
# Column labels of the DataFrame.
scientists.columns

Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')

In [49]:
# Data type of each column.
scientists.dtypes

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object

In [50]:
# Print a summary: index range, per-column non-null counts and dtypes, and memory usage.
scientists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        8 non-null      object
 1   Born        7 non-null      object
 2   Died        8 non-null      object
 3   Age         8 non-null      int64 
 4   Occupation  8 non-null      object
dtypes: int64(1), object(4)
memory usage: 448.0+ bytes


In [51]:
# First five rows (the default number for head()).
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920/7/25,1958/4/16,37,Chemist
1,William Gosset,1876-06-13,1937/10/16,61,Statistician
2,Florence Nightingale,1820-05-12,1910/8/13,90,Nurse
3,Marie Curie,1867-11-07,1934/7/4,66,Chemist
4,Rachel Carson,1907/5/27,1964/4/14,56,Biologist


In [52]:
# Last five rows (the default number for tail()).
scientists.tail()

Unnamed: 0,Name,Born,Died,Age,Occupation
3,Marie Curie,1867-11-07,1934/7/4,66,Chemist
4,Rachel Carson,1907/5/27,1964/4/14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,,1954/6/7,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


##### 常用统计方法

In [53]:
# Display the DataFrame that the statistics in this section are computed from.
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920/7/25,1958/4/16,37,Chemist
1,William Gosset,1876-06-13,1937/10/16,61,Statistician
2,Florence Nightingale,1820-05-12,1910/8/13,90,Nurse
3,Marie Curie,1867-11-07,1934/7/4,66,Chemist
4,Rachel Carson,1907/5/27,1964/4/14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,,1954/6/7,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [54]:
# Column-wise maximum (strings are compared lexicographically).
# Born mixes strings with a missing value (NaN), and str and float cannot be ordered
# against each other. Older pandas silently left such a column out of the result,
# whereas pandas 2.x raises TypeError. Dropping the columns that contain missing
# values first makes the omission explicit and gives the same result on every version.
scientists.dropna(axis='columns').max()

Name          William Gosset
Died               1964/4/14
Age                       90
Occupation      Statistician
dtype: object

In [55]:
# Column-wise minimum (strings are compared lexicographically).
# Born mixes strings with a missing value (NaN), and str and float cannot be ordered
# against each other. Older pandas silently left such a column out of the result,
# whereas pandas 2.x raises TypeError. Dropping the columns that contain missing
# values first makes the omission explicit and gives the same result on every version.
scientists.dropna(axis='columns').min()

Name          Alan Turing
Died           1855-02-23
Age                    37
Occupation      Biologist
dtype: object

In [56]:
# Number of non-missing values in each column; Born has one fewer because of its missing entry.
scientists.count()

Name          8
Born          7
Died          8
Age           8
Occupation    8
dtype: int64

In [57]:
# Unlike count(), size counts every cell, missing ones included (rows x columns).
scientists.size

40

In [58]:
# By default describe() summarises only the numeric columns (here just Age).
scientists.describe()

Unnamed: 0,Age
count,8.0
mean,59.125
std,18.325918
min,37.0
25%,44.0
50%,58.5
75%,68.75
max,90.0


In [59]:
# Restrict describe() to the object-dtype (here: string) columns via `include`;
# for these it reports count, unique, top and freq.
scientists.describe(include=[np.object_])

Unnamed: 0,Name,Born,Died,Occupation
count,8,7,8,8
unique,8,7,8,7
top,Johann Gauss,1867-11-07,1958/4/16,Chemist
freq,1,1,1,2


## bool 索引

##### 应用：获取 scientists 中 Age 大于平均值的科学家信息

## DataFrame 运算

##### DataFrame 和 数值型数据运算

In [60]:
# Multiply a DataFrame by a number: the operation is applied to every cell, so numeric
# values are doubled, strings are repeated twice, and the missing Born value stays missing.
scientists * 2

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920/7/251920/7/25,1958/4/161958/4/16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937/10/161937/10/16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910/8/131910/8/13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934/7/41934/7/4,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907/5/271907/5/27,1964/4/141964/4/14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,,1954/6/71954/6/7,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


##### DataFrame 和 另一 DataFrame 运算

In [61]:
# Add a DataFrame to itself: cells with matching row and column labels are combined
# (numbers are added, strings are concatenated).
scientists + scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920/7/251920/7/25,1958/4/161958/4/16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937/10/161937/10/16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910/8/131910/8/13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934/7/41934/7/4,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907/5/271907/5/27,1964/4/141964/4/14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,,1954/6/71954/6/7,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


In [62]:
# Add a DataFrame to one holding only its first four rows: rows are aligned by index
# label, so rows 4-7, which are absent from the second operand, become missing in
# every column, and Age is shown as float.
scientists + scientists[:4]

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920/7/251920/7/25,1958/4/161958/4/16,74.0,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937/10/161937/10/16,122.0,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910/8/131910/8/13,180.0,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934/7/41934/7/4,132.0,ChemistChemist
4,,,,,
5,,,,,
6,,,,,
7,,,,,


## 行标签和列标签操作

### 加载数据后，指定某列数据作为 DataFrame 行标签

In [63]:
# Reload the data from the CSV file and display it.
scientists = pd.read_csv('./data/scientists.csv')
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920/7/25,1958/4/16,37,Chemist
1,William Gosset,1876-06-13,1937/10/16,61,Statistician
2,Florence Nightingale,1820-05-12,1910/8/13,90,Nurse
3,Marie Curie,1867-11-07,1934/7/4,66,Chemist
4,Rachel Carson,1907/5/27,1964/4/14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,,1954/6/7,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [66]:
# Use the values of the Name column as row labels. set_index returns a new DataFrame;
# the original `scientists` is left unchanged.
scientists_df = scientists.set_index('Name')
scientists_df

Unnamed: 0_level_0,Born,Died,Age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,1920/7/25,1958/4/16,37,Chemist
William Gosset,1876-06-13,1937/10/16,61,Statistician
Florence Nightingale,1820-05-12,1910/8/13,90,Nurse
Marie Curie,1867-11-07,1934/7/4,66,Chemist
Rachel Carson,1907/5/27,1964/4/14,56,Biologist
John Snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,,1954/6/7,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [65]:
# Note: reset_index returns a new DataFrame. In the result, Name is an ordinary column
# again and the row labels are the default 0..n-1 integers.
scientists_df.reset_index()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920/7/25,1958/4/16,37,Chemist
1,William Gosset,1876-06-13,1937/10/16,61,Statistician
2,Florence Nightingale,1820-05-12,1910/8/13,90,Nurse
3,Marie Curie,1867-11-07,1934/7/4,66,Chemist
4,Rachel Carson,1907/5/27,1964/4/14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,,1954/6/7,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 加载数据时，指定某列数据作为 DataFrame 行标签

### 加载数据后，修改 DataFrame 行标签和列标签

### 加载数据后，重新索引 DataFrame 数据