In [1]:
import pandas as pd

# Series 详解

## 创建 Series

In [2]:
# Build a Series from a list that mixes a string and an integer;
# the printed dtype for such mixed content is `object`.
s = pd.Series(data=['banana', 42])
# Show the Series and then its Python type, one per line.
print(s, type(s), sep='\n')

0    banana
1        42
dtype: object
<class 'pandas.core.series.Series'>


In [3]:
# Build a Series whose elements are all strings.
s = pd.Series(data=['banana', 'apple'])
# Show the Series and then its Python type, one per line.
print(s, type(s), sep='\n')

0    banana
1     apple
dtype: object
<class 'pandas.core.series.Series'>


In [4]:
# Build a Series whose elements are all integers; the printed dtype is int64.
s = pd.Series(data=[50, 42])
# Show the Series and then its Python type, one per line.
print(s, type(s), sep='\n')

0    50
1    42
dtype: int64
<class 'pandas.core.series.Series'>


In [5]:
# Build a Series with explicit row labels ('name', 'age') instead of the
# default 0, 1, ... positions.
s = pd.Series(data=['smart', 18], index=['name', 'age'])
# Show the Series and then its Python type, one per line.
print(s, type(s), sep='\n')

name    smart
age        18
dtype: object
<class 'pandas.core.series.Series'>


## Series 常用操作

In [6]:
# Load the scientists dataset from a CSV file located relative to the
# notebook's working directory.
scientists = pd.read_csv('./data/scientists.csv')
# Display the whole DataFrame as the cell's result.
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920/7/25,1958/4/16,37,Chemist
1,William Gosset,1876-06-13,1937/10/16,61,Statistician
2,Florence Nightingale,1820-05-12,1910/8/13,90,Nurse
3,Marie Curie,1867-11-07,1934/7/4,66,Chemist
4,Rachel Carson,1907/5/27,1964/4/14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,,1954/6/7,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


##### 常用属性和方法

In [7]:
# Select the Age column; indexing the DataFrame with a single column name
# gives back a Series (confirmed by the printed type).
age_series = scientists['Age']
# Show the column and then its Python type, one per line.
print(age_series, type(age_series), sep='\n')

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


In [9]:
# Shape of the Series as a tuple; a Series is one-dimensional, so it has a single entry.
age_series.shape

(8,)

In [10]:
# Total number of elements in the Series.
age_series.size

8

In [11]:
# Row labels of the Series; here the default RangeIndex running from 0 to 7.
age_series.index

RangeIndex(start=0, stop=8, step=1)

In [12]:
# The underlying data without the index, returned as an array.
age_series.values

array([37, 61, 90, 66, 56, 45, 41, 77])

In [13]:
# keys() returns the Series' index; the result is the same RangeIndex that .index shows.
age_series.keys()

RangeIndex(start=0, stop=8, step=1)

In [14]:
# Label-based lookup: fetch the element whose index label is 1.
age_series.loc[1]

61

In [15]:
# Position-based lookup: fetch the element at integer position 1 (the second one).
# With the default 0..7 index this is the same element as the label lookup .loc[1].
age_series.iloc[1]

61

In [16]:
# Data type of the Series' elements.
age_series.dtypes

dtype('int64')

##### 常用统计方法

In [17]:
# Compute the mean of the ages.
age_series.mean()

59.125

In [18]:
# Compute the maximum age.
age_series.max()

90

In [19]:
# Compute the minimum age.
age_series.min()

37

In [20]:
# Compute the standard deviation of the ages.
# NOTE: the value shown is the sample standard deviation (divides by n - 1,
# pandas' default ddof=1), not the population standard deviation.
age_series.std()

18.325918413937288

In [21]:
# Get the Occupation column as a Series and print it.
occupation_series = scientists['Occupation']
print(occupation_series)
# Count how often each distinct occupation appears (shown as the cell's result).
occupation_series.value_counts()

0               Chemist
1          Statistician
2                 Nurse
3               Chemist
4             Biologist
5             Physician
6    Computer Scientist
7         Mathematician
Name: Occupation, dtype: object


Chemist               2
Physician             1
Biologist             1
Nurse                 1
Computer Scientist    1
Mathematician         1
Statistician          1
Name: Occupation, dtype: int64

In [22]:
# Count the non-null entries in the Born column; one of the 8 rows has no
# Born value, which is why the result is 7.
scientists['Born'].count()

7

In [23]:
# age_series holds numeric data, so describe() reports count, mean, std,
# min, the 25%/50%/75% quantiles and max.
age_series.describe()

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64

In [24]:
# occupation_series holds non-numeric data, so describe() reports count,
# the number of unique values, the most frequent value (top) and its frequency.
occupation_series.describe()

count           8
unique          7
top       Chemist
freq            2
Name: Occupation, dtype: object

## bool 索引

In [25]:
# Show age_series for reference before filtering it with boolean masks.
age_series

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [26]:
# A hand-written boolean mask with one flag per element of age_series (8 entries).
bool_values = [False, True, True, True, False, False, False, True]
# Indexing with the mask keeps only the elements whose flag is True.
age_series[bool_values]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [27]:
# Application: filter age_series down to the ages greater than the mean.
# The comparison builds a boolean mask, which is then used as the index.
age_series[age_series > age_series.mean()]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [28]:
# The comparison on its own: an element-wise test that yields a boolean Series,
# True where the age is greater than the mean.
age_series > age_series.mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

## Series 运算

##### Series 和 数值型数据运算

In [29]:
# Show age_series for reference before the arithmetic examples.
age_series

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [30]:
# Adding a scalar applies the addition to every element of the Series.
age_series + 100

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64

In [31]:
# Multiplying by a scalar applies the multiplication to every element of the Series.
age_series * 2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

##### Series 和 另一 Series 运算

In [32]:
# Addition: elements that share the same index label are added together.
age_series + age_series

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [33]:
# Multiplication: elements that share the same index label are multiplied together.
age_series * age_series

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64

In [34]:
# A short two-element Series, used to show arithmetic between Series of
# different lengths.
new_series = pd.Series(data=[1, 100])

In [35]:
# Display the two-element Series.
new_series

0      1
1    100
dtype: int64

In [36]:
# Add two Series of different lengths. Values are matched by index label:
# labels 0 and 1 exist in both and are summed, while labels 2-7 are missing
# from new_series, so those results are NaN and the dtype becomes float64.
age_series + new_series

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64

# DataFrame 详解

## 创建 DataFrame

##### 利用字典创建 DataFrame

##### 利用嵌套列表创建 DataFrame

## DataFrame 常用操作

##### 常用属性和方法

##### 常用统计方法

## bool 索引

##### 应用：获取 scientists 中 Age 大于平均值的科学家信息

## DataFrame 运算

##### DataFrame 和 数值型数据运算

##### DataFrame 和 另一 DataFrame 运算

## 行标签和列标签操作

### 加载数据后，指定某列数据作为 DataFrame 行标签

### 加载数据时，指定某列数据作为 DataFrame 行标签

### 加载数据后，修改 DataFrame 行标签和列标签

### 加载数据后，重新索引 DataFrame 数据