# Python与数据科学

## 数据科学步骤

    - 数据收集：巧妇难为无米之炊；
    - 数据存储：如何高效地管理数据；
    - 数据清洗：garbage in, garbage out. 不可或缺，极其重要的步骤；
    - 数据建模：机器学习建模
    - 推论、预测及可视化

## Pandas

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("Data/Population.csv")

In [4]:
df

Unnamed: 0,指标,2015年,2014年,2013年,2012年,2011年,2010年,2009年,2008年,2007年,2006年
0,年末总人口(万人),137462,136782,136072,135404,134735,134091,133450,132802,132129,131448
1,男性人口(万人),70414,70079,69728,69395,69068,68748,68647,68357,68048,67728
2,女性人口(万人),67048,66703,66344,66009,65667,65343,64803,64445,64081,63720
3,城镇人口(万人),77116,74916,73111,71182,69079,66978,64512,62403,60633,58288
4,乡村人口(万人),60346,61866,62961,64222,65656,67113,68938,70399,71496,73160


## 数据清洗

### Series

In [13]:
# Build a Series from a plain list; without an explicit index pandas
# labels the values 0, 1, 2.
prices = [8000, 5000, 3000]
phone = pd.Series(data=prices)
phone

0    8000
1    5000
2    3000
dtype: int64

In [15]:
# Give every value an explicit label through the `index` argument.
brands = ["Iphone", "Huawei", "Oppo"]
prices = [8000, 5000, 3000]
phone = pd.Series(prices, index=brands)
phone

Iphone    8000
Huawei    5000
Oppo      3000
dtype: int64

### DataFrame
    类表格数据结构

In [17]:
# Build a DataFrame from a list of rows. No column names are supplied,
# so pandas labels the columns 0, 1, 2.
records = [
    ('frank', 'M', 29),
    ('mary', 'F', 23),
    ('tom', 'M', 35),
    ('ted', 'M', 33),
    ('jean', 'F', 21),
    ('lisa', 'F', 20),
]
df = pd.DataFrame(records)

In [18]:
df

Unnamed: 0,0,1,2
0,frank,M,29
1,mary,F,23
2,tom,M,35
3,ted,M,33
4,jean,F,21
5,lisa,F,20


In [8]:
# 设置列名
df.columns = ["name", "gender", "age"]

In [9]:
df

Unnamed: 0,name,gender,age
0,frank,M,29
1,mary,F,23
2,tom,M,35
3,ted,M,33
4,jean,F,21
5,lisa,F,20


In [19]:
# Build a DataFrame from a list of dicts: every key becomes a column name,
# and a key that is absent from a row shows up as NaN in that row.
people = [
    dict(name='frank', gender='M', age=29),
    dict(name='mary', gender='F', age=23, employee=True),
    dict(name='tom', gender='M', age=35),
    dict(name='ted', gender='M', age=33),
    dict(name='jean', gender='F', age=21),
    dict(name='lisa', gender='F', age=20),
]
df = pd.DataFrame(people)

In [20]:
df
# NaN :not a number

Unnamed: 0,name,gender,age,employee
0,frank,M,29,
1,mary,F,23,True
2,tom,M,35,
3,ted,M,33,
4,jean,F,21,
5,lisa,F,20,


In [21]:
# 查看前3行数据，默认前5行
df.head(3)

Unnamed: 0,name,gender,age,employee
0,frank,M,29,
1,mary,F,23,True
2,tom,M,35,


In [22]:
# 查看末2行数据，默认末5行
df.tail(2)

Unnamed: 0,name,gender,age,employee
4,jean,F,21,
5,lisa,F,20,


In [23]:
# DataFrame数据的基本信息
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      6 non-null      object
 1   gender    6 non-null      object
 2   age       6 non-null      int64 
 3   employee  1 non-null      object
dtypes: int64(1), object(3)
memory usage: 320.0+ bytes


In [24]:
# 展示df的基本统计
df.describe()

Unnamed: 0,age
count,6.0
mean,26.833333
std,6.400521
min,20.0
25%,21.5
50%,26.0
75%,32.0
max,35.0


In [25]:
# Data type (dtype) of each column in df
df.dtypes

name        object
gender      object
age          int64
employee    object
dtype: object

In [28]:
# 重命名索引名
# 索引值的个数要和行的个数相等
df.index = "a b c d e f".split()

In [29]:
df

Unnamed: 0,name,gender,age,employee
a,frank,M,29,
b,mary,F,23,True
c,tom,M,35,
d,ted,M,33,
e,jean,F,21,
f,lisa,F,20,


#### 存取元素与切割

##### 获取列

In [32]:
# df以类似字典的方式来获取某一列的值
df['age']

a    29
b    23
c    35
d    33
e    21
f    20
Name: age, dtype: int64

##### 获取行

In [34]:
# 获取行则可以使用 loc 及 iloc方法
# loc是根据index来索引。
# iloc则根据行号来索引，行号从0开始，逐次加1。
df

Unnamed: 0,name,gender,age,employee
a,frank,M,29,
b,mary,F,23,True
c,tom,M,35,
d,ted,M,33,
e,jean,F,21,
f,lisa,F,20,


In [35]:
df.loc['e']

name        jean
gender         F
age           21
employee     NaN
Name: e, dtype: object

In [36]:
df.iloc[2]

name        tom
gender        M
age          35
employee    NaN
Name: c, dtype: object

##### 利用loc 与 iloc 提取列数据

In [37]:
df.loc[:, 'age']

a    29
b    23
c    35
d    33
e    21
f    20
Name: age, dtype: int64

In [39]:
# 注意iloc是根据 integer索引，只能提供数字
df.iloc[:, 2]

a    29
b    23
c    35
d    33
e    21
f    20
Name: age, dtype: int64

##### 切割df

In [40]:
df[['name', 'age']]

Unnamed: 0,name,age
a,frank,29
b,mary,23
c,tom,35
d,ted,33
e,jean,21
f,lisa,20


In [45]:
df.loc[["a","c","e"],["name","age"]]

Unnamed: 0,name,age
a,frank,29
c,tom,35
e,jean,21


In [43]:
df

Unnamed: 0,name,gender,age,employee
a,frank,M,29,
b,mary,F,23,True
c,tom,M,35,
d,ted,M,33,
e,jean,F,21,
f,lisa,F,20,


In [44]:
df.iloc[::2, [0,2]]

Unnamed: 0,name,age
a,frank,29
c,tom,35
e,jean,21


##### 列比较

In [46]:
df['gender'] == 'M'

a     True
b    False
c     True
d     True
e    False
f    False
Name: gender, dtype: bool

In [47]:
# 取男性数据
df[df['gender'] == 'M']

Unnamed: 0,name,gender,age,employee
a,frank,M,29,
c,tom,M,35,
d,ted,M,33,


#### 使用 & 与 | 条件

In [50]:
# & 取交集
# 大于30岁的男性
df[ (df['gender']=='M') & (df['age']>30) ]

Unnamed: 0,name,gender,age,employee
c,tom,M,35,
d,ted,M,33,


In [52]:
# | 取并集
# 全部女性 及 小于30岁的男性
df[ (df['gender']=='F') | (df['age']<30) ]

Unnamed: 0,name,gender,age,employee
a,frank,M,29,
b,mary,F,23,True
e,jean,F,21,
f,lisa,F,20,


##### 删除与新增列

In [59]:
# 删除行或列
# drop返回删除后的df, 不是就地删除
# axis=0，表示以行索引删除，axis=1表示以列索引删除
df.drop('employee', axis=1)

Unnamed: 0,name,gender,age
a,frank,M,29
b,mary,F,23
c,tom,M,35
d,ted,M,33
e,jean,F,21
f,lisa,F,20


In [64]:
# 就地删除
del df['employee']
df

Unnamed: 0,name,gender,age
a,frank,M,29
b,mary,F,23
c,tom,M,35
d,ted,M,33
e,jean,F,21
f,lisa,F,20


In [65]:
# 新增列
df['employee'] = True
df

Unnamed: 0,name,gender,age,employee
a,frank,M,29,True
b,mary,F,23,True
c,tom,M,35,True
d,ted,M,33,True
e,jean,F,21,True
f,lisa,F,20,True


In [71]:
# Add a row: assigning to a label that does not exist yet in .loc appends it.
# age is passed as an int rather than the string '12' so the new value is
# numeric like every other entry in 'age'; a str would leave the column
# holding mixed object data.
df.loc[1] = dict(name='Tom', age=12, gender='M', employee=False)
df

Unnamed: 0,name,gender,age,employee
a,frank,M,29,True
b,mary,F,23,True
c,tom,M,35,True
d,ted,M,33,True
e,jean,F,21,True
f,lisa,F,20,True
1,Tom,M,12,False


In [73]:
# 删除行 默认axis=0
df = df.drop(1)
df

Unnamed: 0,name,gender,age,employee
a,frank,M,29,True
b,mary,F,23,True
c,tom,M,35,True
d,ted,M,33,True
e,jean,F,21,True
f,lisa,F,20,True


### 缺失值
    - 数据中有特定或一个范围内的值是不完全的
    - 缺失值可能会导致数据分析时参数偏误的推论
    - 缺失值可能来自机械的缺失或人为的缺失
        - 机械缺失：机械故障，导致数据无法被完整保存
        - 人为缺失：受访者拒绝透露部分信息

#### 检查DataFrame是否含有缺失值

In [5]:
import pandas as pd
import numpy as np
# Sample table for the missing-value examples: np.nan marks every
# unknown gender or age.
missing = np.nan
column_names = ["name", "gender", "age"]
rows = [
    ['frank', 'M', missing],
    ['mary', missing, missing],
    ['tom', 'M', 35],
    ['ted', 'M', 33],
    ['jean', missing, 21],
    ['lisa', 'F', 20],
]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [11]:
df.isnull()
# df.isna()

Unnamed: 0,name,gender,age
0,False,False,True
1,False,True,True
2,False,False,False
3,False,False,False
4,False,True,False
5,False,False,False


In [7]:
df.notnull()

Unnamed: 0,name,gender,age
0,True,True,False
1,True,False,False
2,True,True,True
3,True,True,True
4,True,False,True
5,True,True,True


#### 检查序列是否包含缺失值

In [12]:
df['gender'].isnull()

0    False
1     True
2    False
3    False
4     True
5    False
Name: gender, dtype: bool

#### 检查字段是否有缺失值

In [18]:
df.name.isnull().values.any()

False

#### 检查DataFrame是否有缺失值

In [19]:
df.isnull().values.any()

True

#### 计算缺失值数量

In [20]:
df['gender']

0      M
1    NaN
2      M
3      M
4    NaN
5      F
Name: gender, dtype: object

In [21]:
# 计算某一列缺失值的数量
df['gender'].isnull().sum()

2

In [22]:
# 全列统计
df.isnull().sum()

name      0
gender    2
age       2
dtype: int64

In [24]:
# DataFrame 缺失值数量
df.isnull().sum().sum()

4

### 处理缺失值
    - 舍弃缺失值：当缺失值占比很低时
    - 使用平均数、中位数、众数等描述性统计补全缺失值
    - 使用内插法补全缺失值：当字段数据呈线性规律时

In [27]:
df

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


#### 舍弃包含缺失值的行

In [25]:
# 丢弃包含缺失值的行
df.dropna()

Unnamed: 0,name,gender,age
2,tom,M,35.0
3,ted,M,33.0
5,lisa,F,20.0


In [39]:
# Append one row whose fields are all NaN, so the dropna(how='all') example
# that follows has a row to remove.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames and yields the same table.
empty_row = pd.DataFrame(
    [[np.nan, np.nan, np.nan]],
    columns=["name", "gender", "age"],
    index=["6"],
)
df = pd.concat([df, empty_row])
df

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0
6,,,


In [41]:
# 丢弃所有字段都为NaN的行
df = df.dropna(how='all')
df

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [42]:
# Keep only the rows that have at least 2 non-missing values.
# NOTE: thresh is the minimum number of non-NaN values a row needs in order
# to be kept; it is not the number of NaN values that triggers a drop.
df.dropna(thresh=2)

Unnamed: 0,name,gender,age
0,frank,M,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


#### 舍弃包含缺失值的列

In [44]:
df['empoyee'] = np.nan
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,name,gender,age,empoyee
0,frank,M,,
1,mary,,,
2,tom,M,35.0,
3,ted,M,33.0,
4,jean,,21.0,
5,lisa,F,20.0,


In [48]:
# axis=1表示以列为索引
df = df.dropna(axis=1, how='all')
df

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


#### 填补缺失值

In [49]:
df

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


##### 任意值填补缺失值

In [55]:
df.age = df.age.fillna(0)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,name,gender,age
0,frank,M,0.0
1,mary,,0.0
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [58]:
df.age = [age if age!=0 else np.nan for age in df.age]
df

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


##### 使用平均数补全缺失值

In [61]:
df['age'].fillna(df.age.mean())

0    27.25
1    27.25
2    35.00
3    33.00
4    21.00
5    20.00
Name: age, dtype: float64

In [66]:
# Fill each missing age with the mean age of that row's gender group.
# A row whose gender is itself NaN belongs to no group, so its missing age
# stays NaN (see row 1 in the output).
df['age'] = df['age'].fillna(df.groupby('gender')['age'].transform("mean"))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,name,gender,age
0,frank,M,34.0
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [67]:
df.age

0    34.0
1     NaN
2    35.0
3    33.0
4    21.0
5    20.0
Name: age, dtype: float64

##### 向前/后填补缺失值

In [68]:
# Forward fill: copy the last valid value onto the missing entries after it.
# ffill() replaces fillna(method='pad') / fillna(method='ffill'); the `method`
# keyword of fillna is deprecated in recent pandas releases.
df['age'].ffill()

0    34.0
1    34.0
2    35.0
3    33.0
4    21.0
5    20.0
Name: age, dtype: float64

In [69]:
# Backward fill: copy the next valid value onto the missing entries before it.
# bfill() replaces fillna(method='bfill') / fillna(method='backfill'); the
# `method` keyword of fillna is deprecated in recent pandas releases.
df['age'].bfill()

0    34.0
1    35.0
2    35.0
3    33.0
4    21.0
5    20.0
Name: age, dtype: float64

##### 内插法填补缺失值

In [74]:
# Small time series with one fully missing observation (row 2), used to
# demonstrate interpolation.
time_points = [1, 2, np.nan, 4, 5, 6]
readings = [870, 900, np.nan, 950, 1080, 1200]
df1 = pd.DataFrame({'time': time_points, 'val': readings})
df1

Unnamed: 0,time,val
0,1.0,870.0
1,2.0,900.0
2,,
3,4.0,950.0
4,5.0,1080.0
5,6.0,1200.0


In [75]:
df1.interpolate()

Unnamed: 0,time,val
0,1.0,870.0
1,2.0,900.0
2,3.0,925.0
3,4.0,950.0
4,5.0,1080.0
5,6.0,1200.0
