In [2]:
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
import numpy as np


In [3]:
data = [
    [-1,2],
    [-0.5,6],
    [0,10],
    [1,18]
]
pd.DataFrame(data)

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [4]:
# 归一化
scaler = MinMaxScaler()
scaler = scaler.fit(data)
result = scaler.transform(data)
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [5]:
# 归一化
scaler = MinMaxScaler()
result = scaler.fit_transform(data)
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [6]:
# 逆转 归一化结果
result_ = scaler.inverse_transform(result)
result_

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [7]:
# 归一化到指定区间
scaler = MinMaxScaler(feature_range=[5,10])
result = scaler.fit_transform(data)
result

array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

In [8]:
# 当数据集非常大的时候 fit会报错 使用partial_fit
import numpy as np
X = np.array([
    [-1,    2],
    [-0.5,  6],
    [0,     10],
    [1,     18]
])
X.min(axis=0)

array([-1.,  2.])

In [9]:
X.min(axis=1)

array([-1. , -0.5,  0. ,  1. ])

In [10]:
X_nor = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_nor

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [11]:
from sklearn.preprocessing import StandardScaler

In [16]:
data = [
    [-1,  2],
    [-0.5,6],
    [0,   10],
    [1,   18]
]
scaler = StandardScaler()
scaler.fit(data)
scaler.mean_
scaler.var_
s_std = scaler.transform(data)

In [24]:
s_std

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [37]:
import pandas as pd
from sklearn.impute import SimpleImputer
data = pd.read_csv('data/day08_Narrativedata.csv',index_col=0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [55]:
Age = data.loc[:,'Age'].values.reshape(-1,1)
Age[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [nan],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [nan],
       [31.],
       [nan]])

In [57]:
imp_mean = SimpleImputer(strategy='mean')   # 均值
imp_median = SimpleImputer(strategy='median')
imp_0 = SimpleImputer(strategy='constant',fill_value=0)

imp_mean = imp_mean.fit_transform(Age)
imp_median =  imp_median.fit_transform(Age)
imp_0 = imp_0.fit_transform(Age)

In [71]:
Embarked = data['Embarked'].values.reshape(-1,1)
imp_mode = SimpleImputer(strategy='most_frequent')
imp_mode = imp_mode.fit_transform(Embarked)
data['Embarked'] = imp_mode
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  891 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [73]:
# dataframe 填补缺失值
import pandas as pd
from sklearn.impute import SimpleImputer
data = pd.read_csv('data/day08_Narrativedata.csv',index_col=0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [78]:
data['Age'] = data['Age'].fillna(data['Age'].median())
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [80]:
data.dropna(axis=0,inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       889 non-null    float64
 1   Sex       889 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  889 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.7+ KB


In [83]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
data = pd.read_csv('data/day08_Narrativedata.csv',index_col=0)
data

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No
...,...,...,...,...
886,27.0,male,S,No
887,19.0,female,S,Yes
888,,female,S,No
889,26.0,male,C,Unknown


In [92]:
label = data['Survived']
le =LabelEncoder()
label = le.fit_transform(label)  # 编码
ll = le.inverse_transform(label)  # 解码

In [105]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
data = pd.read_csv('data/day08_Narrativedata.csv',index_col=0)
data

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No
...,...,...,...,...
886,27.0,male,S,No
887,19.0,female,S,Yes
888,,female,S,No
889,26.0,male,C,Unknown


In [107]:
oe = OrdinalEncoder()
oe.fit(data.iloc[:,1:-1])
oe.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'Q', 'S', nan], dtype=object)]

In [110]:
oe = OrdinalEncoder()
data.iloc[:,1:-1] = oe.fit_transform(data.iloc[:,1:-1])
data

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,No
1,38.0,0.0,0.0,Yes
2,26.0,0.0,2.0,Yes
3,35.0,0.0,2.0,Yes
4,35.0,1.0,2.0,No
...,...,...,...,...
886,27.0,1.0,2.0,No
887,19.0,0.0,2.0,Yes
888,,0.0,2.0,No
889,26.0,1.0,0.0,Unknown


In [116]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
data = pd.read_csv('data/day08_Narrativedata.csv',index_col=0)

X = data.iloc[:,1:-1]
X

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [127]:
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
enc.get_feature_names()



array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S', 'x1_nan'],
      dtype=object)

In [128]:
result = OneHotEncoder(categories='auto').fit_transform(X).toarray()
result.shape

(891, 6)

In [135]:
newdata = pd.concat([data,pd.DataFrame(result)],axis=1)
newdata

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4,5
0,22.0,male,S,No,0.0,1.0,0.0,0.0,1.0,0.0
1,38.0,female,C,Yes,1.0,0.0,1.0,0.0,0.0,0.0
2,26.0,female,S,Yes,1.0,0.0,0.0,0.0,1.0,0.0
3,35.0,female,S,Yes,1.0,0.0,0.0,0.0,1.0,0.0
4,35.0,male,S,No,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
886,27.0,male,S,No,0.0,1.0,0.0,0.0,1.0,0.0
887,19.0,female,S,Yes,1.0,0.0,0.0,0.0,1.0,0.0
888,,female,S,No,1.0,0.0,0.0,0.0,1.0,0.0
889,26.0,male,C,Unknown,0.0,1.0,1.0,0.0,0.0,0.0


In [136]:
newdata.drop(['Sex','Embarked'],axis=1,inplace=True)
newdata

Unnamed: 0,Age,Survived,0,1,2,3,4,5
0,22.0,No,0.0,1.0,0.0,0.0,1.0,0.0
1,38.0,Yes,1.0,0.0,1.0,0.0,0.0,0.0
2,26.0,Yes,1.0,0.0,0.0,0.0,1.0,0.0
3,35.0,Yes,1.0,0.0,0.0,0.0,1.0,0.0
4,35.0,No,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
886,27.0,No,0.0,1.0,0.0,0.0,1.0,0.0
887,19.0,Yes,1.0,0.0,0.0,0.0,1.0,0.0
888,,No,1.0,0.0,0.0,0.0,1.0,0.0
889,26.0,Unknown,0.0,1.0,1.0,0.0,0.0,0.0


In [140]:
newdata.columns = ['Age','Survived','Sex_female','Sex_male','Embarked_C','Embarked_Q','Embarked_S','Embarked_nan']
newdata

Unnamed: 0,Age,Survived,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,22.0,No,0.0,1.0,0.0,0.0,1.0,0.0
1,38.0,Yes,1.0,0.0,1.0,0.0,0.0,0.0
2,26.0,Yes,1.0,0.0,0.0,0.0,1.0,0.0
3,35.0,Yes,1.0,0.0,0.0,0.0,1.0,0.0
4,35.0,No,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
886,27.0,No,0.0,1.0,0.0,0.0,1.0,0.0
887,19.0,Yes,1.0,0.0,0.0,0.0,1.0,0.0
888,,No,1.0,0.0,0.0,0.0,1.0,0.0
889,26.0,Unknown,0.0,1.0,1.0,0.0,0.0,0.0


In [159]:
# 连续性变量 二值化 与 分段
from sklearn.preprocessing import Binarizer
import pandas as pd
data = pd.read_csv('data/day08_Narrativedata.csv',index_col=0)

data['Age'] = data['Age'].fillna(data['Age'].mean())
X = data.iloc[:,0].values.reshape(-1,1)
X

array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [29.69911765],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ],
       [ 4.        ],
       [58.        ],
       [20.        ],
       [39.        ],
       [14.        ],
       [55.        ],
       [ 2.        ],
       [29.69911765],
       [31.        ],
       [29.69911765],
       [35.        ],
       [34.        ],
       [15.        ],
       [28.        ],
       [ 8.        ],
       [38.        ],
       [29.69911765],
       [19.        ],
       [29.69911765],
       [29.69911765],
       [40.        ],
       [29.69911765],
       [29.69911765],
       [66.        ],
       [28.        ],
       [42.        ],
       [29.69911765],
       [21.        ],
       [18.        ],
       [14.        ],
       [40.        ],
       [27.        ],
       [29.69911765],
       [ 3.        ],
       [19.        ],
       [29

In [160]:
tranformer = Binarizer(threshold=30).fit_transform(X)
print(tranformer)
data['Age'] = tranformer
data.head(10)

[[0.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.

Unnamed: 0,Age,Sex,Embarked,Survived
0,0.0,male,S,No
1,1.0,female,C,Yes
2,0.0,female,S,Yes
3,1.0,female,S,Yes
4,1.0,male,S,No
5,0.0,male,Q,No
6,1.0,male,S,No
7,0.0,male,S,No
8,0.0,female,S,Yes
9,0.0,female,C,Yes


In [198]:
from sklearn.preprocessing import KBinsDiscretizer
import pandas as pd
data = pd.read_csv('data/day08_Narrativedata.csv',index_col=0)
data['Age'] = data['Age'].fillna(data['Age'].mean())
kbd = KBinsDiscretizer(n_bins=5,encode='onehot')
result = kbd.fit_transform(data['Age'].values.reshape(-1,1)).toarray()
result

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [199]:
data = pd.concat([data,pd.DataFrame(result)],axis=1)
data

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4
0,22.000000,male,S,No,0.0,1.0,0.0,0.0,0.0
1,38.000000,female,C,Yes,0.0,0.0,0.0,0.0,1.0
2,26.000000,female,S,Yes,0.0,1.0,0.0,0.0,0.0
3,35.000000,female,S,Yes,0.0,0.0,0.0,1.0,0.0
4,35.000000,male,S,No,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
886,27.000000,male,S,No,0.0,1.0,0.0,0.0,0.0
887,19.000000,female,S,Yes,1.0,0.0,0.0,0.0,0.0
888,29.699118,female,S,No,0.0,0.0,0.0,1.0,0.0
889,26.000000,male,C,Unknown,0.0,1.0,0.0,0.0,0.0


In [193]:
from sklearn.preprocessing import KBinsDiscretizer
import pandas as pd
data = pd.read_csv('data/day08_Narrativedata.csv',index_col=0)
data['Age'] = data['Age'].fillna(data['Age'].mean())
kbd = KBinsDiscretizer(n_bins=5,encode='ordinal')
result = kbd.fit_transform(data['Age'].values.reshape(-1,1))