In [43]:
##### SCIKIT-LEARN or SKLEARN #####
#
#  - Python Machine Learning Module
#  - Simple and efficient tools for data mining and data analysis
#  - Accessible to everybody, and reusable in various contexts
#  - Built on NumPy, SciPy, and matplotlib
#  - Open source, commercially usable - BSD license
#
# https://scikit-learn.org/stable/index.html
# https://scikit-learn.org/stable/user_guide.html 
# https://scikit-learn.org/stable/modules/classes.html

In [44]:
##### PRE-PROCESSING DATA #####
#
# https://scikit-learn.org/stable/modules/preprocessing.html
#
# Encoding Categorical Variables:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
#
# Transforming Prediction Targets:
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
#
# Standardization, Scaling, Normalization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
#
# Discretization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
# 
# Missing Value Imputation:
# https://scikit-learn.org/stable/modules/impute.html
#
# Polynomial Features:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
#
# Custom Transformers:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html

In [45]:
import pandas as pd
import numpy as np

In [46]:
# Read the Titanic sample into a DataFrame; the bare `df` on the last line
# makes the notebook render the whole frame.
df = pd.read_csv(filepath_or_buffer='data/kaggleTitanic/sample.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,no,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,no,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,no,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,no,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,no,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,yes,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,yes,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [47]:
##### STANDARDIZATION, SCALING, NORMALIZATION #####
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html

In [48]:
# Standardization (mean removal and variance scaling): every numeric column is
# centred on its mean and divided by its standard deviation.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
numericvars = ['Age', 'Fare']
from sklearn.preprocessing import StandardScaler
ss = StandardScaler(with_mean=True, with_std=True)
# Build the scaled frame on df's own index. pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign rows (and pad them with
# NaN) if df were ever filtered, sorted or otherwise re-indexed before this cell.
# Per the scikit-learn docs, NaNs (e.g. the missing Age) are ignored while
# fitting and passed through unchanged by transform.
dfnumss = pd.DataFrame(
    ss.fit_transform(df[numericvars]),
    columns=['ss_'+x for x in numericvars],
    index=df.index,
)
# Append the scaled columns to the original frame, then drop the raw ones.
dfnumss = pd.concat([df, dfnumss], axis=1)
dfnumss = dfnumss.drop(numericvars, axis=1)
dfnumss.head()

# these will be close to 0
#dfnumss[['ss_Age']].mean()
#dfnumss[['ss_Fare']].mean()
# these will be 1 (up to rounding) when computed with ddof=0, which is what
# StandardScaler uses; pandas' default .std() uses ddof=1 and is only close to 1
#dfnumss[['ss_Age']].std(ddof=0)
#dfnumss[['ss_Fare']].std(ddof=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,ss_Age,ss_Fare
0,1,no,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,,S,-0.433703,-0.88299
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C85,C,0.701811,1.976819
2,3,yes,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,,S,-0.149825,-0.852844
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,C123,S,0.488902,1.164729
4,5,no,3,"Allen, Mr. William Henry",male,0,0,373450,,S,0.488902,-0.847261


In [49]:
# Scaling: MinMaxScaler rescales each feature to a given range
# (the default range, used here, is 0 to 1).
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
numericvars = ['Age', 'Fare']
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
# NOTE: this cell reuses the name dfnumss, so it overwrites the standardized
# frame produced by the StandardScaler cell.
# Build the scaled frame on df's own index. pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign rows (and pad them with
# NaN) if df were ever filtered, sorted or otherwise re-indexed before this cell.
dfnumss = pd.DataFrame(
    mms.fit_transform(df[numericvars]),
    columns=['mms_'+x for x in numericvars],
    index=df.index,
)
# Append the scaled columns to the original frame, then drop the raw ones.
dfnumss = pd.concat([df, dfnumss], axis=1)
dfnumss = dfnumss.drop(numericvars, axis=1)
dfnumss.head()

# these will be 0
#dfnumss[['mms_Age']].min()
#dfnumss[['mms_Fare']].min()
# these will be 1
#dfnumss[['mms_Age']].max()
#dfnumss[['mms_Fare']].max()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,mms_Age,mms_Fare
0,1,no,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,,S,0.384615,0.0
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C85,C,0.692308,1.0
2,3,yes,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,,S,0.461538,0.010541
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,C123,S,0.634615,0.716034
4,5,no,3,"Allen, Mr. William Henry",male,0,0,373450,,S,0.634615,0.012493


In [50]:
# Switch datasets: from this cell on, `df` holds the iris measurements
# (one column per feature name) instead of the Titanic sample.
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [51]:
# (Sample Vector) Normalization: rescale each individual sample (row) to unit norm.
# Useful when a quadratic form such as the dot-product, or any other kernel,
# is used to quantify the similarity of a pair of samples.
#   l1: the absolute values in a row sum to 1
#   l2: the squared values in a row sum to 1
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
from sklearn.preprocessing import Normalizer
norm = Normalizer(norm='l2')
dfnorm = pd.DataFrame(
    data=norm.fit_transform(df),
    columns=['norm_' + feature for feature in df.columns],
)
dfnorm.head()

# Row-wise sanity checks:
# dfnorm.apply(lambda x: abs(x)).sum(axis=1) # all ones when norm='l1'
# dfnorm.apply(lambda x: x*x).sum(axis=1) # all ones when norm='l2'

Unnamed: 0,norm_sepal length (cm),norm_sepal width (cm),norm_petal length (cm),norm_petal width (cm)
0,0.803773,0.551609,0.220644,0.031521
1,0.828133,0.50702,0.236609,0.033801
2,0.805333,0.548312,0.222752,0.034269
3,0.80003,0.539151,0.260879,0.034784
4,0.790965,0.569495,0.22147,0.031639


In [52]:
##### DISCRETIZATION (or quantization or binning) #####
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html

In [53]:
# Reload iris so the discretization section starts from a fresh `df`
# (one column per feature name).
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [54]:
# KBinsDiscretizer: bin continuous data into intervals.
# Each value is replaced by the id of the bin it falls into (encode='ordinal');
# see the documentation for the other `encode` and `strategy` options.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
from sklearn.preprocessing import KBinsDiscretizer
kbd = KBinsDiscretizer(
    n_bins=5,
    encode='ordinal',
    strategy='quantile',
)
dfkbd = pd.DataFrame(
    data=kbd.fit_transform(df),
    columns=['kbd_' + feature for feature in df.columns],
)

In [55]:
# Inspect the fitted bin boundaries: one array per input feature, each holding
# n_bins + 1 edges (6 edges for the 5 bins requested above).
kbd.bin_edges_

array([array([4.3 , 5.  , 5.6 , 6.1 , 6.52, 7.9 ]),
       array([2. , 2.7, 3. , 3.1, 3.4, 4.4]),
       array([1.  , 1.5 , 3.9 , 4.64, 5.32, 6.9 ]),
       array([0.1 , 0.2 , 1.16, 1.5 , 1.9 , 2.5 ])], dtype=object)

In [56]:
# Preview the first five rows of ordinal bin ids, one column per original feature.
dfkbd.head(n=5)

Unnamed: 0,kbd_sepal length (cm),kbd_sepal width (cm),kbd_petal length (cm),kbd_petal width (cm)
0,1.0,4.0,0.0,1.0
1,0.0,2.0,0.0,1.0
2,0.0,3.0,0.0,1.0
3,0.0,3.0,1.0,1.0
4,1.0,4.0,0.0,1.0


In [57]:
# Reload iris so the Binarizer example starts from a fresh `df`
# (one column per feature name).
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [58]:
# Binarizer: turn feature values into 0 or 1 according to a threshold.
# Values strictly greater than the threshold become 1, everything else 0
# (so a sepal length of exactly 4.9 maps to 0 here).
# It behaves like KBinsDiscretizer with k = 2 and the bin edge placed at the threshold.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
from sklearn.preprocessing import Binarizer
bnr = Binarizer(threshold=4.9)
dfbnr = pd.DataFrame(
    data=bnr.fit_transform(df[['sepal length (cm)']]),
    columns=['bnr_sepal length (cm)'],
)
dfbnr.head()

Unnamed: 0,bnr_sepal length (cm)
0,1.0
1,0.0
2,0.0
3,0.0
4,1.0
