In [161]:
##### SCIKIT-LEARN or SKLEARN #####
#
#  - Python Machine Learning Module
#  - Simple and efficient tools for data mining and data analysis
#  - Accessible to everybody, and reusable in various contexts
#  - Built on NumPy, SciPy, and matplotlib
#  - Open source, commercially usable - BSD license
#
# https://scikit-learn.org/stable/index.html
# https://scikit-learn.org/stable/user_guide.html 
# https://scikit-learn.org/stable/modules/classes.html

In [162]:
##### PRE-PROCESSING DATA #####
#
# https://scikit-learn.org/stable/modules/preprocessing.html
#
# Encoding Categorical Variables:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
#
# Transforming Prediction Targets:
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
#
# Standardization, Scaling, Normalization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
#
# Discretization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
# 
# Missing Value Imputation
# https://scikit-learn.org/stable/modules/impute.html
# 
# Polynomial Features
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
#
# Custom Transformers
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the sample dataset (a CSV file expected in the working directory) into a DataFrame.
df = pd.read_csv('sample.csv')
# Preview the first rows to see which columns are available for encoding.
df.head()
# Optional check of the distinct Pclass levels (left disabled so the cell shows the preview):
#df['Pclass'].unique()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,no,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,no,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
##### ENCODING CATEGORICAL VARIABLES #####
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [6]:
##### one-hot encoding of categorical variables: 
##### when you want to convert categorical levels into dummy variables

In [7]:
##### method 1: use pandas get_dummies to convert a categorical variable into dummy/indicator variables
# let's say we wanted to one-hot encode the categorical variable Pclass
# use "drop_first=True" to get k-1 dummies out of k categorical levels by removing the first level and avoid multicollinearity
# (additionally pass dummy_na=True if missing values should get their own indicator column)
# pandas >= 2.0 returns boolean dummies by default; casting to int keeps the columns as 0/1 on every
# pandas version, in line with the dtype=int used for the sklearn OneHotEncoder in method 2
pclass_dummies = pd.get_dummies(df['Pclass'], prefix='Pclass', drop_first=True).astype(int)
# append the dummy columns to the original frame, then remove the raw column they replace
dfcat = pd.concat([df, pclass_dummies], axis=1)
dfcat = dfcat.drop('Pclass', axis=1)
dfcat.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_2,Pclass_3
0,1,no,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1
1,2,yes,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,3,yes,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1
3,4,yes,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,0
4,5,no,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,1


In [10]:
##### method 2: use sklearn OneHotEncoder to encode categorical features as a one-hot numeric array
# let's say we wanted to one-hot encode the categorical variable Pclass
# Note: with the sklearn OHE, you might need to take care of NaNs in your categorical data first, since the encoder might throw an error otherwise
from sklearn.preprocessing import OneHotEncoder
# Leave the encoder on its default sparse output and densify it with .toarray(): the `sparse=False`
# argument was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so not passing it
# keeps this cell working on both old and new releases.
ohe = OneHotEncoder(dtype=int)
pclass_onehot = ohe.fit_transform(df[['Pclass']]).toarray()
# Build the "x<feature position>_<level>" column names from the fitted categories. This is the naming
# get_feature_names() used to return; that method was removed in scikit-learn 1.2.
pclass_columns = ['x{}_{}'.format(i, level) for i, levels in enumerate(ohe.categories_) for level in levels]
# Reuse df's index so that a later pd.concat([df, dfcat], axis=1) lines the rows up correctly.
dfcat = pd.DataFrame(pclass_onehot, columns=pclass_columns, index=df.index)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
# Preview the dummy columns built by the encoder (one column per category level at this point).
dfcat.head()

Unnamed: 0,x0_1.0,x0_2.0,x0_3.0
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [12]:
# keep k-1 dummies out of k categorical levels: slice away the first level's column to avoid multicollinearity
remaining_positions = slice(1, None)  # every column position after the first one
dfcat = dfcat.iloc[:, remaining_positions]
dfcat.head()

Unnamed: 0,x0_2.0,x0_3.0
0,0,1
1,0,0
2,0,1
3,0,0
4,0,1


In [13]:
# put the dummy columns next to the original data and leave out the raw Pclass column they replace
dfcat = pd.concat([df, dfcat], axis=1).drop('Pclass', axis=1)
dfcat.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,x0_2.0,x0_3.0
0,1,no,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1
1,2,yes,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,3,yes,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1
3,4,yes,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,0
4,5,no,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,1


In [14]:
##### one-hot encoding of categorical variables: 
##### when you want to get k dummies out of k categorical levels.
##### this is typically fine when you are only concerned with prediction and not with the interpretation and estimation of coefficients

In [15]:
# one-hot encode several categorical columns at once, keeping all k dummies for each of them
categoricalvars = ['Pclass', 'Sex', 'Embarked']
from sklearn.preprocessing import OneHotEncoder
# Leave the encoder on its default sparse output and densify it with .toarray(): the `sparse=False`
# argument was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so not passing it
# keeps this cell working on both old and new releases.
ohe = OneHotEncoder(dtype=int)
onehot = ohe.fit_transform(df[categoricalvars]).toarray()
# Build the "x<feature position>_<level>" column names from the fitted categories (x0 = Pclass,
# x1 = Sex, x2 = Embarked). This is the naming get_feature_names() used to return; that method
# was removed in scikit-learn 1.2.
onehot_columns = ['x{}_{}'.format(i, level) for i, levels in enumerate(ohe.categories_) for level in levels]
# Reuse df's index so the concat below lines the encoded rows up with the original rows.
dfcat = pd.DataFrame(onehot, columns=onehot_columns, index=df.index)
dfcat = pd.concat([df, dfcat], axis=1)
# the raw categorical columns are now redundant with their dummy columns
dfcat = dfcat.drop(categoricalvars, axis=1)
dfcat.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,x0_1,x0_2,x0_3,x1_female,x1_male,x2_C,x2_Q,x2_S
0,1,no,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,0,1,0,1,0,0,1
1,2,yes,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,0,1,0,1,0,0
2,3,yes,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,1,0,0,0,1
3,4,yes,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,0,0,1,0,0,0,1
4,5,no,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,0,1,0,1,0,0,1


In [174]:
##### TRANSFORMING PREDICTION TARGET
# These are transformers that are not intended to be used on features, only on supervised learning targets.
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html

In [17]:
# Load the iris dataset (a CSV file expected in the working directory).
# NOTE: this rebinds df, replacing the frame that was loaded from sample.csv earlier in the notebook.
df = pd.read_csv('iris.data.csv')
# Preview a random 5% of the rows; the fixed random_state makes the preview show the same rows on every run.
df.sample(frac=0.05, random_state=0)
# Optional check of the distinct target labels (left disabled so the cell shows the preview):
#df['iris species'].unique()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),iris species
73,6.1,2.8,4.7,1.2,Iris-versicolor
14,5.8,4.0,1.2,0.2,Iris-setosa
10,5.4,3.7,1.5,0.2,Iris-setosa
67,5.8,2.7,4.1,1.0,Iris-versicolor
6,4.6,3.4,1.4,0.3,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
75,6.6,3.0,4.4,1.4,Iris-versicolor
135,7.7,3.0,6.1,2.3,Iris-virginica


In [18]:
# Label Encoding (similar to OrdinalEncoder for Categorical Features)
# use LabelEncoder to encode labels with values between 0 and n_classes-1
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# index=df.index keeps the encoded labels aligned with df's rows when the two frames are concatenated
# below (pd.concat with axis=1 matches rows by index label, not by position)
dfle = pd.DataFrame(le.fit_transform(df['iris species']), columns=['iris species LE'], index=df.index)
dfle = pd.concat([df, dfle], axis=1)
# the original string labels are replaced by their integer codes
dfle = dfle.drop('iris species', axis=1)
# preview a random 5% of the rows; the fixed random_state makes the preview reproducible
dfle.sample(frac=0.05, random_state=0)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),iris species LE
23,5.1,3.3,1.7,0.5,0
30,4.8,3.1,1.6,0.2,0
53,5.5,2.3,4.0,1.3,1
90,5.5,2.6,4.4,1.2,1
84,5.4,3.0,4.5,1.5,1
24,4.8,3.4,1.9,0.2,0
1,4.9,3.0,1.4,0.2,0
102,7.1,3.0,5.9,2.1,2


In [19]:
# Label Binarization (similar to OneHotEncoder for Categorical Features)
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
# one indicator column per class, named "<class> LB"; index=df.index keeps the indicator rows aligned
# with df's rows when the two frames are concatenated below (pd.concat with axis=1 matches by index label)
dflb = pd.DataFrame(lb.fit_transform(df['iris species']), columns=[x+' LB' for x in lb.classes_], index=df.index)
dflb = pd.concat([df, dflb], axis=1)
# the original string labels are replaced by their indicator columns
dflb = dflb.drop('iris species', axis=1)
# preview a random 5% of the rows; the fixed random_state makes the preview reproducible
dflb.sample(frac=0.05, random_state=0)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Iris-setosa LB,Iris-versicolor LB,Iris-virginica LB
3,4.6,3.1,1.5,0.2,1,0,0
149,5.9,3.0,5.1,1.8,0,0,1
143,6.8,3.2,5.9,2.3,0,0,1
110,6.5,3.2,5.1,2.0,0,0,1
103,6.3,2.9,5.6,1.8,0,0,1
64,5.6,2.9,3.6,1.3,0,1,0
26,5.0,3.4,1.6,0.4,1,0,0
62,6.0,2.2,4.0,1.0,0,1,0


In [20]:
# Multilabel binarization: turn a list of label tuples (or sets) into a 0/1 indicator table, one column per label
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
from sklearn.preprocessing import MultiLabelBinarizer
# each entry holds the labels attached to one sample
labels = [('sci-fi', 'thriller'), ('comedy',)]
mlb = MultiLabelBinarizer()
indicator_matrix = mlb.fit_transform(labels)
# the fitted classes_ attribute supplies the column names for the indicator matrix
pd.DataFrame(data=indicator_matrix, columns=mlb.classes_)

Unnamed: 0,comedy,sci-fi,thriller
0,0,1,1
1,1,0,0
