<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-libaries" data-toc-modified-id="Import-libaries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import libaries</a></span></li><li><span><a href="#create-csv-file" data-toc-modified-id="create-csv-file-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>create csv file</a></span></li><li><span><a href="#Read-csv-file-into-pandas-dataframe" data-toc-modified-id="Read-csv-file-into-pandas-dataframe-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Read csv file into pandas dataframe</a></span></li><li><span><a href="#Identifying-missing-values" data-toc-modified-id="Identifying-missing-values-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Identifying missing values</a></span></li><li><span><a href="#Eliminating-missing-values" data-toc-modified-id="Eliminating-missing-values-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Eliminating missing values</a></span></li><li><span><a href="#Imputing-missing-values" data-toc-modified-id="Imputing-missing-values-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Imputing missing values</a></span></li><li><span><a href="#Nominal-and-ordinal-features" data-toc-modified-id="Nominal-and-ordinal-features-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Nominal and ordinal features</a></span></li><li><span><a href="#Mapping-ordinal-features" data-toc-modified-id="Mapping-ordinal-features-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Mapping ordinal features</a></span></li><li><span><a href="#Encoding-class-labels" data-toc-modified-id="Encoding-class-labels-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Encoding class labels</a></span></li><li><span><a href="#Labelencoder" data-toc-modified-id="Labelencoder-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Labelencoder</a></span></li><li><span><a href="#One-hot-encoder" data-toc-modified-id="One-hot-encoder-11"><span class="toc-item-num">11&nbsp;&nbsp;</span>One-hot encoder</a></span></li></ul></div>

# Import libaries

In [1]:
import pandas as pd
from io import StringIO

# create csv file

In [2]:
csv_data = \
'''A,B,C,D
 1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# Read csv file into pandas dataframe

In [3]:
#to convert the string to unicode:
 # csv_data = unicode(csv_data)
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


# Identifying missing values

In [4]:
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

# Eliminating missing values

In [5]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [6]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [7]:
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [8]:
# Drop rows that have less than 4 real values:
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [9]:
#only drop rows where NaN appear in specific columns (here: 'C')
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


# Imputing missing values

In [16]:
import numpy as np
from sklearn.impute import SimpleImputer
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

# Nominal and ordinal features

In [1]:
import pandas as pd
df = pd.DataFrame([
['green', 'M', 10.1, 'class1'],
['red', 'L', 13.5, 'class2'],
['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


# Mapping ordinal features

In [2]:
size_mapping = {
'XL': 3,
'L': 2,'M': 1}
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [3]:
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

# Encoding class labels

In [4]:
import numpy as np
class_mapping = {label:idx for idx,label in
enumerate(np.unique(df['classlabel']))}
class_mapping

{'class1': 0, 'class2': 1}

In [5]:
df['classlabel'] = df['classlabel'].map(class_mapping) 
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [6]:
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


# Labelencoder

In [7]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y

array([0, 1, 0])

In [8]:
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

In [9]:
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

# One-hot encoder

In [10]:
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
print('--- OneHotEncoder ---')
ct = ColumnTransformer([('my_ohe', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X) #.toarray()
print(X)

--- OneHotEncoder ---
[[1.0 0.0 1.0 0.0 1 10.1]
 [1.0 0.0 0.0 1.0 2 13.5]
 [0.0 1.0 0.0 0.0 3 15.3]]


In [15]:
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


In [16]:
pd.get_dummies(df[['price', 'color', 'size']],
drop_first=True)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0
