'''''
{
"title": "One Hot Encoding",
"keywords": "Ordinal Encoder, OHE, One-Hot-Encoder",
"categories": "",
"description": "hier werden die Möglihckeiten gelitste wie man ein Train Test split vollziehen kann",
"level": "50",
"pageID": "16112020-OneHotEncodingOrdinalEncoding"
}
'''''

# NaN-Werte
Die meisten Algorithmen können nicht mit NaN-Werten umgehen. Daher gibt es hierbei unterschiedliche Anwendungsoptionen: Die betroffenen Zeilen oder Spalten können entweder weggelassen werden, oder es wird mit Hilfe eines Imputers ein Wert in das leere Feld gesetzt.

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
# Load data
HOUSING_PATH = os.path.join("datasets", "housing")


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from the given directory.

    Parameters
    ----------
    housing_path : str
        Directory containing ``housing.csv``; defaults to ``HOUSING_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
# Read the dataset from the default directory and display its (rows, columns) shape.
housing = load_housing_data(housing_path=HOUSING_PATH)
housing.shape

(20640, 10)

In [4]:
# Summary of all columns with their non-null counts and dtypes
# (housing.dtypes would show the dtypes only).
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


# One Hot Encoding
Beim OHE werden kategorische Variablen (dtype object) in numerische Werte übertragen. Dazu wird zu jeder kategorischen Variable eine Liste der möglichen Werte angelegt und aus jedem möglichen Wert eine eigene Spalte erzeugt.

In [5]:
# Double brackets keep the categorical column as a one-column DataFrame
# (the shape printed below is 2-D) instead of a Series.
housing_cat = housing[["ocean_proximity"]]
print(housing_cat.head(10), housing_cat.shape, sep="\n")
# All distinct categories that occur in the column
print(housing_cat["ocean_proximity"].unique())

  ocean_proximity
0        NEAR BAY
1        NEAR BAY
2        NEAR BAY
3        NEAR BAY
4        NEAR BAY
5        NEAR BAY
6        NEAR BAY
7        NEAR BAY
8        NEAR BAY
9        NEAR BAY
(20640, 1)
['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']


# OrdinalEncoder mit SK-Learn
Beim Ordinal-Encoder werden lediglich die einzelnen Kategorien mit Hilfe einer Map auf Zahlen übertragen. Vor und nach dem Ordinal-Encoding besteht lediglich eine zugehörige Spalte.

In [6]:
try:
    from sklearn.preprocessing import OrdinalEncoder
except ImportError:
    from future_encoders import OrdinalEncoder # Scikit-Learn < 0.20

In [7]:
# Learn the category -> number mapping first, then apply it to the same column.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(housing_cat)
housing_cat_encoded = ordinal_encoder.transform(housing_cat)

# Show the first ten encoded rows; the cell's value is the shape of the result.
print(housing_cat_encoded[0:10])
housing_cat_encoded.shape

[[3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]]


(20640, 1)

In [8]:
# Show the category order the ordinal encoder currently uses
# (a category's position in this array is its encoded value, e.g. 'NEAR BAY' -> 3).
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

# One Hot Encoding
Im Gegensatz zum Ordinal-Encoder werden beim OHE aus einer Spalte mit M Unique Values (= Ausprägungen) M Spalten. Hier zunächst die Basisfunktionen, die im Kontext von OHE von Bedeutung sind.

In [9]:
try:
    from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder # Scikit-Learn < 0.20
# Groundwork for understanding: one-hot encode the single categorical column.
# The keyword for dense output was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the new keyword first
# and fall back to the old one, so the cell runs on old and new versions alike.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
print(housing_cat_1hot)


[[0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 ...
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]]


In [10]:
# Compare the input column with the one-hot result:
# number of distinct categories, shapes, container types, learned categories.
n_unique = len(housing_cat["ocean_proximity"].unique())
print(n_unique)
print(housing_cat.shape, housing_cat_1hot.shape, sep="\n")
print(type(housing_cat), type(housing_cat_1hot), sep="\n")
print(cat_encoder.categories_)

5
(20640, 1)
(20640, 5)
<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)]


In [11]:
# Categories learned by the one-hot encoder, one array per encoded input column.
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [12]:
# Shape of the full frame vs. shape of the one-hot encoded array.
print(housing.shape, housing_cat_1hot.shape, sep="\n")


(20640, 10)
(20640, 5)


In [13]:
# Inverse transform: map each of the five one-hot vectors back to its category.
# The rows of the 5x5 identity matrix are exactly the five one-hot vectors.
for one_hot_row in np.eye(5, dtype=int).tolist():
    print(cat_encoder.inverse_transform([one_hot_row]))

[['<1H OCEAN']]
[['INLAND']]
[['ISLAND']]
[['NEAR BAY']]
[['NEAR OCEAN']]


In [14]:
print(housing_cat.columns)
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# its replacement is `get_feature_names_out`. Prefer the new method when it
# exists and fall back to the old one for older installations.
if hasattr(cat_encoder, "get_feature_names_out"):
    feature_names = cat_encoder.get_feature_names_out(['ocean_proximity'])
else:
    feature_names = cat_encoder.get_feature_names(['ocean_proximity'])
feature_names

Index(['ocean_proximity'], dtype='object')


array(['ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'], dtype=object)

# OHE auf das Dataframe angewendet

In [15]:
# Split the categorical column off into `housing_cat` and keep the remaining
# columns in `housing`. `housing` is overwritten here, so without the guard a
# second run of this cell would fail with a KeyError on the missing column.
print(housing.shape)
if "ocean_proximity" in housing.columns:
    housing_cat = housing[["ocean_proximity"]]
    housing = housing.drop(columns="ocean_proximity")
print(housing.shape)

(20640, 10)
(20640, 9)


In [29]:
# One-hot encode the categorical column and append the result to the frame.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4;
# try the new keyword first and fall back to the old one for older versions.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)
housing_ocean_proximity_cat_1hot = cat_encoder.fit_transform(housing_cat)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever the installed version provides.
if hasattr(cat_encoder, "get_feature_names_out"):
    titles = cat_encoder.get_feature_names_out(['ocean_proximity'])
else:
    titles = cat_encoder.get_feature_names(['ocean_proximity'])

# Reuse the source index so that concat(axis=1) lines the rows up even when
# the frame does not carry a default RangeIndex.
partOHEdf = pd.DataFrame(
    housing_ocean_proximity_cat_1hot, columns=titles, index=housing_cat.index
)
housingDF = pd.concat([housing, partOHEdf], axis=1)

print(housingDF.shape)
# Optional debugging output (kept as comments so it does not become cell output):
# print(housing.shape)
# print(partOHEdf.shape)
# print(partOHEdf)
# print(type(housing_ocean_proximity_cat_1hot))
# print(housing_ocean_proximity_cat_1hot.shape)

(20640, 14)


'\nprint(housing.shape)\nprint(partOHEdf.shape)\nprint(partOHEdf)\nprint(type(housing_ocean_proximity_cat_1hot))\nprint(housing_ocean_proximity_cat_1hot.shape)\n'