# python map

In [1]:
import pandas as pd

df = pd.DataFrame([['green', 'M', 10.1, 'class1'],
                   ['red', 'L', 13.5, 'class2'],
                   ['blue', 'XL', 15.3, 'class1']])

df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [2]:
size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}

df['size'].map?
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [3]:
df['price'].astype(int)

0    10
1    13
2    15
Name: price, dtype: int32

In [4]:
df[['price']].astype(int)

Unnamed: 0,price
0,10
1,13
2,15


In [7]:
import numpy as np
np.where?
np.where(df.price > 12, 'Over 12', '12 or cheaper')

array(['12 or cheaper', 'Over 12', 'Over 12'], dtype='<U13')

In [8]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [9]:
df['price'] = np.where(df.price > 12, 'Over 12', '12 or cheaper')
df

Unnamed: 0,color,size,price,classlabel
0,green,1,12 or cheaper,class1
1,red,2,Over 12,class2
2,blue,3,Over 12,class1


# sklearn LabelEncoder

In [10]:
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame([['green', 'S', 10.1, 'class1'],['green', 'M', 10.1, 'class1'],
                   ['red', 'L', 13.5, 'class2'],
                   ['blue', 'XL', 15.3, 'class1']])

df.columns = ['color', 'size', 'price', 'classlabel']
LabelEncoder?
lb = LabelEncoder() # Encode target labels with value between 0 and n_classes-1.
df['size'] = lb.fit_transform(df['size'])
df

Unnamed: 0,color,size,price,classlabel
0,green,2,10.1,class1
1,green,1,10.1,class1
2,red,0,13.5,class2
3,blue,3,15.3,class1


# pandas

In [13]:
df = pd.DataFrame([['green', 'M', 10.1, 'class1'],
                   ['red', 'L', 13.5, 'class2'],
                   ['blue', 'XL', 15.3, 'class1']])

df.columns = ['color', 'size', 'price', 'classlabel']

df["size"] = df["size"].astype('category')
df["size"] = df["size"].cat.codes
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,0,13.5,class2
2,blue,2,15.3,class1


# Encoding class labels

In [14]:
np.unique(df['classlabel'])

array(['class1', 'class2'], dtype=object)

In [15]:
import numpy as np

# create a mapping dict
# to convert class labels from strings to integers
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping

{'class1': 0, 'class2': 1}

In [16]:
# to convert class labels from strings to integers
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,0,13.5,1
2,blue,2,15.3,0


In [17]:
{v: k for k, v in class_mapping.items()}

{0: 'class1', 1: 'class2'}

In [18]:
# reverse the class label mapping
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,0,13.5,class2
2,blue,2,15.3,class1


In [19]:
class_mapping, inv_class_mapping

({'class1': 0, 'class2': 1}, {0: 'class1', 1: 'class2'})

# One Hot Encoding

In [22]:
df['color'].values

array(['green', 'red', 'blue'], dtype=object)

In [23]:
X = df[['color', 'size', 'price']].values
X

array([['green', 1, 10.1],
       ['red', 0, 13.5],
       ['blue', 2, 15.3]], dtype=object)

In [24]:
X[:, 0]

array(['green', 'red', 'blue'], dtype=object)

In [25]:
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 0, 13.5],
       [0, 2, 15.3]], dtype=object)

In [26]:
X[:, 0:1]

array([[1],
       [2],
       [0]], dtype=object)

In [27]:
X[:, 0]

array([1, 2, 0], dtype=object)

In [34]:
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder?
ohe = OneHotEncoder() # Encode categorical features as a one-hot numeric array.
# ohe.fit_transform(X[:, 0:1])
X2 = ohe.fit_transform(X[:, 0:1]).toarray()
X2, type(X2)

(array([[0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.]]),
 numpy.ndarray)

In [40]:
ohe2 = OneHotEncoder(sparse=False) # sparse=False, True
ohe2.fit_transform(X[:, 0:1])

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [41]:
np.concatenate((X2, X[:, 1:]), axis=1)

array([[0.0, 1.0, 0.0, 1, 10.1],
       [0.0, 0.0, 1.0, 0, 13.5],
       [1.0, 0.0, 0.0, 2, 15.3]], dtype=object)

In [42]:
df = pd.DataFrame(np.concatenate((X2, X[:, 1:]), axis=1), columns = ['blue', 'green', 'red', 'size', 'price'])
df

Unnamed: 0,blue,green,red,size,price
0,0,1,0,1,10.1
1,0,0,1,0,13.5
2,1,0,0,2,15.3


# Method 2: for One Hot Encoding 

In [43]:
import pandas as pd

df = pd.DataFrame([['green', 'M', 10.1, 'class1'],
                   ['red', 'L', 13.5, 'class2'],
                   ['blue', 'XL', 15.3, 'class1']])

df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [48]:
# import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# create one hot encoder
one_hot_encoder = OneHotEncoder()

# create a copy of the dataset
df_ohe = df.copy()

# transform dataset 
ohelabels = one_hot_encoder.fit_transform(df_ohe).toarray()
df_ohe = pd.DataFrame(ohelabels, columns=one_hot_encoder.get_feature_names())
df_ohe

Unnamed: 0,x0_blue,x0_green,x0_red,x1_L,x1_M,x1_XL,x2_10.1,x2_13.5,x2_15.3,x3_class1,x3_class2
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [47]:
one_hot_encoder.fit_transform(df_ohe).toarray()

array([[0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.]])

# One Hot Encoding by Pandas

In [49]:

df = pd.DataFrame([['green', 'M', 10.1, 'class1'],
                   ['red', 'L', 13.5, 'class2'],
                   ['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']

pd.get_dummies(df, columns=["color"], prefix='is', prefix_sep='_')

Unnamed: 0,size,price,classlabel,is_blue,is_green,is_red
0,M,10.1,class1,0,1,0
1,L,13.5,class2,0,0,1
2,XL,15.3,class1,1,0,0


In [50]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [52]:
pd.DataFrame(np.arange(10))

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [53]:
digit1 = np.arange(10)

df = pd.DataFrame(digit1)
df.columns = ['digit']
df

Unnamed: 0,digit
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [54]:
pd.get_dummies(df, columns=["digit"], prefix='is', prefix_sep='~')

Unnamed: 0,is~0,is~1,is~2,is~3,is~4,is~5,is~6,is~7,is~8,is~9
0,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,0,0,0,1


In [55]:
df = pd.DataFrame([['green', 'M', 10.1, 'class1'],
                   ['red', 'L', 13.5, 'class2'],
                   ['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']
df["pant_color"]=df["color"]

pd.get_dummies(df, columns=["color", "pant_color"], prefix=['is',' pant'], prefix_sep='_')

Unnamed: 0,size,price,classlabel,is_blue,is_green,is_red,pant_blue,pant_green,pant_red
0,M,10.1,class1,0,1,0,0,1,0
1,L,13.5,class2,0,0,1,0,0,1
2,XL,15.3,class1,1,0,0,1,0,0


In [56]:
df

Unnamed: 0,color,size,price,classlabel,pant_color
0,green,M,10.1,class1,green
1,red,L,13.5,class2,red
2,blue,XL,15.3,class1,blue


In [58]:
pd.get_dummies(df, columns=["color", "pant_color"], prefix=['is',' pant'], prefix_sep=['_', '~!'])

Unnamed: 0,size,price,classlabel,is_blue,is_green,is_red,pant~!blue,pant~!green,pant~!red
0,M,10.1,class1,0,1,0,0,1,0
1,L,13.5,class2,0,0,1,0,0,1
2,XL,15.3,class1,1,0,0,1,0,0


# Partitioning a dataset into a separate training and test set

In [59]:
pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [60]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the above line
# of code to load the dataset from a local path:

# df_wine = pd.read_csv('wine.data', header=None)


df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

print('Class labels', np.unique(df_wine['Class label']))
df_wine

Class labels [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [61]:
df_wine.iloc[:, 1:]

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [67]:
df_wine.loc[:, 1:]

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [1] of <class 'int'>

In [68]:
df_wine.iloc[:, 0]

0      1
1      1
2      1
3      1
4      1
      ..
173    3
174    3
175    3
176    3
177    3
Name: Class label, Length: 178, dtype: int64

In [70]:
178 * 0.3

53.4

In [71]:
from sklearn.model_selection import train_test_split

# X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X, y = df_wine.iloc[:, 1:], df_wine.iloc[:, 0]

X_train, X_test, y_train, y_test =\
    train_test_split(X, y, 
                     test_size=0.3, 
                     random_state=0, 
                     stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((124, 13), (54, 13), (124,), (54,))

In [72]:
np.bincount(y_train)

array([ 0, 41, 50, 33], dtype=int64)

In [73]:
np.bincount(y_test)

array([ 0, 18, 21, 15], dtype=int64)

# Bringing features onto the same scale

## 常態化(Normalization): if the data distribution is flat

In [74]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)
X_test_norm

array([[ 0.69005848,  0.22924901,  0.64052288,  0.30645161,  0.55555556,
         0.69655172,  0.51687764,  0.52      ,  0.39873418,  0.40497336,
         0.69148936,  0.60805861,  0.78245364],
       [ 0.22222222,  0.14031621,  0.54248366,  0.40860215,  0.41975309,
         0.3137931 ,  0.29746835,  0.64      ,  0.19303797,  0.10746004,
         1.03191489,  0.35164835,  0.05492154],
       [ 0.87134503,  0.22332016,  0.49019608,  0.17204301,  0.32098765,
         0.52413793,  0.45991561,  0.34      ,  0.49367089,  0.31172291,
         0.57446809,  0.84615385,  0.72182596],
       [ 0.52339181,  0.2055336 ,  0.2875817 , -0.01075269,  0.37037037,
         0.57586207,  0.51054852,  0.26      ,  0.2721519 ,  0.23445826,
         0.60638298,  0.78021978,  0.55064194],
       [ 0.28070175,  0.06521739,  0.26143791,  0.38172043,  0.22222222,
         0.87586207,  0.71940928,  0.22      ,  0.48417722,  0.24511545,
         0.59574468,  0.54945055,  0.2724679 ],
       [ 0.30409357,  0.452569

In [76]:
X_train_norm.min(), X_train_norm.max()

(0.0, 1.0)

In [75]:
X_test_norm.min(), X_test_norm.max()

(-0.2222222222222222, 1.3085106382978724)

## 標準化(Standardization): if the data distribution is normal distribution

In [77]:
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
X_test_std

array([[ 8.94437367e-01, -3.88117877e-01,  1.10073064e+00,
        -8.12017114e-01,  1.13201117e+00,  1.09807851e+00,
         7.12041017e-01,  1.81013423e-01,  6.62804643e-02,
         5.12859235e-01,  7.96297849e-01,  4.48295020e-01,
         1.90593792e+00],
       [-1.04879931e+00, -7.72993966e-01,  5.41190056e-01,
        -2.40938809e-01,  3.49414498e-01, -7.07219221e-01,
        -3.08121293e-01,  6.76138376e-01, -1.03520519e+00,
        -9.06567274e-01,  2.24570604e+00, -5.61881713e-01,
        -1.22874035e+00],
       [ 1.64744158e+00, -4.13776283e-01,  2.42768413e-01,
        -1.56343594e+00, -2.19746720e-01,  2.84881333e-01,
         4.47191186e-01, -5.61674005e-01,  5.74658458e-01,
         6.79643589e-02,  2.98063782e-01,  1.38631627e+00,
         1.64471473e+00],
       [ 2.02159300e-01, -4.90751501e-01, -9.13615453e-01,
        -2.58536553e+00,  6.48338887e-02,  5.28840485e-01,
         6.82613258e-01, -8.91757306e-01, -6.11556861e-01,
        -3.00662824e-01,  4.33945800e

## by python 

In [78]:
ex = np.array([0, 1, 2, 3, 4, 5])

print('standardized:', (ex - ex.mean()) / ex.std())

# Please note that pandas uses ddof=1 (sample standard deviation) 
# by default, whereas NumPy's std method and the StandardScaler
# uses ddof=0 (population standard deviation)

# normalize
print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))

standardized: [-1.46385011 -0.87831007 -0.29277002  0.29277002  0.87831007  1.46385011]
normalized: [0.  0.2 0.4 0.6 0.8 1. ]
