# Preprocess the dataset

### 1. Encoding the features

In [29]:
import pandas as pd
# Small toy dataset with a color, a size, a price and a class label per row.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [30]:
# Encode the ordinal 'size' feature as integers that keep the ordering
# M < L < XL.
size_map = dict(M=1, L=2, XL=3)
df['size'] = df['size'].map(size_map)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [32]:
# Replace the string class labels with integer codes. The fitted encoder is
# kept in `Le`.
from sklearn.preprocessing import LabelEncoder

Le = LabelEncoder()
Le.fit(df['classlabel'].values)
df['classlabel'] = Le.transform(df['classlabel'].values)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [38]:
# One-hot encode the nominal feature: get_dummies expands the string-valued
# 'color' column into indicator columns and passes the numeric columns through.
selected_columns = ['color', 'size', 'price', 'classlabel']
af = pd.get_dummies(df[selected_columns])
af

Unnamed: 0,size,price,classlabel,color_blue,color_green,color_red
0,1,10.1,0,0.0,1.0,0.0
1,2,13.5,1,0.0,0.0,1.0
2,3,15.3,0,1.0,0.0,0.0


### 2. Data scaling and partitioning

In [49]:
# Load the UCI wine data; header=None because the file is read without a
# header row, so the columns are numbered.
wine_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
df_wine = pd.read_csv(wine_data_url, header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [52]:
import numpy as np
# Give the 14 numbered columns readable names: the class label first,
# followed by the 13 measured features.
df_wine.columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash',
    'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids',
    'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue',
    'OD280/OD315 of diluted wines', 'Proline',
]

# A bare expression in the middle of a cell is evaluated but never displayed,
# so print the distinct class labels explicitly.
print('Class labels: %s' % (np.unique(df_wine['Class label']),))
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [53]:
# Print the column dtypes and non-null counts of the wine data (quick check for missing values).
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
Class label                     178 non-null int64
Alcohol                         178 non-null float64
Malic acid                      178 non-null float64
Ash                             178 non-null float64
Alcalinity of ash               178 non-null float64
Magnesium                       178 non-null int64
Total phenols                   178 non-null float64
Flavanoids                      178 non-null float64
Nonflavanoid phenols            178 non-null float64
Proanthocyanins                 178 non-null float64
Color intensity                 178 non-null float64
Hue                             178 non-null float64
OD280/OD315 of diluted wines    178 non-null float64
Proline                         178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 19.5 KB


In [63]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the old location only for installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Features are every column after the first; the first column is the class label.
X, y = df_wine.iloc[:, 1:], df_wine.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Note: fit each scaler on the training data only, then apply the learned
# parameters to both the training and the test data.

# Normalization (min-max scaling)
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

# Standardization
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

### 3. Feature selection - regularization

In [71]:
# Overfitting: a model fits its parameters too closely to the observations in
# the training dataset and does not generalize well to test/real data.
# Such a model has high variance.

# Here we use L1 regularization.
from sklearn.linear_model import LogisticRegression

# The solver is set explicitly: the 'l1' penalty is not supported by the
# lbfgs solver that newer scikit-learn releases use by default, while
# liblinear (the former default) supports it.
lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr.fit(X_train_std, y_train)
print(lr.score(X_train_std, y_train))
print(lr.score(X_test_std, y_test))

# One can tune the parameter C (the inverse of the regularization strength)
# to see how the coefficients shrink as the regularization strength increases.
lr.coef_

0.983870967742
0.981481481481


array([[ 0.2803952 ,  0.        ,  0.        , -0.02806246,  0.        ,
         0.        ,  0.71016269,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.23572832],
       [-0.64400696, -0.06881503, -0.05719851,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        , -0.92673373,
         0.06016375,  0.        , -0.37112266],
       [ 0.        ,  0.06161542,  0.        ,  0.        ,  0.        ,
         0.        , -0.63522219,  0.        ,  0.        ,  0.49774657,
        -0.35836174, -0.57194624,  0.        ]])

### 4. Feature importance in Random Forest

In [77]:
from sklearn.ensemble import RandomForestClassifier

# Feature names: every column of the wine frame after the class label column.
feat_labels = df_wine.columns[1:].values

# Fit the forest on the unscaled training data and read one importance score
# per feature from the fitted model.
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_

print(feat_labels)
print(importances)

['Alcohol' 'Malic acid' 'Ash' 'Alcalinity of ash' 'Magnesium'
 'Total phenols' 'Flavanoids' 'Nonflavanoid phenols' 'Proanthocyanins'
 'Color intensity' 'Hue' 'OD280/OD315 of diluted wines' 'Proline']
[ 0.10658906  0.02539968  0.01391619  0.03203319  0.02207807  0.0607176
  0.15094795  0.01464516  0.02235112  0.18248262  0.07824279  0.1319868
  0.15860977]


In [85]:
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
# Recorded from the original run:
# array([ 9, 12,  6, 11,  0, 10,  5,  3,  1,  8,  4,  7,  2])

ranked_labels = feat_labels[indices]
ranked_scores = importances[indices]

# Format notes: %2d pads the rank to width 2; %-*s left-justifies the label in
# a field whose width comes from the argument list (50 here); %.3f keeps
# three digits after the decimal point.
for pos in range(X_train.shape[1]):
    print('%2d) %-*s %.3f' % (pos + 1, 50, ranked_labels[pos], ranked_scores[pos]))

 1) Color intensity                                    0.182
 2) Proline                                            0.159
 3) Flavanoids                                         0.151
 4) OD280/OD315 of diluted wines                       0.132
 5) Alcohol                                            0.107
 6) Hue                                                0.078
 7) Total phenols                                      0.061
 8) Alcalinity of ash                                  0.032
 9) Malic acid                                         0.025
10) Proanthocyanins                                    0.022
11) Magnesium                                          0.022
12) Nonflavanoid phenols                               0.015
13) Ash                                                0.014
