In [1]:
%matplotlib inline

# 4.3.4. Encoding categorical features
Often features are given not as continuous values but as categorical ones. For example, a person could have the features 
["male", "female"], ["from Europe", "from US", "from Asia"], ["uses Firefox", "uses Chrome", "uses Safari", "uses Internet Explorer"]. 
Such features can be efficiently coded as integers, 
for instance:
["male", "from US", "uses Internet Explorer"] could be expressed as [0, 1, 3] 
while ["female", "from Asia", "uses Chrome"] would be [1, 2, 1].

["male", "from Europe", "uses Internet Explorer"] could be expressed as [0, 0, 3] 
["female", "from US", "uses Firefox"] could be expressed as [1, 1, 0]
["male", "from Asia", "uses Chrome"] could be expressed as [0, 2, 1]
["female", "from Europe", "uses Safari"] would be [1, 0, 2].

In [106]:
# Imported here so this cell runs on a fresh kernel: the notebook's main
# import cell only appears further down.
import numpy as np
import pandas as pd

# Four example people, one categorical value per feature.
people = np.array(
    [
        ["male", "from Europe", "uses Internet Explorer"],
        ["female", "from US", "uses Firefox"],
        ["male", "from Asia", "uses Chrome"],
        ["female", "from Europe", "uses Safari"],
    ],
    dtype=str,
)
# Function-call form of print: valid on both Python 2 and Python 3.
print(type(people))

# Wrap the raw array in a DataFrame with one named column per feature.
columns = ['gender', 'continent', 'web']
df = pd.DataFrame(people, columns=columns)
df

<type 'numpy.ndarray'>


Unnamed: 0,gender,continent,web
0,male,from Europe,uses Internet Explorer
1,female,from US,uses Firefox
2,male,from Asia,uses Chrome
3,female,from Europe,uses Safari


### get_dummies() function 
Pandas is a popular Python library inspired by data frames in R. It allows easier manipulation of tabular numeric and non-numeric data. Downsides: not very intuitive, somewhat steep learning curve. For any questions you may have, Google + StackOverflow combo works well as a source of answers.

UPDATE: It turns out that Pandas has a get_dummies() function which does what we’re after. The following code will replace the categorical columns with their one-hot representations.
Alternatively, one can use Pandas to load the data, do some cleaning, and send it to Scikit-learn’s DictVectorizer; that route is not shown here. 

In [109]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per distinct value, named "<column>_<value>".
cols_to_transform = ['gender', 'continent', 'web']
df_with_dummies = pd.get_dummies(data=df, columns=cols_to_transform)
df_with_dummies

Unnamed: 0,gender_female,gender_male,continent_from Asia,continent_from Europe,continent_from US,web_uses Chrome,web_uses Firefox,web_uses Internet Explorer,web_uses Safari
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [110]:
# Integer codes of the four example rows listed above:
# ([0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2])

### OneHotEncoder is another option. The difference is as follows:

OneHotEncoder takes as input categorical values encoded as integers - you can get them from LabelEncoder.

LabelEncoder: it transforms non-numerical labels (i.e. the values of nominal categorical variables) into numerical labels. 
Numerical labels are always between 0 and n_classes-1. 

In [130]:
import sklearn.preprocessing

# Fit a LabelEncoder on a small list of city names, then encode with it.
cities = ["paris", "paris", "tokyo", "amsterdam"]
le = sklearn.preprocessing.LabelEncoder()
a = le.fit(cities)
# The next two expressions are evaluated but not displayed: a cell only
# shows the value of its last expression.
le.classes_
le.transform(cities[:3])
# Fitting and transforming in a single call; this is the displayed result.
le.fit_transform(cities)

# To map codes back to labels: le.inverse_transform([2, 2, 1])

# OneHotEncoder sketch, kept for reference and not executed:
# enc = sklearn.preprocessing.OneHotEncoder()
# enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])   # or: enc.fit(df)
# -> OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
#                  handle_unknown='error', n_values='auto', sparse=True)
# enc.transform([[0, 1, 3]]).toarray()
# -> array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])

array([1, 1, 2, 0], dtype=int64)


# 4.3.5.  Imputing missing values before building an estimator


This example shows that imputing the missing values can give better results
than discarding the samples containing any missing value.
Imputing does not always improve the predictions, so please check via cross-validation.
Sometimes dropping rows or using marker values is more effective.

Missing values can be replaced by the mean, the median or the most frequent
value using the ``strategy`` hyper-parameter.
The median is a more robust estimator for data with high magnitude variables
which could dominate results (otherwise known as a 'long tail').

Reference script output (a fresh run of the cells below may print slightly different scores)::

  Score with the entire dataset = 0.56
  Score without the samples containing missing values = 0.48
  Score after imputation of the missing values = 0.55

In this case, imputing helps the regressor get close to the original score.
  


In [55]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
#from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import cross_val_score

In [56]:
# Seeded generator so the random choices drawn from it are repeatable.
rng = np.random.RandomState(0)

# Load the data set and split it into feature matrix and target vector.
dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
n_samples = X_full.shape[0]
n_features = X_full.shape[1]
# Explicitly formatted, function-call print: emits "<n_samples> <n_features>"
# identically on Python 2 and Python 3 (the statement form is a syntax error
# on Python 3).
print("%d %d" % (n_samples, n_features))
y_full.dtype

506 13


dtype('float64')

In [57]:
# Baseline: cross-validated score of a random forest on the complete data,
# before any values are removed.
estimator = RandomForestRegressor(n_estimators=100, random_state=0)
fold_scores = cross_val_score(estimator, X_full, y_full)
score = fold_scores.mean()
print("Score with the entire dataset = %.2f" % score)

Score with the entire dataset = 0.56


In [65]:
# Small experiments with how pandas represents missing entries:
# NaN (s), empty strings (s1) and None (s2).
s = pd.Series([1, 2, 3, np.nan, np.nan])
s1 = pd.Series([1, 2, 3, '', ''])
s2 = pd.Series([1, 2, 3, None, None])
s2.isnull().sum()  # evaluated but not displayed (not the cell's last expression)

# Same idea in a DataFrame: column A uses None, column B uses NaN.
s3 = pd.DataFrame({'A': [1, 2, 3, None, None], 'B': [4, 5, 6, np.nan, np.nan]})
# Function-call form of print: valid on both Python 2 and Python 3.
print(s3)
# Number of null entries per column.
s3.isnull().sum()


     A    B
0  1.0  4.0
1  2.0  5.0
2  3.0  6.0
3  NaN  NaN
4  NaN  NaN


A    2
B    2
dtype: int64

In [36]:
# Total number of NaN entries in the full feature matrix.
np.sum(np.isnan(X_full))

0

In [29]:
# Same check through pandas: number of null entries in each feature column.
X_full_df = pd.DataFrame(data=X_full)
pd.isnull(X_full_df).sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
dtype: int64

In [41]:
# Add missing values in 75% of the lines
missing_rate = 0.75
# np.floor returns a float, but array sizes must be integers: convert once
# here so every later use of the count is an int.
n_missing_samples = int(np.floor(n_samples * missing_rate))
# Boolean row mask, built in order: False for the rows left intact, then
# True for the rows that will receive a missing value. The builtin `bool`
# replaces the deprecated `np.bool` alias.
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
# Function-call form of print: valid on both Python 2 and Python 3.
print(n_missing_samples)
print(missing_samples.shape[0])

379.0
506




In [49]:
# Randomise, in place, which rows are flagged as having a missing value.
# Note: re-running this cell reshuffles the mask again.
rng.shuffle(missing_samples)

# One randomly chosen feature index per flagged row. The size is cast to int
# defensively: randint needs an integer size, and n_missing_samples is a
# float when it is taken straight from np.floor.
missing_features = rng.randint(0, n_features, int(n_missing_samples))

missing_samples  # evaluated but not displayed (not the cell's last expression)
missing_features.shape

  app.launch_new_instance()


(379L,)

In [46]:
# "Drop incomplete rows" strategy: keep only the rows that are not flagged in
# missing_samples, then score the same model on what is left.
complete_rows = ~missing_samples
X_filtered = X_full[complete_rows, :]
y_filtered = y_full[complete_rows]
estimator = RandomForestRegressor(n_estimators=100, random_state=0)
fold_scores = cross_val_score(estimator, X_filtered, y_filtered)
score = fold_scores.mean()
print("Score without the samples containing missing values = %.2f" % score)

Score without the samples containing missing values = 0.54


In [54]:
# Build the data set with missing values, used to score imputation.
# Work on copies so X_full / y_full stay untouched.
X_missing = X_full.copy()
y_missing = y_full.copy()
# In every flagged row, overwrite the selected feature with 0; the value 0
# is what stands in for "missing" here.
flagged_rows = np.flatnonzero(missing_samples)
X_missing[flagged_rows, missing_features] = 0
X_missing

array([[  6.32000000e-03,   1.80000000e+01,   2.31000000e+00, ...,
          1.53000000e+01,   3.96900000e+02,   4.98000000e+00],
       [  2.73100000e-02,   0.00000000e+00,   7.07000000e+00, ...,
          1.78000000e+01,   3.96900000e+02,   9.14000000e+00],
       [  2.72900000e-02,   0.00000000e+00,   7.07000000e+00, ...,
          0.00000000e+00,   3.92830000e+02,   4.03000000e+00],
       ..., 
       [  6.07600000e-02,   0.00000000e+00,   1.19300000e+01, ...,
          2.10000000e+01,   3.96900000e+02,   0.00000000e+00],
       [  1.09590000e-01,   0.00000000e+00,   1.19300000e+01, ...,
          2.10000000e+01,   3.93450000e+02,   6.48000000e+00],
       [  4.74100000e-02,   0.00000000e+00,   1.19300000e+01, ...,
          2.10000000e+01,   3.96900000e+02,   7.88000000e+00]])

In [None]:
# Imputation strategy: chain an imputer (configured with 0 as the missing
# marker and the "mean" strategy) with the same random forest, and
# cross-validate the pipeline as a single estimator.
imputer = Imputer(missing_values=0, strategy="mean", axis=0)
forest = RandomForestRegressor(n_estimators=100, random_state=0)
estimator = Pipeline([("imputer", imputer), ("forest", forest)])
fold_scores = cross_val_score(estimator, X_missing, y_missing)
score = fold_scores.mean()
print("Score after imputation of the missing values = %.2f" % score)