# Feature extraction

The source data does not usually come in a correct format. We have to extract what we think are potentially useful features and convert them to our learning format. This process is called feature extraction or feature engineering.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
titanic = pd.read_csv('./titanic.csv')

In [3]:
titanic.head()

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [37]:
titanic.shape

(1313, 11)

In [38]:
print titanic.columns

Index([u'row.names', u'pclass', u'survived', u'name', u'age', u'embarked',
       u'home.dest', u'room', u'ticket', u'boat', u'sex'],
      dtype='object')


In [39]:
print titanic.index

RangeIndex(start=0, stop=1313, step=1)


In [10]:
from sklearn import feature_extraction

In [42]:
def one_hot_dataframe(data, cols, replace=False):
    vec = feature_extraction.DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData

In [43]:
titanic, titanic_n = one_hot_dataframe(titanic, ['pclass','embarked','sex'], replace=True)

In [47]:
titanic.describe()

Unnamed: 0,row.names,survived,age,embarked,embarked=Cherbourg,embarked=Queenstown,embarked=Southampton,pclass=1st,pclass=2nd,pclass=3rd,sex=female,sex=male
count,1313.0,1313.0,633.0,821.0,1313.0,1313.0,1313.0,1313.0,1313.0,1313.0,1313.0,1313.0
mean,657.0,0.341965,31.194181,0.0,0.154608,0.034273,0.436405,0.24524,0.213252,0.541508,0.352628,0.647372
std,379.174762,0.474549,14.747525,0.0,0.361668,0.181998,0.496128,0.430393,0.40976,0.498464,0.47797,0.47797
min,1.0,0.0,0.1667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,329.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,657.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,985.0,1.0,41.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
max,1313.0,1.0,71.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [50]:
titanic, titanic_n = one_hot_dataframe(titanic, ['home.dest', 'room', 'ticket', 'boat'], replace=True)

In [51]:
titanic.describe()

Unnamed: 0,row.names,survived,age,embarked,embarked=Cherbourg,embarked=Queenstown,embarked=Southampton,pclass=1st,pclass=2nd,pclass=3rd,...,ticket=248744 L13,ticket=248749 L13,ticket=250647,ticket=27849,ticket=28220 L32 10s,ticket=34218 L10 10s,ticket=36973 L83 9s 6d,ticket=392091,ticket=7076,ticket=L15 1s
count,1313.0,1313.0,633.0,821.0,1313.0,1313.0,1313.0,1313.0,1313.0,1313.0,...,1313.0,1313.0,1313.0,1313.0,1313.0,1313.0,1313.0,1313.0,1313.0,1313.0
mean,657.0,0.341965,31.194181,0.0,0.154608,0.034273,0.436405,0.24524,0.213252,0.541508,...,0.000762,0.000762,0.000762,0.000762,0.002285,0.000762,0.001523,0.001523,0.000762,0.000762
std,379.174762,0.474549,14.747525,0.0,0.361668,0.181998,0.496128,0.430393,0.40976,0.498464,...,0.027597,0.027597,0.027597,0.027597,0.047764,0.027597,0.039014,0.039014,0.027597,0.027597
min,1.0,0.0,0.1667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,329.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,657.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,985.0,1.0,41.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1313.0,1.0,71.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [53]:
mean = titanic['age'].mean()

In [57]:
titanic['age'].fillna(mean, inplace=True)
titanic.fillna(0, inplace = True)

In [58]:
titanic.head()

Unnamed: 0,row.names,survived,name,age,embarked,embarked=Cherbourg,embarked=Queenstown,embarked=Southampton,pclass=1st,pclass=2nd,...,ticket=248744 L13,ticket=248749 L13,ticket=250647,ticket=27849,ticket=28220 L32 10s,ticket=34218 L10 10s,ticket=36973 L83 9s 6d,ticket=392091,ticket=7076,ticket=L15 1s
0,1,1,"Allen, Miss Elisabeth Walton",29.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,"Allison, Miss Helen Loraine",2.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,"Allison, Mr Hudson Joshua Creighton",30.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,1,"Allison, Master Hudson Trevor",0.9167,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split data into train and test

In [59]:
from sklearn.cross_validation import train_test_split

In [60]:
titanic_target = titanic['survived']

In [62]:
titanic_data = titanic.drop(['name','row.names', 'survived'], axis=1)

In [67]:
X_train, X_test, y_train, y_test = train_test_split(titanic_data, titanic_target, test_size = 0.25, random_state=33)

# Train model use training data

In [68]:
from sklearn.tree import DecisionTreeClassifier

In [69]:
dtc = DecisionTreeClassifier(criterion='entropy')

In [70]:
dtc = dtc.fit(X_train, y_train)

# Evaluate model use test data

In [71]:
from sklearn import metrics

In [72]:
y_pred = dtc.predict(X_test)

In [73]:
print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y_test, y_pred))

Accuracy:0.830


In [75]:
titanic_n[0]

KeyError: 0