In [8]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer

In [9]:
# Load the training and test sets from CSV files in the working directory.
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

# Inspect the training data: schema, summary statistics and sample rows.
#pd.set_option('display.max_columns', None) #display all the columns
print('check data info including: column name, the number of Nah, data type')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
train_data.info()
print('-'*30)
print('check data description')
# Summary statistics of the numeric columns.
print(train_data.describe())
print('-'*30)
print('check discrete value distribution')
# Summary (count / unique / top / freq) of the object-typed columns.
print(train_data.describe(include=['O']))
print('-'*30)
print('check the first 5 head data')
print(train_data.head())
print('-'*30)
print('check the last 5 tail data')
print(train_data.tail())

check data info including: column name, the number of Nah, data type
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
------------------------------
check data description
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  71

In [13]:
# --- Missing-value imputation ---------------------------------------------
# Each filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas 2.x and does not modify the frame under Copy-on-Write.

# use the column's average age to fill missing Age values
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
# use the column's average fare to fill missing Fare values
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

print(train_data['Embarked'].value_counts())
# fill missing Embarked values with 'S', the most frequent port in the
# training data (see the value counts printed just above)
train_data['Embarked'] = train_data['Embarked'].fillna('S')
test_data['Embarked'] = test_data['Embarked'].fillna('S')

# --- Feature selection ------------------------------------------------------
# A search over feature subsets (e.g. greedy hill climbing) could be used to
# check which combination gives the highest accuracy; here the set is fixed.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# split the frames into training features, training labels and test features
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]

print('show features what u selected')
print(train_features)

# --- Vectorization ----------------------------------------------------------
# DictVectorizer one-hot encodes string-valued features (e.g. Sex becomes the
# two columns Sex=female / Sex=male) and keeps numeric features as they are.
dvec = DictVectorizer(sparse=False)
# Convert the frame to a list of per-row dicts first. The orient value must be
# spelled 'records': the abbreviation 'record' is rejected by pandas >= 2.0.
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))
print(dvec.feature_names_)

# --- Model ------------------------------------------------------------------
# Decision tree that chooses splits by entropy (information gain).
# random_state is fixed so the fitted tree, and therefore the scores printed
# below, are reproducible from run to run.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
# train the model
clf.fit(train_features, train_labels)

# vectorize the test features with the vectorizer fitted on the training data
# (transform only, so the column layout matches the training matrix)
test_features = dvec.transform(test_features.to_dict(orient='records'))
# predict labels for the test set with the decision tree
pred_labels = clf.predict(test_features)

# accuracy on the training data itself (the same rows the tree was fitted on,
# so this is an optimistic estimate)
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print(u'score accurate rate: %.4lf' % acc_decision_tree)

# mean accuracy over 10-fold cross-validation on the training data
print(u'cross_val_score accurate rate %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))


S    646
C    168
Q     77
Name: Embarked, dtype: int64
show features what u selected
     Pclass     Sex        Age  SibSp  Parch     Fare Embarked
0         3    male  22.000000      1      0   7.2500        S
1         1  female  38.000000      1      0  71.2833        C
2         3  female  26.000000      0      0   7.9250        S
3         1  female  35.000000      1      0  53.1000        S
4         3    male  35.000000      0      0   8.0500        S
..      ...     ...        ...    ...    ...      ...      ...
886       2    male  27.000000      0      0  13.0000        S
887       1  female  19.000000      0      0  30.0000        S
888       3  female  29.699118      1      2  23.4500        S
889       1    male  26.000000      0      0  30.0000        C
890       3    male  32.000000      0      0   7.7500        Q

[891 rows x 7 columns]
['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']
score accurate rate: 0