# Ensemble/Voting Classification in Python with Scikit-Learn
ref：https://www.kaggle.com/c/titanic/submit

In [90]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier

In [91]:
training_data = pd.read_csv('Data/train.csv')
testing_data = pd.read_csv('Data/test.csv')

print(training_data.shape)
training_data.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [92]:
print(testing_data.shape)
testing_data.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [93]:
def get_nulls(training, testing):
    print("Training Data:")
    print(pd.isnull(training).sum())
    print("Testing Data:")
    print(pd.isnull(testing).sum())

get_nulls(training_data, testing_data)

Training Data:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Testing Data:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [94]:
# Drop the cabin column, as there are too many missing values
# Drop the ticket numbers too, as there are too many categories
# Drop names as they won't really help predict survivors
training_data.drop(labels = ['Ticket','Cabin','Name'], axis = 1,inplace=True)
testing_data.drop(labels =['Ticket','Cabin','Name'], axis = 1,inplace=True)


# Taking the mean/average value would be impacted by the skew
# so we should use the median value to impute missing values
training_data["Age"].fillna(training_data["Age"].median(),inplace=True)
testing_data["Age"].fillna(testing_data["Age"].median(),inplace=True)
training_data["Embarked"].fillna('S',inplace=True)
testing_data["Fare"].fillna(testing_data["Fare"].median(),inplace=True)


get_nulls(training_data, testing_data)

Training Data:
PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64
Testing Data:
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


In [95]:
# Fit the encoder on the data (Feature: Sex)
encoder_1 = LabelEncoder()
encoder_1.fit(training_data["Sex"])

# Transform and replace training data
training_sex_encoded = encoder_1.transform(training_data["Sex"])
training_data["Sex"] = training_sex_encoded
test_sex_encoded = encoder_1.transform(testing_data["Sex"])
testing_data["Sex"] = test_sex_encoded

# Fit the encoder on the data (Feature: Embarked)
encoder_2 = LabelEncoder()
encoder_2.fit(testing_data["Embarked"])
# encoder_3 = LabelEncoder()
# encoder_3.fit(testing_data["Embarked"])

# Transform and replace training data
training_Embarked_encoded = encoder_2.transform(training_data["Embarked"])
training_data["Embarked"] = training_Embarked_encoded
test_Embarked_encoded = encoder_2.transform(testing_data["Embarked"])
testing_data["Embarked"] = test_Embarked_encoded



# Any value we want to reshape needs be turned into array first
ages_train = np.array(training_data["Age"]).reshape(-1, 1)
ages_test = np.array(testing_data["Age"]).reshape(-1, 1)
Fare_train = np.array(training_data["Fare"]).reshape(-1, 1)
Fare_test = np.array(testing_data["Fare"]).reshape(-1, 1)

# Scaler takes arrays
scaler = StandardScaler()

training_data["Age"] = scaler.fit_transform(ages_train)
testing_data["Age"] = scaler.fit_transform(ages_test)
training_data["Fare"] = scaler.fit_transform(Fare_train)
testing_data["Fare"] = scaler.fit_transform(Fare_test)

print(training_data)
print(testing_data)


     PassengerId  Survived  Pclass  Sex       Age  SibSp  Parch      Fare  \
0              1         0       3    1 -0.565736      1      0 -0.502445   
1              2         1       1    0  0.663861      1      0  0.786845   
2              3         1       3    0 -0.258337      0      0 -0.488854   
3              4         1       1    0  0.433312      1      0  0.420730   
4              5         0       3    1  0.433312      0      0 -0.486337   
..           ...       ...     ...  ...       ...    ...    ...       ...   
886          887         0       2    1 -0.181487      0      0 -0.386671   
887          888         1       1    0 -0.796286      0      0 -0.044381   
888          889         0       3    0 -0.104637      1      2 -0.176263   
889          890         1       1    1 -0.258337      0      0 -0.044381   
890          891         0       3    1  0.202762      0      0 -0.492378   

     Embarked  
0           2  
1           0  
2           2  
3          

In [96]:
# Now to select our training/testing data
X_features = training_data.drop(labels=['PassengerId', 'Survived'], axis=1)
y_labels = training_data['Survived']

print(X_features.head(5))
print(y_labels.head(5))

# Make the train/test data from validation

X_train, X_val, y_train, y_val = train_test_split(X_features, y_labels, test_size=0.1,random_state=12)

   Pclass  Sex       Age  SibSp  Parch      Fare  Embarked
0       3    1 -0.565736      1      0 -0.502445         2
1       1    0  0.663861      1      0  0.786845         0
2       3    0 -0.258337      0      0 -0.488854         2
3       1    0  0.433312      1      0  0.420730         2
4       3    1  0.433312      0      0 -0.486337         2
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


## Simple Averaging Approach

In [83]:
LogReg_clf = LogisticRegression()
DTree_clf = DecisionTreeClassifier()
SVC_clf = SVC()

LogReg_clf.fit(X_train, y_train)
DTree_clf.fit(X_train, y_train)
SVC_clf.fit(X_train, y_train)

LogReg_pred = LogReg_clf.predict(X_val)
DTree_pred = DTree_clf.predict(X_val)
SVC_pred = SVC_clf.predict(X_val)

averaged_preds = (LogReg_pred + DTree_pred + SVC_pred)//3
acc = accuracy_score(y_val, averaged_preds)
print(acc)

0.8


## Bagging Classification Example

In [108]:
LogReg_Bagging = BaggingClassifier(estimator=LogReg_clf, n_estimators=50, random_state=42)
Dtree_Bagging = BaggingClassifier(estimator=DTree_clf, n_estimators=50, random_state=42) 
SVC_Bagging = BaggingClassifier(estimator=SVC_clf, n_estimators=50, random_state=42) 

def bagging_ensemble(model):
    k_folds = KFold(n_splits=20, random_state=12,shuffle=True)
    results = cross_val_score(model, X_train, y_train, cv=k_folds)
    return results.mean()

print('LogReg_bagging =', bagging_ensemble(LogReg_Bagging))
print('Dtree_bagging =', bagging_ensemble(Dtree_Bagging))
print('SVC_bagging =', bagging_ensemble(SVC_Bagging))


LogReg_bagging = 0.7939634146341463
Dtree_bagging = 0.8101219512195122
SVC_bagging = 0.8264329268292683


## Boosting Classification Example

In [85]:
k_folds = KFold(n_splits=20, random_state=42,shuffle=True)
num_estimators = [20, 40, 60, 80, 100]

for i in num_estimators:
    ada_boost = AdaBoostClassifier(n_estimators=i, random_state=42)
    results = cross_val_score(ada_boost,X_train,y_train,cv = k_folds,error_score='raise')
    print("Results for {} estimators:".format(i))
    print(results.mean())

Results for 20 estimators:
0.8115548780487807
Results for 40 estimators:
0.8128353658536586
Results for 60 estimators:
0.8153353658536586
Results for 80 estimators:
0.8140853658536585
Results for 100 estimators:
0.8115853658536585


## voting\Stacking Classification Example

In [86]:
voting_clf = VotingClassifier(estimators=[('SVC', SVC_clf), ('DTree', DTree_clf), ('LogReg', LogReg_clf)], voting='hard')
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_val)
acc = accuracy_score(y_val, preds)
l_loss = log_loss(y_val, preds)
f1 = f1_score(y_val, preds)

print("Accuracy is: " + str(acc))
print("Log Loss is: " + str(l_loss))
print("F1 Score is: " + str(f1))

Accuracy is: 0.8333333333333334
Log Loss is: 6.0072755648528595
F1 Score is: 0.7761194029850748


In [87]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
715,3,1,-0.796286,0,0,-0.494391,2
319,1,0,0.817561,1,1,2.059694,0
829,1,0,2.508257,0,0,0.962353,2
79,3,0,0.049062,0,0,-0.397241,2
484,1,1,-0.335187,1,0,1.185430,0
...,...,...,...,...,...,...,...
241,3,0,-0.104637,1,0,-0.336334,1
253,3,1,0.049062,1,0,-0.324253,2
390,1,1,0.510161,1,2,1.767741,2
667,3,1,-0.104637,0,0,-0.491874,2


In [98]:
testing_data_for_sub = testing_data.drop(labels = ['PassengerId'],axis =1)
testing_data_for_sub

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,0.386231,0,0,-0.497413,1
1,3,0,1.371370,1,0,-0.512278,2
2,2,1,2.553537,0,0,-0.464100,1
3,3,1,-0.204852,0,0,-0.482475,2
4,3,0,-0.598908,1,1,-0.417492,2
...,...,...,...,...,...,...,...
413,3,1,-0.204852,0,0,-0.493455,2
414,1,0,0.740881,0,0,1.314435,0
415,3,1,0.701476,0,0,-0.507796,2
416,3,1,-0.204852,0,0,-0.493455,2


In [100]:
submission = pd.DataFrame(columns = ['PassengerId', 'Survived'])
submission['PassengerId'] = testing_data['PassengerId']
submission['Survived'] = voting_clf.predict(testing_data_for_sub)
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [104]:
submission.to_csv('submission_1.csv', index=False)

In [105]:
# SVC
submission_svc = pd.DataFrame(columns = ['PassengerId', 'Survived'])
submission_svc['PassengerId'] = testing_data['PassengerId']
submission_svc['Survived'] = SVC_clf.predict(testing_data_for_sub)
submission_svc

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [106]:
submission_svc.to_csv('submission_svc.csv', index=False)

In [None]:
SVC_Bagging

In [112]:
# SVC_bagging
submission_svc_b = pd.DataFrame(columns = ['PassengerId', 'Survived'])
submission_svc_b['PassengerId'] = testing_data['PassengerId']
SVC_Bagging.fit(X_train,y_train)
submission_svc_b['Survived'] = SVC_Bagging.predict(testing_data_for_sub)
submission_svc_b

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [113]:
submission_svc_b.to_csv('submission_svc_b.csv', index=False)