In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import time



In [2]:
X_train=pd.read_csv('X_train_dummy.csv')
y_train=pd.read_csv('y_train_dummy.csv',header=None)
X_prediction = pd.read_csv('X_prediction_dummy.csv')
y_prediction=pd.read_csv('y_prediction_dummy.csv',header=None)
y_train = np.ravel(y_train)
list(X_train.columns.values)     # Features available

['Title_ Col',
 'Title_ Don',
 'Title_ Dona',
 'Title_ Dr',
 'Title_ Jonkheer',
 'Title_ Lady',
 'Title_ Major',
 'Title_ Master',
 'Title_ Miss',
 'Title_ Mlle',
 'Title_ Mme',
 'Title_ Mr',
 'Title_ Mrs',
 'Title_ Ms',
 'Title_ Rev',
 'Title_ Sir',
 'Title_ the Countess',
 'Companion',
 'Sex_male',
 'Pclass',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_Q',
 'Embarked_S',
 'Married_Woman',
 'Age_Classe_estimate']

In [3]:

# ********************. Feature extraction by logistic regression ****************************
model =LogisticRegression()
rfe = RFE(model, 10)

fit = rfe.fit(X_train, y_train)

print("Num Features: %d"% fit.n_features_) 
print("Selected Features: %s"% fit.support_) 
print("Feature Ranking: %s"% fit.ranking_)

titles=['Title_ Col', 'Title_ Don', 'Title_ Dona', 'Title_ Dr', 'Title_ Jonkheer', 'Title_ Lady', 'Title_ Major', 'Title_ Master', 'Title_ Miss', 'Title_ Mlle',
 'Title_ Mme', 'Title_ Mr', 'Title_ Mrs', 'Title_ Ms', 'Title_ Rev', 'Title_ Sir', 'Title_ the Countess', 'Companion', 'Sex_male', 'Pclass', 'SibSp',
 'Parch', 'Fare', 'Embarked_Q', 'Embarked_S', 'Married_Woman', 'Age_Classe_estimate']
drop_list=[]
for i in range(0,26):
    if fit.support_[i]==False:
        drop_list.append(titles[i])
drop_list

Num Features: 10
Selected Features: [False  True False False  True False False  True  True False False  True
  True False  True False False False  True  True  True False False False
 False False False]
Feature Ranking: [17  1 18  9  1 10 15  1  1  7 11  1  1  5  1  2 13 16  1  1  1  8 14 12
  6  3  4]


['Title_ Col',
 'Title_ Dona',
 'Title_ Dr',
 'Title_ Lady',
 'Title_ Major',
 'Title_ Mlle',
 'Title_ Mme',
 'Title_ Ms',
 'Title_ Sir',
 'Title_ the Countess',
 'Companion',
 'Parch',
 'Fare',
 'Embarked_Q',
 'Embarked_S',
 'Married_Woman']

In [4]:
X_train=X_train.drop(drop_list, axis=1)   # get rid of unselected features
X_prediction=X_prediction.drop(drop_list, axis=1)

In [5]:
X_train.head()

Unnamed: 0,Title_ Don,Title_ Jonkheer,Title_ Master,Title_ Miss,Title_ Mr,Title_ Mrs,Title_ Rev,Sex_male,Pclass,SibSp,Age_Classe_estimate
0,0,0,0,0,1,0,0,1,3,1,1
1,0,0,0,0,0,1,0,0,1,1,2
2,0,0,0,1,0,0,0,0,3,0,1
3,0,0,0,0,0,1,0,0,1,1,2
4,0,0,0,0,1,0,0,1,3,0,2


In [6]:
# ***************** Logistic regression  *********************

In [10]:
param_grid={'C':[0.01,0.1,1,10]}
grid_searchLR=GridSearchCV(LogisticRegression(),param_grid,cv=10)
grid_searchLR.fit(X_train,y_train)
LR = grid_searchLR.best_estimator_
grid_searchLR.best_params_,grid_searchLR.best_score_

({'C': 1}, 0.8204264870931538)

In [None]:
# ***************** Support vector machine ********************

In [11]:
param_grid={'C':[1,5,6,7,8,9,10,11,13,15,20,50,100]}
grid_searchSVM=GridSearchCV(SVC(gamma='auto'),param_grid,cv=10)
grid_searchSVM.fit(X_train,y_train)
SVM = grid_searchSVM.best_estimator_
grid_searchSVM.best_params_,grid_searchSVM.best_score_

({'C': 7}, 0.8226711560044894)

In [None]:
# ************** Neural Nestwork ***************************

In [12]:
activation = ['identity', 'logistic', 'tanh', 'relu']      # selection activation function 
param_grid = dict(activation=activation)
grid_searchNN=GridSearchCV(MLPClassifier(max_iter=10000),param_grid,cv=10)
grid_searchNN.fit(X_train,y_train)
grid_searchNN.best_params_,grid_searchNN.best_score_

({'activation': 'relu'}, 0.8159371492704826)

In [25]:
param_grid={'alpha':[0.00005, 0.00007, 0.0001, 0.00015, 0.0002, 0.0003, 0.0005]}     # Select penalty 
grid_searchNN=GridSearchCV(MLPClassifier(activation='relu',max_iter=10000),param_grid,cv=10)
grid_searchNN.fit(X_train,y_train)
NN = grid_searchNN.best_estimator_
grid_searchNN.best_params_,grid_searchNN.best_score_

({'alpha': 0.00015}, 0.8159371492704826)

In [14]:
# select number of hidden layers
param_grid = {'hidden_layer_sizes':[(11,11,11), (11,11,11,11), (11,11,11,11,11),(11,11,11,11,11,11)]}
grid_searchNN=GridSearchCV(MLPClassifier(activation='relu',max_iter=10000),param_grid,cv=10)
grid_searchNN.fit(X_train,y_train)

grid_searchNN.best_params_,grid_searchNN.best_score_

({'hidden_layer_sizes': (11, 11, 11, 11, 11, 11)}, 0.8148148148148148)

In [None]:
# ***************** K nearest neighboor ********************

In [15]:
param_grid = {'n_neighbors':np.arange(2,20)}
grid_searchKNN=GridSearchCV(KNeighborsClassifier(),param_grid,cv=10)
grid_searchKNN.fit(X_train,y_train)
grid_searchKNN.best_params_,grid_searchKNN.best_score_

({'n_neighbors': 6}, 0.8260381593714927)

In [16]:
weights = ['uniform', 'distance']      # 
param_grid = dict(weights=weights)
grid_searchKNN=GridSearchCV(KNeighborsClassifier(n_neighbors = 6),param_grid,cv=10)
grid_searchKNN.fit(X_train,y_train)
grid_searchKNN.best_params_,grid_searchKNN.best_score_

({'weights': 'uniform'}, 0.8260381593714927)

In [17]:
algorithm = ['auto', 'ball_tree','kd_tree','brute']      # 
param_grid = dict(algorithm=algorithm)
grid_searchKNN=GridSearchCV(KNeighborsClassifier(n_neighbors = 6),param_grid,cv=10)
grid_searchKNN.fit(X_train,y_train)
KNN = grid_searchKNN.best_estimator_
grid_searchKNN.best_params_,grid_searchKNN.best_score_

({'algorithm': 'auto'}, 0.8260381593714927)

In [None]:
# ********************* Combined *******************************

In [26]:
#Combined
eclf1 = VotingClassifier(estimators=[ ('mlp', NN), ('svm', SVM),('knn',KNN)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
scores = cross_val_score(eclf1, X_train, y_train, cv=10)
print('eclf1',np.mean(scores))

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


eclf1 0.8192719328112587


  if diff:
  if diff:


In [None]:
# ***************** Write into CSV **********************

In [27]:
predictions=eclf1.predict(X_prediction)
y_prediction = predictions
ID = np.linspace (892,1309, 418, dtype = np.int32) 
dictionary = {'PassengerId': ID, 'Survived': y_prediction} 
y_prediction_final = pd.DataFrame(dictionary)
y_prediction_final
y_prediction_final.to_csv ("predicted_class_eclf.csv", index = False)

  if diff:


In [21]:
predictions=SVM.predict(X_prediction)
y_prediction = predictions
ID = np.linspace (892,1309, 418, dtype = np.int32) 
dictionary = {'PassengerId': ID, 'Survived': y_prediction} 
y_prediction_final = pd.DataFrame(dictionary)
y_prediction_final
y_prediction_final.to_csv ("predicted_class_svm.csv", index = False)

In [28]:
predictions=NN.predict(X_prediction)
y_prediction = predictions
ID = np.linspace (892,1309, 418, dtype = np.int32) 
dictionary = {'PassengerId': ID, 'Survived': y_prediction} 
y_prediction_final = pd.DataFrame(dictionary)
y_prediction_final
y_prediction_final.to_csv ("predicted_class_mlp.csv", index = False)

In [23]:
predictions=LR.predict(X_prediction)
y_prediction = predictions
ID = np.linspace (892,1309, 418, dtype = np.int32) 
dictionary = {'PassengerId': ID, 'Survived': y_prediction} 
y_prediction_final = pd.DataFrame(dictionary)
y_prediction_final
y_prediction_final.to_csv ("predicted_class_lr.csv", index = False)

In [24]:
predictions=KNN.predict(X_prediction)
y_prediction = predictions
ID = np.linspace (892,1309, 418, dtype = np.int32) 
dictionary = {'PassengerId': ID, 'Survived': y_prediction} 
y_prediction_final = pd.DataFrame(dictionary)
y_prediction_final
y_prediction_final.to_csv ("predicted_class_knn.csv", index = False)