# Example: Neural Network and XGBoost

In [26]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import numpy as np

## CSV imports

In [2]:
# Load the training and test sets.
# The paths are handed directly to read_csv so that pandas opens and closes
# each file itself; wrapping them in open(...) left the handles unclosed.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Cleaning unwanted columns

In [3]:
# Remove the four columns that the later cells do not use as features,
# applying the same drop to both the training and the test frame.
train, test = (
    frame.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    for frame in (train, test)
)

train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.0750,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


## Correcting categorical features

In [4]:
# Indicator (one-hot) columns for the passenger class; the numeric levels are
# renamed to the Portuguese labels for first / second / third.
pclass = pd.get_dummies(train['Pclass']).rename(
    columns={1: 'primeira', 2: 'segunda', 3: 'terceira'}
)

# Indicator columns for the embarkation port, with the one-letter codes
# expanded to the port names.
embarked = pd.get_dummies(train['Embarked']).rename(
    columns={'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
)

# Indicator columns for sex; the level names are kept as they are.
sex = pd.get_dummies(train['Sex'])

# Attach the three indicator frames by matching on the row index, then drop
# the original categorical columns they replace.
train = (
    train
    .merge(pclass, left_index=True, right_index=True)
    .merge(embarked, left_index=True, right_index=True)
    .merge(sex, left_index=True, right_index=True)
    .drop(columns=['Pclass', 'Embarked', 'Sex'])
)

train

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,primeira,segunda,terceira,Cherbourg,Queenstown,Southampton,female,male
0,0,22.0,1,0,7.2500,0,0,1,0,0,1,0,1
1,1,38.0,1,0,71.2833,1,0,0,1,0,0,1,0
2,1,26.0,0,0,7.9250,0,0,1,0,0,1,1,0
3,1,35.0,1,0,53.1000,1,0,0,0,0,1,1,0
4,0,35.0,0,0,8.0500,0,0,1,0,0,1,0,1
5,0,,0,0,8.4583,0,0,1,0,1,0,0,1
6,0,54.0,0,0,51.8625,1,0,0,0,0,1,0,1
7,0,2.0,3,1,21.0750,0,0,1,0,0,1,0,1
8,1,27.0,0,2,11.1333,0,0,1,0,0,1,1,0
9,1,14.0,1,0,30.0708,0,1,0,1,0,0,1,0


## Do the same with the test dataset

In [5]:
# Build the same indicator (one-hot) columns for the test frame so that it
# ends up with the feature names used for the training frame.
pclass = pd.get_dummies(test['Pclass']).rename(
    columns={1: 'primeira', 2: 'segunda', 3: 'terceira'}
)

embarked = pd.get_dummies(test['Embarked']).rename(
    columns={'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
)

sex = pd.get_dummies(test['Sex'])

# Attach the indicator frames by matching on the row index, then drop the
# original categorical columns they replace.
test = (
    test
    .merge(pclass, left_index=True, right_index=True)
    .merge(embarked, left_index=True, right_index=True)
    .merge(sex, left_index=True, right_index=True)
    .drop(columns=['Pclass', 'Embarked', 'Sex'])
)

## Fill in the NaN values

In [6]:
# Replace missing values with the mean of the same column.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on a selected column: that form mutates an intermediate
# object, emits a chained-assignment warning on recent pandas, and leaves the
# frame untouched when Copy-on-Write is enabled.
# NOTE(review): the test columns are imputed with the test set's own means,
# as in the original cell; consider reusing the training means instead.
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
test

Unnamed: 0,Age,SibSp,Parch,Fare,primeira,segunda,terceira,Cherbourg,Queenstown,Southampton,female,male
0,34.50000,0,0,7.8292,0,0,1,0,1,0,0,1
1,47.00000,1,0,7.0000,0,0,1,0,0,1,1,0
2,62.00000,0,0,9.6875,0,1,0,0,1,0,0,1
3,27.00000,0,0,8.6625,0,0,1,0,0,1,0,1
4,22.00000,1,1,12.2875,0,0,1,0,0,1,1,0
5,14.00000,0,0,9.2250,0,0,1,0,0,1,0,1
6,30.00000,0,0,7.6292,0,0,1,0,1,0,1,0
7,26.00000,1,1,29.0000,0,1,0,0,0,1,0,1
8,18.00000,0,0,7.2292,0,0,1,1,0,0,1,0
9,21.00000,2,0,24.1500,0,0,1,0,0,1,0,1


## Implementation of XGBoost

In [40]:
# gender_submission.csv is read as a template for the output file: its
# 'Survived' column is overwritten with the model's predictions below.
# The path is passed directly so pandas closes the file itself.
results = pd.read_csv('gender_submission.csv')

X_train = train.drop(['Survived'], axis=1)
y_train = train['Survived']

X_test = test

dtrain = xgb.DMatrix(X_train, label=y_train)
# No label is attached to the test matrix: prediction does not use one, and
# the template's 'Survived' column is about to be overwritten anyway.
dtest = xgb.DMatrix(X_test)

param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.003,  # the training step for each iteration
    'verbosity': 0,  # logging mode - quiet (replaces the deprecated 'silent')
    'objective': 'multi:softprob',  # per-class probabilities for each row
    'num_class': 2}  # 'Survived' takes the values 0 and 1 in the training table
num_round = 20  # the number of training iterations

bst = xgb.train(param, dtrain, num_round)

# One row of class probabilities per test sample; keep the most likely class.
preds = bst.predict(dtest)
best_preds = np.argmax(preds, axis=1)

results['Survived'] = best_preds
results.to_csv('results_xgboost.csv', index=False)

# Last expression of the cell, so the notebook shows a preview of the output.
results.head(10)



## Implementation of Neural Network

In [33]:
# gender_submission.csv is read as a template for the output file: its
# 'Survived' column is overwritten with the model's predictions below.
# The path is passed directly so pandas closes the file itself.
results = pd.read_csv('gender_submission.csv')

X_train = train.drop(['Survived'], axis=1)
y_train = train['Survived']

X_test = test

# Two hidden layers of 16 units each, trained with plain SGD; random_state
# fixes the weight initialisation so reruns give the same model.
# NOTE(review): the features are fed in unscaled (e.g. Age and Fare next to
# 0/1 indicators); MLPs are usually sensitive to feature scale - consider
# standardising the inputs and confirm the effect on the predictions.
clf = MLPClassifier(
    activation='relu',
    solver='sgd',
    alpha=1e-5,
    hidden_layer_sizes=(16, 16),
    random_state=1,
)

clf.fit(X_train, y_train)

myResults = clf.predict(X_test)

results['Survived'] = myResults
results.to_csv('results_neuralnetwork.csv', index=False)

# Last expression of the cell, so the notebook shows a preview of the output.
results.head(10)

0.787081339713


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0
