In [151]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the training passengers (the preview below shows a 'Survived' column) from ../data.
trainData = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [3]:
trainData.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
trainData.info()

In [4]:
# Shared LabelEncoder used to turn string categories into integer codes.
# The cells that use it call fit_transform() once per column, so the encoder
# only ever remembers the classes of the most recently encoded column.
label_encoder = preprocessing.LabelEncoder()

In [6]:
# Add an integer-coded copy of each categorical column.
# The shared encoder is re-fitted on every pass, so once this cell finishes it
# holds the classes of 'Embarked' only.
for code_column, source_column in {'SexCode': 'Sex', 'EmbarkedCode': 'Embarked'}.items():
    trainData[code_column] = label_encoder.fit_transform(trainData[source_column])

In [7]:
trainData['Pclass'].unique()

array([3, 1, 2])

In [12]:
trainData.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexCode,EmbarkedCode
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,2


In [133]:
# Impute missing ages with the mean of the known ages.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the `trainData['Age']` selection: that form is
# chained assignment, which emits a warning on pandas 2.x and leaves trainData
# unchanged once Copy-on-Write is active.
trainData['Age'] = trainData['Age'].fillna(trainData['Age'].mean())

In [138]:
# Truncate ages to whole years (fractional part dropped, same as int()),
# then display the column as the cell output.
trainData['Age'] = trainData['Age'].astype('int64')
trainData['Age']

0      22
1      38
2      26
3      35
4      35
       ..
886    27
887    19
888    29
889    26
890    32
Name: Age, Length: 891, dtype: int64

In [139]:
# Keep only the target and the six model features, in this exact order:
# the split cell further down selects the target as column 0 and the
# features as columns 1-6 by position.
model_columns = ['Survived', 'Age', 'Pclass', 'SibSp', 'Parch', 'SexCode', 'EmbarkedCode']
filteredTrainData = trainData[model_columns].copy()

In [140]:
# filteredTrainData.iloc[:,0]
filteredTrainData.head()

Unnamed: 0,Survived,Age,Pclass,SibSp,Parch,SexCode,EmbarkedCode
0,0,22,3,1,0,1,2
1,1,38,1,1,0,0,0
2,1,26,3,0,0,0,2
3,1,35,1,1,0,0,2
4,0,35,3,0,0,1,2


### Put data values on the same scale

In [152]:
from sklearn.preprocessing import StandardScaler

In [153]:
# Standardise every column of the filtered frame with StandardScaler and
# rebuild a DataFrame carrying the original column names.
# NOTE(review): the 'Survived' target is standardised together with the
# features, and the split cell below builds x / y from filteredTrainData, so
# df_scaled is not what the models are trained on - confirm the intent.
scaled_values = StandardScaler().fit_transform(filteredTrainData)
df_scaled = pd.DataFrame(scaled_values, columns=filteredTrainData.columns)

In [154]:
df_scaled.head()

Unnamed: 0,Survived,Age,Pclass,SibSp,Parch,SexCode,EmbarkedCode
0,-0.789272,-0.580044,0.827377,0.432793,-0.473674,0.737695,0.581114
1,1.26699,0.650112,-1.566107,0.432793,-0.473674,-1.355574,-1.93846
2,1.26699,-0.272505,0.827377,-0.474545,-0.473674,-1.355574,0.581114
3,1.26699,0.419458,-1.566107,0.432793,-0.473674,-1.355574,0.581114
4,-0.789272,0.419458,0.827377,-0.474545,-0.473674,0.737695,0.581114


### Split Train Test Data

In [143]:
# Separate the feature matrix from the target.
# 'Survived' is the first of the seven columns, so dropping it leaves exactly
# the six feature columns (Age ... EmbarkedCode) as a NumPy array.
x = filteredTrainData.drop(columns='Survived').to_numpy()
y = filteredTrainData['Survived']

In [146]:
x

array([[22,  3,  1,  0,  1,  2],
       [38,  1,  1,  0,  0,  0],
       [26,  3,  0,  0,  0,  2],
       ...,
       [29,  3,  1,  2,  0,  2],
       [26,  1,  0,  0,  1,  0],
       [32,  3,  0,  0,  1,  1]])

In [147]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split stable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [148]:
len(x_train)

668

### DecisionTree Model

In [149]:
# Fit a decision tree on the training split.
# random_state is pinned (same seed as the train/test split) because the tree
# builder shuffles candidate features at each split; without a seed the fitted
# tree, and therefore the accuracy reported below, can change between runs.
decision_tree_model = DecisionTreeClassifier(random_state=0)
decision_tree_model.fit(x_train, y_train)

In [150]:
# Score the fitted tree on the hold-out split.
predictions = decision_tree_model.predict(x_test)

# Fraction of hold-out rows classified correctly.
modelAccuracy = accuracy_score(y_true=y_test, y_pred=predictions)

# The confusion matrix is kept in a variable for inspection; it is not displayed here.
confusao = confusion_matrix(y_true=y_test, y_pred=predictions)

modelAccuracy

0.7668161434977578

### SVM Model

In [58]:
from sklearn.svm import SVC

In [96]:
# Linear-kernel SVM: fit() returns the estimator itself, so construction and
# training are chained.
svm = SVC(kernel='linear').fit(x_train, y_train)

# Predict the hold-out rows and report the share that was classified correctly.
smv_predictions = svm.predict(x_test)
model_accuracy = accuracy_score(y_true=y_test, y_pred=smv_predictions)
model_accuracy

0.7802690582959642

### RandomForestClassifier Model

In [72]:
from sklearn.ensemble import RandomForestClassifier

In [92]:
# Random forest of 100 trees, trained on the training split.
# random_state is pinned (same seed as the train/test split): the forest draws
# bootstrap samples and feature subsets at random, so an unseeded run produces
# a different model and a different accuracy every time the cell executes.
modelForest = RandomForestClassifier(n_estimators=100, random_state=0)
modelForest.fit(x_train, y_train)

# Accuracy on the hold-out split, shown as the cell output.
forest_predictions = modelForest.predict(x_test)
forest_model_accuracy = accuracy_score(y_test, forest_predictions)
forest_model_accuracy

0.7892376681614349

### NaiveBayes Model

In [75]:
from sklearn.naive_bayes import GaussianNB

In [83]:
# Gaussian Naive Bayes: fit on the training split (fit() returns the estimator)
# and score on the hold-out split.
# NOTE(review): the accuracy recorded under this cell is not a multiple of
# 1/223 (891 rows minus the 668 training rows shown earlier), so it appears to
# come from an earlier, different split - re-run the cell to refresh it.
naive_bayes = GaussianNB().fit(x_train, y_train)

naive_predictions = naive_bayes.predict(x_test)
naive_model_accuracy = accuracy_score(y_true=y_test, y_pred=naive_predictions)
naive_model_accuracy

0.7723880597014925

### Test Model

In [26]:
# Read the passengers to predict (the preview below has no 'Survived' column) from ../data.
testData = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [28]:
# Encode the test categories with encoders fitted on the TRAINING columns, so
# each category receives the same integer code the classifiers were trained on.
# Re-fitting the encoder on the test file derives the codes from whichever
# categories happen to occur there, which matches the training codes only by
# coincidence (e.g. a category absent from the test file shifts the others).
# transform() raises ValueError if the test file holds a category that was not
# present in training, surfacing the mismatch instead of silently mis-coding.
sex_encoder = preprocessing.LabelEncoder().fit(trainData['Sex'])
embarked_encoder = preprocessing.LabelEncoder().fit(trainData['Embarked'])
testData['SexCode'] = sex_encoder.transform(testData['Sex'])
testData['EmbarkedCode'] = embarked_encoder.transform(testData['Embarked'])

In [31]:
testData.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexCode,EmbarkedCode
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,2
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,2


In [32]:
# Select the five columns used for the test feature matrix.
# Note that 'Age' is not among them, unlike the training feature set.
test_feature_columns = ['Pclass', 'SibSp', 'Parch', 'SexCode', 'EmbarkedCode']
filteredTestData = testData[test_feature_columns].copy()

In [33]:
filteredTestData.head()

Unnamed: 0,Pclass,SibSp,Parch,SexCode,EmbarkedCode
0,3,0,0,1,1
1,3,1,0,0,2
2,2,0,0,1,1
3,3,0,0,1,2
4,3,1,1,0,2


In [100]:
# Feature matrix for the test file (the slice is wider than the frame, so it
# simply returns all of its columns).
# NOTE: this rebinds x_test, replacing the hold-out split produced by
# train_test_split earlier in the notebook.
x_test = filteredTestData.iloc[:, :6].to_numpy()

In [101]:
x_test

array([[3, 0, 0, 1, 1],
       [3, 1, 0, 0, 2],
       [2, 0, 0, 1, 1],
       ...,
       [3, 0, 0, 1, 2],
       [3, 0, 0, 1, 2],
       [3, 1, 1, 1, 0]])

In [102]:
# Predict survival for the passengers in the test file (no accuracy can be
# computed here: the test file has no 'Survived' labels).
#
# The previous version called `model.predict(x_test)`, but no `model` is
# defined in the notebook, so the cell raised NameError on a fresh kernel, and
# x_test held 5 columns while every classifier above was fitted on 6 features
# (Age included). The cell now builds the matching feature matrix itself and
# uses the random forest, which has the highest recorded hold-out accuracy.
feature_columns = filteredTrainData.columns.drop('Survived')
test_features = testData[feature_columns].copy()

# Treat Age like the training column: fill gaps with the training mean (a
# no-op if no ages are missing) and truncate to whole years.
test_features['Age'] = (
    test_features['Age']
    .fillna(filteredTrainData['Age'].mean())
    .astype('int64')
)

# The forest was fitted on a plain NumPy array, so predict on one as well.
predictions = modelForest.predict(test_features.to_numpy())

In [103]:
len(predictions)

418

In [104]:
# Store the predicted labels on the test frame as a new 'Survived' column
# (one prediction per row of testData, assigned positionally).
testData['Survived'] = predictions

In [105]:
testData.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexCode,EmbarkedCode,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,2,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,2,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,2,1


### Build CSV Result

In [106]:
# Assemble the result table: passenger id, the encoded input columns, and the
# predicted label as the last column.
result_columns = ['PassengerId', 'Pclass', 'SibSp', 'Parch', 'SexCode', 'EmbarkedCode']
resultData = testData[result_columns].copy()
resultData['Survived'] = predictions

In [107]:
resultData.head()

Unnamed: 0,PassengerId,Pclass,SibSp,Parch,SexCode,EmbarkedCode,Survived
0,892,3,0,0,1,1,0
1,893,3,1,0,0,2,0
2,894,2,0,0,1,1,0
3,895,3,0,0,1,2,0
4,896,3,1,1,0,2,1
