Imports

In [258]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

Load & analysis

In [259]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_id = test["PassengerId"]

In [260]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


We can see from the table above that are nan values. 

we can also acertain that name, passengerID, ticket aren't meaninful features one could use to predict if someone survived.

Cabin feature also goes out the window because there are multiple NaN values, which we cannot make an imputation of the missing values.

In [261]:
# We can't make the assumption that NaNs can be babies/children, because there are a lot of unaccompanied people with NaN for age
train[train['Age'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [262]:
train = train.drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "SibSp", "Parch"])
test = test.drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "SibSp", "Parch"])
train

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.2500,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.9250,S
3,1,1,female,35.0,53.1000,S
4,0,3,male,35.0,8.0500,S
...,...,...,...,...,...,...
886,0,2,male,27.0,13.0000,S
887,1,1,female,19.0,30.0000,S
888,0,3,female,,23.4500,S
889,1,1,male,26.0,30.0000,C


In [263]:
# There are NaNs in Age and Embarked
train.isna().value_counts()

Survived  Pclass  Sex    Age    Fare   Embarked
False     False   False  False  False  False       712
                         True   False  False       177
                         False  False  True          2
Name: count, dtype: int64

In [264]:
# We can delete NaNs in Embarked because they are only 2
train = train.dropna(subset=["Embarked"])

In [265]:
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
train[["Age"]] = imputer.fit_transform(train[["Age"]])
#train[["Fare"]] = imputer.fit_transform(train[["Fare"]])
test[["Age"]] = imputer.fit_transform(test[["Age"]])
test[["Fare"]] = imputer.fit_transform(test[["Fare"]])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [266]:
# Embarked locations encoding S=2 Q=1 C=0 
fig = px.bar(x=train["Embarked"], y=train["Survived"])
fig.show()

In [267]:
fig = px.scatter(x=train.index, y=train["Fare"])
fig.show()

In [268]:
# Binning Fare values
train["Fare"] = pd.cut(train["Fare"], bins=[-1, 20, 100, 600], labels=["low", "medium", "high"])
test["Fare"] = pd.cut(test["Fare"], bins=[-1, 20, 100, 600], labels=["low", "medium", "high"])
train["Age"] = pd.cut(train["Age"], bins=[-1, 18, 35, 65, 100], labels=["child", "young-adult", "adult", "elderly"])
test["Age"] = pd.cut(test["Age"], bins=[-1, 18, 35, 65, 100], labels=["child", "young-adult", "adult", "elderly"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [269]:
# Encoding
encoder = OrdinalEncoder()
train["Sex"] = encoder.fit_transform(train[["Sex"]])
train["Embarked"] = encoder.fit_transform(train[["Embarked"]])
train["Fare"] = encoder.fit_transform(train[["Fare"]])
train["Age"] = encoder.fit_transform(train[["Age"]])
test["Sex"] = encoder.fit_transform(test[["Sex"]])
test["Embarked"] = encoder.fit_transform(test[["Embarked"]])
test["Fare"] = encoder.fit_transform(test[["Fare"]])
test["Age"] = encoder.fit_transform(test[["Age"]])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [270]:
encoder.categories_

[array(['adult', 'child', 'elderly', 'young-adult'], dtype=object)]

In [271]:
train

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,1.0,3.0,1.0,2.0
1,1,1,0.0,0.0,2.0,0.0
2,1,3,0.0,3.0,1.0,2.0
3,1,1,0.0,3.0,2.0,2.0
4,0,3,1.0,3.0,1.0,2.0
...,...,...,...,...,...,...
886,0,2,1.0,3.0,1.0,2.0
887,1,1,0.0,3.0,2.0,2.0
888,0,3,0.0,3.0,2.0,2.0
889,1,1,1.0,3.0,2.0,0.0


In [272]:
# fazer train test split
train_x = train[["Pclass", "Sex", "Age", "Fare", "Embarked"]] #
train_y = train["Survived"]

In [273]:
gridCV = GridSearchCV(SVC(), param_grid={'kernel':('linear', 'rbf', 'sigmoid'), 'C':[1, 5, 10], 'gamma': ['auto', 'scale']})
gridCV.fit(train_x, train_y)
gridCV.best_params_

{'C': 5, 'gamma': 'auto', 'kernel': 'rbf'}

In [274]:
svm = SVC(C=gridCV.best_params_['C'], gamma=gridCV.best_params_['gamma'], kernel=gridCV.best_params_['kernel'])
svm.fit(train_x, train_y)

In [275]:
result = svm.predict(test)

In [276]:
""" Random Forests -> worse than SVM
gridCV = GridSearchCV(RandomForestClassifier(), param_grid={'criterion':("gini", "entropy", "log_loss"), 'max_depth':[10, 50, 100]})
gridCV.fit(train_x, train_y)
gridCV.best_params_
forest = RandomForestClassifier(criterion=gridCV.best_params_['criterion'], max_depth=gridCV.best_params_['max_depth'])
forest.fit(train_x, train_y)
result = forest.predict(test)
"""

' Random Forests -> worse than SVM\ngridCV = GridSearchCV(RandomForestClassifier(), param_grid={\'criterion\':("gini", "entropy", "log_loss"), \'max_depth\':[10, 50, 100]})\ngridCV.fit(train_x, train_y)\ngridCV.best_params_\nforest = RandomForestClassifier(criterion=gridCV.best_params_[\'criterion\'], max_depth=gridCV.best_params_[\'max_depth\'])\nforest.fit(train_x, train_y)\nresult = forest.predict(test)\n'

In [277]:
submission = pd.DataFrame({'PassengerId':test_id,'Survived':result})
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


This submission gives a score of 0.77751

In [278]:
filename = 'titanic_result.csv'
submission.to_csv(filename,index=False)