# Titanic Challenge

<p align="center"> 
<img src="./Img/titanic.jpg">
</p>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Importing Train Dataset

train = pd.read_csv('./Files/train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Importing Test Dataset

test = pd.read_csv('./Files/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Finding out more about Train CSV

#### Variable Notes

__pclass:__ A proxy for socio-economic status (SES)

    1st = Upper
    2nd = Middle
    3rd = Lower

__age:__ Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

__sibsp:__ # of siblings / spouses aboard the Titanic. The dataset defines family relations in this way...

- Sibling = brother, sister, stepbrother, stepsister
- Spouse = husband, wife (mistresses and fiancés were ignored)

__parch:__ # of parents / children aboard the Titanic. The dataset defines family relations in this way...

- Parent = mother, father
- Child = daughter, son, stepdaughter, stepson
- Some children travelled only with a nanny, therefore parch=0 for them.

__embarked:__  Port of Embarkation.
- C = Cherbourg
- Q = Queenstown
- S = Southampton

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
train['Genre'] = enc.fit_transform(train['Sex'])
test['Genre'] = enc.fit_transform(test['Sex'])

In [6]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Genre
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0


In [7]:
test.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Genre
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S,1
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,0
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S,1
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,0
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,1


### Feature Columns and Target Data

In [8]:
feature_cols = ['Pclass','Genre','SibSp','Parch','Fare']
X_train = train[feature_cols]
y_train = train['Survived']

In [9]:
print(X_train.shape,y_train.shape)

(891, 5) (891,)


In [10]:
# 549 People who didn't survive and 342 People who survived

y_train.value_counts()

0    549
1    342
Name: Survived, dtype: int64

## Scikit-Learn Model

- Type of Problem: __Classification__ Problem
- Types of classification __algorithms__ in Machine Learning:
    1. Linear Classifiers: Logistic Regression, Naive Bayes Classifier
    2. Support Vector Machines
    3. Decision Trees
    4. Boosted Trees
    5. Random Forest
    6. Neural Networks
    7. Nearest Neighbor

### Logistic Regression

#### Fitting the model

In [11]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Test Dataset

In [12]:
X_test = test[feature_cols]
X_test.head()

Unnamed: 0,Pclass,Genre,SibSp,Parch,Fare
0,3,1,0,0,7.8292
1,3,0,1,0,7.0
2,2,1,0,0,9.6875
3,3,1,0,0,8.6625
4,3,0,1,1,12.2875


In [13]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
Pclass    418 non-null int64
Genre     418 non-null int64
SibSp     418 non-null int64
Parch     418 non-null int64
Fare      417 non-null float64
dtypes: float64(1), int64(4)
memory usage: 16.4 KB


In [14]:
X_test[(X_test['Pclass'] == 3) & (X_test['Genre'] == 1) & (X_test['SibSp'] == 0) & (X_test['Parch'] == 0)]['Fare'].mean()

8.613628971962616

In [15]:
X_test['Fare'].fillna(8.6136,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [16]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
Pclass    418 non-null int64
Genre     418 non-null int64
SibSp     418 non-null int64
Parch     418 non-null int64
Fare      418 non-null float64
dtypes: float64(1), int64(4)
memory usage: 16.4 KB


#### Predicting values

In [17]:
y_pred = logreg.predict(X_test)

In [18]:
# Construimos un DataFrame
# Diccionarios son unordered por esa razón ponemos un set_index con passengerid

titanic_kaggle = pd.DataFrame({'PassengerId':test.PassengerId, 'Survived': y_pred}).set_index('PassengerId')

In [19]:
titanic_kaggle.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [20]:
#Guardar el df en un archivo CSV

titanic_kaggle.to_csv('sub_Titanic_LogReg.csv')

### KNN KNeighbors

In [21]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
param_grid=dict(n_neighbors=range(1,31),weights=['uniform','distance'])
grid=GridSearchCV(knn,param_grid,cv=10,scoring='accuracy',return_train_score=False)
grid.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': range(1, 31), 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=0)

In [22]:
pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

Unnamed: 0,mean_test_score,std_test_score,params
0,0.739618,0.056052,"{'n_neighbors': 1, 'weights': 'uniform'}"
1,0.739618,0.056052,"{'n_neighbors': 1, 'weights': 'distance'}"
2,0.735129,0.059854,"{'n_neighbors': 2, 'weights': 'uniform'}"
3,0.754209,0.058634,"{'n_neighbors': 2, 'weights': 'distance'}"
4,0.75982,0.059395,"{'n_neighbors': 3, 'weights': 'uniform'}"
5,0.768799,0.045417,"{'n_neighbors': 3, 'weights': 'distance'}"
6,0.738496,0.03911,"{'n_neighbors': 4, 'weights': 'uniform'}"
7,0.784512,0.037256,"{'n_neighbors': 4, 'weights': 'distance'}"
8,0.762065,0.036644,"{'n_neighbors': 5, 'weights': 'uniform'}"
9,0.787879,0.032649,"{'n_neighbors': 5, 'weights': 'distance'}"


In [23]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.7934904601571269
{'n_neighbors': 26, 'weights': 'distance'}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=26, p=2,
           weights='distance')


In [24]:
knn = KNeighborsClassifier(n_neighbors=9, weights='distance')
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='distance')

In [25]:
y_pred = knn.predict(X_test)

In [26]:
# Construimos un DataFrame
# Diccionarios son unordered por esa razón ponemos un set_index con passengerid

titanic_kaggle = pd.DataFrame({'PassengerId':test.PassengerId, 'Survived': y_pred}).set_index('PassengerId')

In [27]:
#Guardar el df en un archivo CSV

titanic_kaggle.to_csv('sub_Titanic_KNN9DIST.csv')

### Support Vector Machine

In [33]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [34]:
y_pred = svc.predict(X_test)

In [35]:
# Construimos un DataFrame
# Diccionarios son unordered por esa razón ponemos un set_index con passengerid

titanic_kaggle = pd.DataFrame({'PassengerId':test.PassengerId, 'Survived': y_pred}).set_index('PassengerId')

In [36]:
titanic_kaggle.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [37]:
#Guardar el df en un archivo CSV

#titanic_kaggle.to_csv('./Submissions/sub_Titanic_SVC.csv')