
[![AnalyticsDojo](https://github.com/rpi-techfundamentals/spring2019-materials/blob/master/fig/final-logo.png?raw=1)](http://rpi.analyticsdojo.com)
<center><h1>Titanic Classification</h1></center>
<center><h3><a href = 'http://introml.analyticsdojo.com'>introml.analyticsdojo.com</a></h3></center>



# Titanic Classification

As an example of how to work with both categorical and numerical data, we will perform survival prediction for the passengers of the RMS Titanic.


In [20]:
import os
import pandas as pd
# The data already comes split into a labelled train file and an unlabelled test file.
train = pd.read_csv("https://raw.githubusercontent.com/rpi-techfundamentals/spring2019-materials/master/input/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/rpi-techfundamentals/spring2019-materials/master/input/test.csv")

# Row counts, used below to report each file's share of all passengers.
n_train = len(train)
n_test = len(test)
n_total = n_train + n_test

print("Train columns:", train.columns)
print("Train row count:", n_train, f"({round(n_train / n_total * 100.0)}%)")
print()
print("Test columns, no 'Survived'", test.columns)
print("Test row count:", n_test, f"({round(n_test / n_total * 100.0)}%)")

Train columns: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Train row count: 891 (68%)

Test columns, no 'Survived' Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Test row count: 418 (32%)


Here is a broad description of the keys and what they mean:

```
pclass          Passenger Class
                (1 = 1st; 2 = 2nd; 3 = 3rd)
survival        Survival
                (0 = No; 1 = Yes)
name            Name
sex             Sex
age             Age
sibsp           Number of Siblings/Spouses Aboard
parch           Number of Parents/Children Aboard
ticket          Ticket Number
fare            Passenger Fare
cabin           Cabin
embarked        Port of Embarkation
                (C = Cherbourg; Q = Queenstown; S = Southampton)
boat            Lifeboat
body            Body Identification Number
home.dest       Home/Destination
```

In general, it looks like `name`, `sex`, `cabin`, `embarked`, `boat`, `body`, and `home.dest` may be candidates for categorical features, while the rest appear to be numerical features. (Note that `boat`, `body`, and `home.dest` are described above but are not among the columns we loaded.) We can also look at the first couple of rows in the dataset to get a better understanding:

In [21]:
# Preview the first rows of the raw training data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Preprocessing function

We want to create a single preprocessing function that applies the same transformations to both our train and test sets.  

In [22]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Columns handled as categories.
cat_features = [
    "Pclass",
    "Sex",
    "Embarked",
]
# Columns handled as numbers.
num_features = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
def preprocess(df, num_features, cat_features, dv):
    """Impute missing values and one-hot encode a feature table.

    The caller's DataFrame is left untouched: all imputation happens on a
    copy, so the raw data stays available and re-running the cell gives the
    same result.

    Args:
        df: DataFrame containing the columns named in ``num_features`` and
            ``cat_features`` and, optionally, the column ``dv``.
        num_features: Names of numeric columns. Missing values are replaced
            with the column mean.
        cat_features: Names of categorical columns. Missing values are
            replaced with the most frequent value, then the columns are
            one-hot encoded with the first level of each dropped.
        dv: Name of the dependent-variable (label) column.

    Returns:
        Tuple ``(y, X)``: ``y`` is ``df[dv]``, or ``None`` when ``df`` has no
        such column; ``X`` is the imputed, encoded feature DataFrame.

    Note:
        The imputers are fit on whatever frame is passed in, so calling this
        separately on train and test fills each with its own statistics.
    """
    features = cat_features + num_features
    y = df[dv] if dv in df.columns else None

    # Work on a copy so the imputed values never overwrite the caller's data.
    data = df[features].copy()

    # Address missing values
    print("Total missing values before processing:", data.isna().sum().sum())

    imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    data[cat_features] = imp_mode.fit_transform(data[cat_features])
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    data[num_features] = imp_mean.fit_transform(data[num_features])
    # Does standard-scaling the numeric features improve accuracy?
    # data[num_features] = StandardScaler().fit_transform(data[num_features])
    print("Total missing values after processing:", data.isna().sum().sum())

    X = pd.get_dummies(data, columns=cat_features, drop_first=True)
    return y, X

# The training data has a 'Survived' column, so labels come back with the features.
train_y, train_X = preprocess(train, num_features, cat_features, dv='Survived')
# test_y will be None, since there's no "Survived" column in the test set.
test_y, test_X = preprocess(test, num_features, cat_features, dv='Survived')

Total missing values before processing: 179
Total missing values after processing: 0
Total missing values before processing: 87
Total missing values after processing: 0


  uniques = Index(uniques)
  uniques = Index(uniques)


In [23]:
# Inspect the encoded training features returned by preprocess
train_X

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,-0.592481,0.432793,-0.473674,-0.502445,0,1,1,0,1
1,0.638789,0.432793,-0.473674,0.786845,0,0,0,0,0
2,-0.284663,-0.474545,-0.473674,-0.488854,0,1,0,0,1
3,0.407926,0.432793,-0.473674,0.420730,0,0,0,0,1
4,0.407926,-0.474545,-0.473674,-0.486337,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...
886,-0.207709,-0.474545,-0.473674,-0.386671,1,0,1,0,1
887,-0.823344,-0.474545,-0.473674,-0.044381,0,0,0,0,1
888,0.000000,0.432793,2.008933,-0.176263,0,1,0,0,1
889,-0.284663,-0.474545,-0.473674,-0.044381,0,0,1,0,0


In [24]:
# Inspect the training labels (the 'Survived' column) returned by preprocess
train_y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [25]:
# Inspect the encoded test-set features returned by preprocess
test_X

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,0.334993,-0.499470,-0.400248,-0.498407,0,1,1,1,0
1,1.325530,0.616992,-0.400248,-0.513274,0,1,0,0,1
2,2.514175,-0.499470,-0.400248,-0.465088,1,0,1,1,0
3,-0.259330,-0.499470,-0.400248,-0.483466,0,1,1,0,1
4,-0.655545,0.616992,0.619896,-0.418471,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
413,0.000000,-0.499470,-0.400248,-0.494448,0,1,1,0,1
414,0.691586,-0.499470,-0.400248,1.313753,0,0,0,0,0
415,0.651965,-0.499470,-0.400248,-0.508792,0,1,1,0,1
416,0.000000,-0.499470,-0.400248,-0.494448,0,1,1,0,1


In [26]:
# preprocess returns None for y when the 'Survived' column is absent,
# so test_y is None: we don't have labels for the test set
assert test_y is None

In [27]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled training data as a validation set.
# Stratifying on the labels keeps the class proportions the same in both parts,
# and the fixed random_state makes the split repeatable.
smaller_train_X, val_X, smaller_train_y, val_y = train_test_split(
    train_X,
    train_y,
    train_size=0.7,
    test_size=0.3,
    random_state=122,
    stratify=train_y,
)
smaller_train_X

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
123,0.215540,-0.474545,-0.473674,-0.386671,1,0,0,0,1
565,-0.438572,1.340132,-0.473674,-0.162169,0,1,1,0,1
776,0.000000,-0.474545,-0.473674,-0.492378,0,1,1,1,0
571,1.793104,1.340132,-0.473674,0.388096,0,0,0,0,1
454,0.000000,-0.474545,-0.473674,-0.486337,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...
665,0.177063,1.340132,-0.473674,0.831478,1,0,1,0,1
612,0.000000,0.432793,-0.473674,-0.336334,0,1,0,1,0
595,0.484880,0.432793,0.767630,-0.162169,0,1,1,0,1
468,0.000000,-0.474545,-0.473674,-0.492881,0,1,1,1,0


In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [29]:
# Does changing n_neighbors improve accuracy?
classifier = KNeighborsClassifier(n_neighbors=10)
# Fit the model on the reduced training set.
classifier.fit(X=smaller_train_X, y=smaller_train_y)
# Predict on the data the model was fit on and on the held-out validation data.
smaller_train_y_pred = classifier.predict(smaller_train_X)
val_y_pred = classifier.predict(val_X)
# Compare accuracy on seen vs. unseen data.
print("Metrics score train: ", metrics.accuracy_score(y_true=smaller_train_y, y_pred=smaller_train_y_pred))
print("Metrics score validation: ", metrics.accuracy_score(y_true=val_y, y_pred=val_y_pred))

Metrics score train:  0.8105939004815409
Metrics score validation:  0.8097014925373134


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [30]:
# Predict survival for the unlabeled test passengers with the fitted classifier
test_y_pred = classifier.predict(test_X)
# NOTE: this adds a 'Survived' column to `test` in place. preprocess() checks
# `dv in df.columns`, so re-running the preprocessing cell after this one would
# return these predictions as test_y instead of None.
test["Survived"] = test_y_pred

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## Challenge
Create a function that can accept any scikit-learn model and assess its performance on the validation set, storing the results as a dataframe. 