## Import packages

In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer, QuantileTransformer
from sklearn.metrics import classification_report
from time import time
import pandas as pd

In [2]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import warnings
# Suppress every warning of category UserWarning for the rest of the session.
# NOTE(review): presumably aimed at warnings raised during the skopt search — confirm.
warnings.filterwarnings("ignore", category=UserWarning) 

### Helper function to get and process the data

In [3]:
def dataget(train_path, test_path):
    """Load the train/test CSVs and build cleaned, one-hot encoded frames.

    The two files are concatenated so that title extraction, imputation and
    encoding are applied identically to both, then split apart again on
    whether ``Survived`` is present.

    Parameters
    ----------
    train_path : path or buffer accepted by ``pd.read_csv``
        Labelled data; must contain a ``Survived`` column.
    test_path : path or buffer accepted by ``pd.read_csv``
        Unlabelled data; rows end up with ``Survived`` missing.

    Returns
    -------
    original : pandas.DataFrame
        Cleaned train+test rows *before* one-hot encoding.
    train_data : pandas.DataFrame
        Encoded rows that have a label; ``Survived`` is ``uint8`` and
        ``PassengerId`` is removed.
    test_data : pandas.DataFrame
        Encoded rows without a label; ``Survived`` is removed and
        ``PassengerId`` is kept.

    Raises
    ------
    KeyError
        If a title parsed from ``Name`` has no entry in the title mapping.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    # Join the train and test data to cleanse and enhance them together.
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    df = pd.concat([train_data, test_data], ignore_index=True)
    Titles_Dictionary = {
                        "Capt":         "Officer",
                        "Col":          "Officer",
                        "Major":        "Officer",
                        "Jonkheer":     "Royalty",
                        "Don":          "Royalty",
                        "Sir":          "Royalty",
                        "Dr":           "Officer",
                        "Rev":          "Officer",
                        "the Countess": "Royalty",
                        "Dona":         "Royalty",
                        "Mme":          "Mrs",
                        "Mlle":         "Miss",
                        "Ms":           "Mrs",
                        "Mr":           "Mr",
                        "Mrs":          "Mrs",
                        "Miss":         "Miss",
                        "Master":       "Master",
                        "Lady":         "Royalty"
                        }
    ## Extract the title (text between the first ',' and the following '.')
    ## from each Name and map it to its group.
    raw_titles = (df['Name'].str.split(',').str[1]
                            .str.split('.').str[0]
                            .str.strip())
    mapped_titles = raw_titles.map(Titles_Dictionary)
    unmapped = mapped_titles.isnull()
    if unmapped.any():
        # Fail with the offending values instead of an opaque lookup error.
        unknown = sorted(set(raw_titles[unmapped].astype(str)))
        raise KeyError("Unmapped passenger title(s): %s" % ", ".join(unknown))
    df['Title'] = mapped_titles
    ## Fill missing Embarked with 'C' (assignment rather than chained
    ## inplace=True, which does not write back under copy-on-write).
    df['Embarked'] = df['Embarked'].fillna('C')
    ## Note down which Ages are about to be imputed
    df['Imputed'] = df['Age'].isnull().astype('uint8')
    columns = ['Age', 'Fare']
    groups = ['Title', 'Embarked']
    # Means of the observed values, used when a whole group has no value.
    overall_means = df[columns].mean()
    ## Fill null Age/Fare with the mean of the (Title, Embarked) group
    df[columns] = df.groupby(groups)[columns].transform(lambda x: x.fillna(x.mean()))
    ## A group with no observed value has a NaN mean; fall back to the
    ## overall mean so no row is silently dropped or left NaN.
    df[columns] = df[columns].fillna(overall_means)
    ## Convert to categorical data
    categories = ['Title', 'Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked']
    df = df.astype({name: 'category' for name in categories})
    df = df.drop(columns=['Cabin', 'Name', 'Ticket'])
    df = df.round(2)
    original = df.copy()
    # Pin uint8 so the encoded matrix stays numeric (newer pandas defaults
    # to bool dummy columns).
    df = pd.get_dummies(df, drop_first=True, dtype='uint8')
    # Rows that came from the test file are exactly those with no label.
    is_test = df['Survived'].isnull()
    test_data = df[is_test].drop(columns=['Survived'])
    train_data = df[~is_test].drop(columns=['PassengerId'])
    train_data['Survived'] = train_data['Survived'].astype('uint8')

    return original, train_data, test_data

In [4]:
# Build the cleaned frames from the CSVs in the working directory.
train_path, test_path = 'train.csv', 'test.csv'
original, train_data, test_data = dataget(train_path, test_path)

# Preview the cleaned (pre-encoding) rows without the identifier column;
# drop() returns a new frame, so `original` itself is left untouched.
df = original.drop(columns=['PassengerId'])
print(df.head(10))

     Age Embarked   Fare Parch Pclass     Sex SibSp  Survived   Title  Imputed
0  22.00        S   7.25     0      3    male     1       0.0      Mr        0
1  38.00        C  71.28     0      1  female     1       1.0     Mrs        0
2  26.00        S   7.92     0      3  female     0       1.0    Miss        0
3  35.00        S  53.10     0      1  female     1       1.0     Mrs        0
4  35.00        S   8.05     0      3    male     0       0.0      Mr        0
5  36.24        Q   8.46     0      3    male     0       0.0      Mr        1
6  54.00        S  51.86     0      1    male     0       0.0      Mr        0
7   2.00        S  21.08     1      3    male     3       0.0  Master        0
8  27.00        S  11.13     2      3  female     0       1.0     Mrs        0
9  14.00        C  30.07     0      2  female     1       1.0     Mrs        0


In [5]:
# Summary statistics of the numeric columns of the cleaned frame.
summary_stats = df.describe()
print(summary_stats)

               Age         Fare    Survived      Imputed
count  1309.000000  1309.000000  891.000000  1309.000000
mean     30.085829    33.285921    0.383838     0.200917
std      13.214767    51.740153    0.486592     0.400839
min       0.170000     0.000000    0.000000     0.000000
25%      22.000000     7.900000    0.000000     0.000000
50%      30.000000    14.450000    0.000000     0.000000
75%      36.240000    31.280000    1.000000     0.000000
max      80.000000   512.330000    1.000000     1.000000


In [6]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
Age         1309 non-null float64
Embarked    1309 non-null category
Fare        1309 non-null float64
Parch       1309 non-null category
Pclass      1309 non-null category
Sex         1309 non-null category
SibSp       1309 non-null category
Survived    891 non-null float64
Title       1309 non-null category
Imputed     1309 non-null uint8
dtypes: category(6), float64(3), uint8(1)
memory usage: 40.9 KB
None


In [7]:
# Run switches and constants for the cells below.
search = True
epochs = 500

# Split the encoded training frame into the feature matrix and the labels.
df = train_data
train = df.dropna()
x_train = train.drop(columns=['Survived']).values
y_train = train['Survived'].values.astype(int)

# Two candidate scalers are created; only the L1 normalizer is applied.
normalizer = Normalizer(norm='l1')
quantile_transformer = QuantileTransformer(output_distribution='normal')
X_train = normalizer.fit_transform(x_train)
# Alternative scaling, kept for reference:
#X_train = quantile_transformer.fit_transform(x_train)

In [8]:
# Search space for the k-nearest-neighbours hyper-parameters.
params_dict = {'n_neighbors': Integer(1, 50),
               'leaf_size': Integer(1, 100),
               'algorithm': Categorical(['ball_tree', 'kd_tree']),
               'weights': Categorical(['uniform', 'distance'])
               }
if search:
    # Bayesian optimisation over params_dict, scored by accuracy with cv=10.
    # random_state pins the optimiser's sampling so a re-run proposes the
    # same candidates.
    random_search = BayesSearchCV(estimator=KNeighborsClassifier(),
                                  search_spaces=params_dict,
                                  scoring='accuracy',
                                  n_iter=50,
                                  cv=10,
                                  verbose=0,
                                  n_jobs=-1,
                                  random_state=0
                                 )

    start = time()
    random_search.fit(X_train, y_train)
    print("BayesSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), random_search.total_iterations))

    print("val. score: %s" % random_search.best_score_)
    # This scores the refit estimator on the same data it was fitted on, so
    # it is a training score, not a held-out test score.
    print("train score: %s" % random_search.score(X_train, y_train))
    print(random_search.best_params_)

BayesSearchCV took 394.96 seconds for 200 candidates parameter settings.
val. score: 0.755331088664422
test score: 0.7957351290684624
{'algorithm': 'ball_tree', 'leaf_size': 100, 'n_neighbors': 8, 'weights': 'uniform'}


In [9]:
# random_search only exists when the search cell actually ran; guard so a
# fresh run with search = False does not stop here with a NameError.
if search:
    print(random_search.best_score_)
    print(random_search.best_params_)

0.755331088664422
{'algorithm': 'ball_tree', 'leaf_size': 100, 'n_neighbors': 8, 'weights': 'uniform'}


### Preprocessing Docs
#### http://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py
### Bayes Search CV docs
#### https://github.com/scikit-optimize/scikit-optimize/blob/master/skopt/searchcv.py

In [10]:
# Hyper-parameters for the final model (an earlier trial used leaf_size=2).
params = dict(algorithm='ball_tree', leaf_size=100, n_neighbors=8, weights='uniform')

# fit() returns the estimator itself, so construction and fitting can chain.
clf = KNeighborsClassifier(**params).fit(X_train, y_train)

# Predictions on the training rows themselves, paired with their labels.
y_true = y_train
y_pred = clf.predict(X_train)

In [11]:
# Per-class precision / recall / F1 for the predictions made above.
report = classification_report(y_true, y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.80      0.90      0.84       549
          1       0.80      0.63      0.70       342

avg / total       0.80      0.80      0.79       891



In [12]:
# Keep the passenger ids for the submission; everything else is the features.
resultdf = test_data[['PassengerId']].copy()
df = test_data.drop(columns=['PassengerId'])

# Apply the same normalizer that was fitted on the training features.
# Alternative scaling, kept for reference:
#test_x = quantile_transformer.transform(df)
test_x = normalizer.transform(df)
predictions = clf.predict(test_x)
resultdf['Survived'] = predictions.astype(int)

resultdf.to_csv('submission.csv', index=False)