In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the Titanic training split and preview its first rows.
train_csv_path = '../input/titanic/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

# Choosing features

It is better to remove the "Cabin" column because it has too many NaN values.

In [3]:
# Number of missing values in each column.
data.isnull().sum()

In [4]:
# Total number of rows, for comparison with the missing-value counts.
data.shape[0]

I also won't use the "Ticket" column, as it has too many unique values.

In [5]:
# Compare the number of distinct ticket values with the number of rows.
unique_tickets = data['Ticket'].nunique()
total_rows = len(data)
print(unique_tickets, total_rows)

The difference between the mean and the median of "Age" is not that big, but I still prefer to use the median.

In [6]:
# Print the mean of Age, then its median, one value per line.
age = data['Age']
for statistic in (age.mean(), age.median()):
    print(statistic)

In [7]:
# Frequency of each embarkation port (missing values are not counted).
embarked_ports = data['Embarked']
embarked_ports.value_counts()

Transforming categorical features into numerical ones

In [8]:
# Integer codes for the two categorical columns that are kept as features.
gender = {'male': 0, 'female': 1}
embarked = {'C': 0, 'S': 1, 'Q': 2}

# Encode only while a column is still non-numeric, so re-running this cell is
# a no-op instead of failing with a KeyError on the already-encoded integers.
if not pd.api.types.is_numeric_dtype(data['Sex']):
    data['Sex'] = data['Sex'].map(gender)

if not pd.api.types.is_numeric_dtype(data['Embarked']):
    # Missing ports are filled with 'S' before encoding
    # (presumably the most frequent port -- confirm against the
    # value_counts() output above).
    data['Embarked'] = data['Embarked'].fillna('S').map(embarked)

# .map() yields NaN for a value missing from the dictionaries; fail loudly
# rather than letting an unmapped category slip through.
assert data[['Sex', 'Embarked']].notna().all().all(), \
    'unmapped category in Sex/Embarked'

# Visualisation

The heatmap shows that the target feature ("Survived") is influenced mostly by Pclass, Fare and Sex. Age, SibSp and Parch can be dropped.

In [9]:
import seaborn as sns

# Pairwise correlations between the target and the candidate features,
# drawn as an annotated heatmap.
heatmap_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
correlations = data[heatmap_columns].corr()
sns.heatmap(correlations, annot=True)

For visualising the categorical and ordinal features (Sex and Pclass) I will use bar charts

In [10]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# One bar chart per categorical feature, side by side.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

for ax, column in zip(axs, ['Sex', 'Pclass']):
    # Take both the bar labels and the bar heights from the SAME
    # value_counts() result. unique() returns values in order of appearance
    # while value_counts() sorts by frequency, so pairing the two can attach
    # a count to the wrong category.
    counts = data[column].value_counts().sort_index()
    # String labels give a categorical axis with exactly one tick per category.
    ax.bar(counts.index.astype(str), counts.values)
    ax.set_title(column)
    ax.set_xlabel(column)
    ax.set_ylabel('Passengers')

plt.show()

For visualising numerical features I will use distribution plots (a histogram with a density curve)

In [11]:
# Numeric columns whose distributions are plotted; select_dtypes keeps the
# guard against a column that is not numeric.
features = data[['Fare', 'Age']].select_dtypes(include='number').columns.to_list()

for column in features:
    # One figure per feature. sns.distplot is deprecated; histplot with a KDE
    # overlay on a density scale is its replacement.
    fig, ax = plt.subplots()
    sns.histplot(data[column], kde=True, stat='density', ax=ax)
    ax.set_title(column)

In [12]:
from sklearn.model_selection import train_test_split

# Model inputs and target; 20% of the rows are held out with a fixed seed.
f = ['Sex', 'Pclass', 'Fare']
model_inputs = data[f]
target = data['Survived']
x_train, x_test, y_train, y_test = train_test_split(
    model_inputs,
    target,
    test_size=0.2,
    random_state=74,
)

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import f1_score, roc_auc_score

# Baseline model with default hyperparameters; verbose=0 suppresses the
# per-iteration training log.
cb = CatBoostClassifier(verbose=0)
cb.fit(x_train, y_train)

# Hard class labels, used for the F score.
train_pred = cb.predict(x_train)
test_pred = cb.predict(x_test)

# ROC-AUC ranks examples by score, so it needs the predicted probability of
# the positive class. Feeding it hard 0/1 labels collapses the curve to a
# single operating point.
train_proba = cb.predict_proba(x_train)[:, 1]
test_proba = cb.predict_proba(x_test)[:, 1]

print('Train accuracy')
print(f'F score: {f1_score(y_train, train_pred)}')
print(f'ROC-AUC: {roc_auc_score(y_train, train_proba)}')
print('\n')

print('Test accuracy')
print(f'F score: {f1_score(y_test, test_pred)}')
print(f'ROC-AUC: {roc_auc_score(y_test, test_proba)}')

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over learning rate, tree depth and number of iterations.
params = {'learning_rate': [0.04, 0.07, 0.1, 0.13], 'depth': [4, 6, 8], 'iterations': [250, 500, 1000]}
# verbose=0 keeps CatBoost from printing a per-iteration log for every
# candidate/fold fit performed by the search.
search = GridSearchCV(CatBoostClassifier(verbose=0), params)
search.fit(x_train, y_train)

In [None]:
# Hyperparameter combination selected by the grid search fitted above.
search.best_params_

In [None]:
# Preview the first five rows of the training feature frame.
x_train.head(5)

In [None]:
# Final model with hand-set hyperparameters; verbose=0 suppresses the
# per-iteration training log.
# NOTE(review): learning_rate=0.05 is not one of the values in the grid
# searched above (0.04, 0.07, 0.1, 0.13) -- confirm whether this was tuned by
# hand or should come from search.best_params_.
cb1 = CatBoostClassifier(learning_rate=0.05, max_depth=4, n_estimators=250, verbose=0)
cb1.fit(x_train, y_train)

# Hard class labels, used for the F score.
train_pred = cb1.predict(x_train)
test_pred = cb1.predict(x_test)

# ROC-AUC must be computed from positive-class probabilities, not from the
# thresholded 0/1 labels.
train_proba = cb1.predict_proba(x_train)[:, 1]
test_proba = cb1.predict_proba(x_test)[:, 1]

print('Train accuracy')
print(f'F score: {f1_score(y_train, train_pred)}')
print(f'ROC-AUC: {roc_auc_score(y_train, train_proba)}')
print('\n')

print('Test accuracy')
print(f'F score: {f1_score(y_test, test_pred)}')
print(f'ROC-AUC: {roc_auc_score(y_test, test_proba)}')

In [None]:
# Load the competition test split and apply the same Sex encoding as training.
test = pd.read_csv('../input/titanic/test.csv')
test['Sex'] = test['Sex'].map(gender)
# .map() yields NaN for a value missing from the dictionary; fail loudly.
assert test['Sex'].notna().all(), 'unmapped category in Sex'

# Keep the column order identical to the training frame
# (['Sex', 'Pclass', 'Fare']) so the model input layout cannot drift between
# fit and predict.
test = test[['Sex', 'Pclass', 'Fare']]

# Which of the selected columns still contain missing values?
test.isna().any()

In [None]:
# Impute only the Fare column with its median. A bare test.fillna(value)
# would write the Fare median into missing cells of every column.
test = test.fillna({'Fare': test['Fare'].median()})
prediction = cb1.predict(test)
# Show a short preview instead of dumping the whole prediction array.
prediction[:10]

In [None]:
import csv

# Re-read the raw test split: the PassengerId column was dropped from `test`
# when the model features were selected.
test = pd.read_csv('../input/titanic/test.csv')

# Build the submission as a frame and let pandas write it. This avoids the
# row-by-row csv.writer loop and its newline handling. Labels are flattened
# and cast to int so the Survived column is written as 0/1.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': np.asarray(prediction).ravel().astype(int),
})
submission.to_csv('submission1.csv', index=False)

This submission gives a score of 0.78229.