# <center> Обучение с учителем. Методы классификации
## <center>Практика. Дерево решений в задаче предсказания выживания пассажиров "Титаника". Решение



**<a href="https://www.kaggle.com/c/titanic">Соревнование</a> Kaggle "Titanic: Machine Learning from Disaster".**

In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

**Функция для формирования csv-файла:**

In [4]:
def write_to_submission_file(
    predicted_labels,
    out_file,
    train_num=891,
    target="Survived",
    index_label="PassengerId",
):
    """Save predictions as a Kaggle submission CSV.

    Row ids continue right after the training set: the first prediction gets
    id ``train_num + 1``, the next one ``train_num + 2``, and so on.

    Args:
        predicted_labels: Predicted values, one per test row, as a 1-D NumPy
            array or a plain list.
        out_file: Path or writable text buffer handed to ``DataFrame.to_csv``.
        train_num: Number of rows in the training set (offset for the ids).
        target: Name of the prediction column.
        index_label: Header of the id column.
    """
    # len() instead of .shape[0], so plain Python lists are accepted as well.
    n_predictions = len(predicted_labels)
    first_id = train_num + 1
    predicted_df = pd.DataFrame(
        predicted_labels,
        index=np.arange(first_id, first_id + n_predictions),
        columns=[target],
    )
    predicted_df.to_csv(out_file, index_label=index_label)

**Считываем обучающую и тестовую выборки.**

In [5]:
# Load the training and test splits from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/titanic_train.csv", "data/titanic_test.csv")
)

In [6]:
# Keep the target separately: "Survived" is dropped from train_df further below.
y = train_df["Survived"]

In [7]:
# Preview the first rows of the raw training data.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Summary statistics for every training column; include="all" also covers
# the non-numeric ones.
train_df.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [9]:
# Same summary for the test set, to compare missing counts and value ranges.
test_df.describe(include="all")

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,418,2,,,,363,,76,3
top,,,"Kelly, Mr. James",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,1,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,21.0,0.0,0.0,,7.8958,,
50%,1100.5,3.0,,,27.0,0.0,0.0,,14.4542,,
75%,1204.75,3.0,,,39.0,1.0,0.0,,31.5,,


**Заполним пропуски медианными значениями.**

In [10]:
# Medians come from the training set only, so no test-set statistics leak into
# the preprocessing; they are computed once and reused for both frames.
age_median = train_df["Age"].median()
fare_median = train_df["Fare"].median()

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# train_df["Age"]: that chained in-place form acts on an intermediate Series,
# which pandas deprecates and which stops updating the frame under
# copy-on-write.
train_df["Age"] = train_df["Age"].fillna(age_median)
test_df["Age"] = test_df["Age"].fillna(age_median)
# "S" is the most frequent embarkation port in the training data.
train_df["Embarked"] = train_df["Embarked"].fillna("S")
test_df["Fare"] = test_df["Fare"].fillna(fare_median)

**Кодируем категориальные признаки `Pclass`, `Sex`, `SibSp`, `Parch` и `Embarked` с помощью техники One-Hot-Encoding.**

In [11]:
def _append_dummies(frame):
    """Return ``frame`` with one-hot columns appended for each categorical feature."""
    # (source column, prefix of the generated indicator columns)
    column_prefixes = (
        ("Pclass", "PClass"),
        ("Sex", "Sex"),
        ("SibSp", "SibSp"),
        ("Parch", "Parch"),
        ("Embarked", "Embarked"),
    )
    indicator_blocks = [
        pd.get_dummies(frame[column], prefix=prefix)
        for column, prefix in column_prefixes
    ]
    return pd.concat([frame, *indicator_blocks], axis=1)


# Each frame is encoded independently, so the resulting indicator columns
# depend on the values actually present in that frame.
train_df = _append_dummies(train_df)
test_df = _append_dummies(test_df)

In [12]:
# Raw columns that are either replaced by their one-hot encodings or not used
# as features at all.
_unused_columns = [
    "Pclass",
    "Name",
    "Sex",
    "SibSp",
    "Parch",
    "Ticket",
    "Cabin",
    "Embarked",
    "PassengerId",
]

# Only the training frame carries the target column, so it is removed there too.
train_df.drop(columns=["Survived", *_unused_columns], inplace=True)
test_df.drop(columns=_unused_columns, inplace=True)

**В тестовой выборке появляется новое значение Parch = 9, которого нет в обучающей выборке. Проигнорируем его.**

In [13]:
# Compare (rows, columns) of both frames; differing column counts mean the
# one-hot encodings of train and test produced different indicator columns.
train_df.shape, test_df.shape

((891, 24), (418, 25))

In [14]:
# Columns that exist only in the test frame.
set(test_df.columns) - set(train_df.columns)

{'Parch_9'}

In [15]:
# Align the test columns with the training columns instead of dropping
# "Parch_9" by name: reindex removes every test-only column, adds any
# train-only indicator as all-False, and keeps the column order the models
# are fitted on.
test_df = test_df.reindex(columns=train_df.columns, fill_value=False)

In [16]:
# Inspect the final training feature matrix after encoding and column removal.
train_df.head()

Unnamed: 0,Age,Fare,PClass_1,PClass_2,PClass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,False,False,True,False,True,False,True,False,...,True,False,False,False,False,False,False,False,False,True
1,38.0,71.2833,True,False,False,True,False,False,True,False,...,True,False,False,False,False,False,False,True,False,False
2,26.0,7.925,False,False,True,True,False,True,False,False,...,True,False,False,False,False,False,False,False,False,True
3,35.0,53.1,True,False,False,True,False,False,True,False,...,True,False,False,False,False,False,False,False,False,True
4,35.0,8.05,False,False,True,False,True,True,False,False,...,True,False,False,False,False,False,False,False,False,True


In [17]:
# Inspect the final test feature matrix; its columns should match train_df.
test_df.head()

Unnamed: 0,Age,Fare,PClass_1,PClass_2,PClass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,34.5,7.8292,False,False,True,False,True,True,False,False,...,True,False,False,False,False,False,False,False,True,False
1,47.0,7.0,False,False,True,True,False,False,True,False,...,True,False,False,False,False,False,False,False,False,True
2,62.0,9.6875,False,True,False,False,True,True,False,False,...,True,False,False,False,False,False,False,False,True,False
3,27.0,8.6625,False,False,True,False,True,True,False,False,...,True,False,False,False,False,False,False,False,False,True
4,22.0,12.2875,False,False,True,True,False,False,True,False,...,False,True,False,False,False,False,False,False,False,True


## 1. Дерево решений без настройки параметров 

**Обучите на имеющейся выборке дерево решений (`DecisionTreeClassifier`) максимальной глубины 2. Используйте параметр `random_state=17` для воспроизводимости результатов.**

In [18]:
# Shallow baseline tree; the fixed seed keeps the result reproducible.
tree = DecisionTreeClassifier(random_state=17, max_depth=2)
tree.fit(X=train_df, y=y)

**Сделайте с помощью полученной модели прогноз для тестовой выборки.**

In [19]:
# Predict survival labels for the test set with the depth-2 baseline tree.
predictions = tree.predict(test_df)

**Сформируйте файл посылки и измерьте точность.**

In [20]:
# Write the baseline predictions to a submission CSV.
write_to_submission_file(predictions, "titanic_tree_predictions.csv")

<font color='red'>Вопрос 1. </font> Каков результат первой посылки (дерево решений без настройки параметров) в публичном рейтинге соревнования Titanic?
- 0.77033


**Отобразите дерево с помощью `export_graphviz` и `dot`.**

In [21]:
# Export the fitted tree in Graphviz format; render it afterwards with the
# `dot` tool, e.g. `dot -Tpng tree.dot -o tree.png`.
export_graphviz(tree, out_file="tree.dot", feature_names=train_df.columns, filled=True)

<font color='red'>Вопрос 2. </font> Сколько признаков задействуются при прогнозе деревом решений глубины 2?
- 3

## 2. Дерево решений с настройкой параметров 

**Обучите на имеющейся выборке дерево решений (`DecisionTreeClassifier`). Также укажите `random_state=17`. Максимальную глубину и минимальное число элементов в листе настройте на 5-кратной кросс-валидации с помощью `GridSearchCV`.**

In [64]:
# Candidate depths and leaf sizes: 1..4 each, 16 combinations in total.
tree_params = {
    "max_depth": [1, 2, 3, 4],
    "min_samples_leaf": [1, 2, 3, 4],
}

# Exhaustive search with 5-fold cross-validation; n_jobs=-1 uses all cores.
best_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=17),
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
best_tree.fit(train_df, y)

print("Best params:", best_tree.best_params_)
print("Best CV score", best_tree.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'max_depth': 3, 'min_samples_leaf': 3}
Best CV score 0.8103132257862031


<font color='red'>Вопрос 3. </font> Каковы лучшие параметры дерева, настроенные на кросс-валидации с помощью `GridSearchCV`?
- max_depth=3, min_samples_leaf=3

<font color='red'>Вопрос 4. </font> Какой получилась средняя доля верных ответов на кросс-валидации для дерева решений с лучшим сочетанием гиперпараметров `max_depth` и `min_samples_leaf`?
- 0.81

**Сделайте с помощью полученной модели прогноз для тестовой выборки.**

In [65]:
# GridSearchCV predicts with the best estimator refitted on the full training set.
predictions = best_tree.predict(test_df)

**Сформируйте файл посылки и измерьте результат.**

In [66]:
# Write the tuned-tree predictions to a separate submission CSV.
write_to_submission_file(predictions, 'titanic_tree_tuned_predictions.csv')

<font color='red'>Вопрос 5. </font> Каков результат второй посылки (дерево решений с настройкой гиперпараметров) в публичном рейтинге соревнования Titanic?
- 0.77751

## 3. Случайный лес

In [53]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter grid for the random forest: 3 * 3 * 3 * 3 = 81 combinations.
forest_params = {
    "n_estimators": [100, 200, 300],  # number of trees in the forest
    "max_depth": [4, 6, 8],  # maximum depth of each tree
    "min_samples_split": [2, 5, 10],  # minimum samples needed to split a node
    "min_samples_leaf": [1, 2, 4]  # minimum samples required in a leaf
}

In [54]:
# Exhaustive 5-fold search over the forest grid, parallelised over all cores.
forest_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=17),
    param_grid=forest_params,
    cv=5,
    n_jobs=-1,
)
forest_grid_search.fit(train_df, y)

In [56]:
# Collect the winning configuration and its mean cross-validated accuracy.
best_forest_params = forest_grid_search.best_params_
best_forest_score = forest_grid_search.best_score_

print("Лучшие параметры RandomForest:", best_forest_params)
print("Средняя доля верных ответов RandomForest:", best_forest_score)

# Predict with the refitted best forest and store the submission file.
forest_test_predictions = forest_grid_search.predict(test_df)
write_to_submission_file(
    predicted_labels=forest_test_predictions,
    out_file="titanic_random_forest_predictions.csv",
)

Лучшие параметры RandomForest: {'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Средняя доля верных ответов RandomForest: 0.8227229929069111


## 4. Метод k ближайших соседей (kNN)

In [58]:
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter grid for k-nearest neighbours: 4 * 2 * 4 = 32 combinations.
knn_params = {
    "n_neighbors": [3, 5, 7, 9],  # number of neighbours that vote
    "weights": ["uniform", "distance"],  # equal votes vs. closer neighbours weigh more
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]  # neighbour-search backend
}

In [67]:
# Exhaustive 5-fold search over the k-NN grid, parallelised over all cores.
# NOTE(review): the recorded output of this cell shows scoring errors raised
# from KNeighborsClassifier.predict for some candidates. Presumably this
# scikit-learn build mishandles DataFrame input on the brute-force path;
# passing NumPy arrays or upgrading scikit-learn may resolve it. TODO confirm.
# NOTE(review): Age and Fare are not scaled here, so they likely dominate the
# distances over the one-hot columns; consider standardising the features.
knn_grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
)
knn_grid_search.fit(train_df, y)

Traceback (most recent call last):
  File "/opt/anaconda3/envs/GMCS/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda3/envs/GMCS/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
  File "/opt/anaconda3/envs/GMCS/lib/python3.8/site-packages/sklearn/base.py", line 705, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/opt/anaconda3/envs/GMCS/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 246, in predict
    if self._fit_method == "brute" and ArgKminClassMode.is_usable_for(
  File "/opt/anaconda3/envs/GMCS/lib/python3.8/site-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py", line 471, in is_usable_for
    ArgKmin.is_usable_for(X, Y, metric)
  File "/opt/anaconda3/envs/GMCS/lib/python3.8/site-packages/sklearn/metrics/_pairwi

In [60]:
# Collect the winning configuration and its mean cross-validated accuracy.
best_knn_params = knn_grid_search.best_params_
best_knn_score = knn_grid_search.best_score_

print("Лучшие параметры KNeighbors:", best_knn_params)
print("Средняя доля верных ответов KNeighbors:", best_knn_score)

Лучшие параметры KNeighbors: {'algorithm': 'ball_tree', 'n_neighbors': 3, 'weights': 'uniform'}
Средняя доля верных ответов KNeighbors: 0.7217061075889775


In [61]:
# Predict with the refitted best k-NN model and store the submission file.
knn_test_predictions = knn_grid_search.predict(test_df)
write_to_submission_file(
    predicted_labels=knn_test_predictions,
    out_file="titanic_knn_predictions.csv",
)

## Ссылки:

 - <a href="https://www.kaggle.com/c/titanic">Соревнование</a> Kaggle "Titanic: Machine Learning from Disaster"
 - <a href="https://www.dataquest.io/mission/74/getting-started-with-kaggle/">Тьюториал</a> Dataquest по задаче Kaggle "Titanic: Machine Learning from Disaster"