# Задача 5. Паром.

Получите accuracy > 0.89 на тестовом датасете. Можно пользоваться любым классификатором **из sklearn**. Ансамблями пользоваться можно.

### XGBoost (eXtreme Gradient Boosting) и нейронные сети запрещены.

In [1]:
!pip install jupyter_black
import jupyter_black
jupyter_black.load()

import time
import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score

# Shared seed passed as `random_state` to the train/test split and to the
# estimators below.
RS = 1337

Collecting jupyter_black
  Downloading jupyter_black-0.4.0-py3-none-any.whl.metadata (7.8 kB)
Collecting black>=21 (from black[jupyter]>=21->jupyter_black)
  Downloading black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.2/79.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting packaging>=22.0 (from black>=21->black[jupyter]>=21->jupyter_black)
  Downloading packaging-24.1-py3-none-any.whl.metadata (3.2 kB)
Collecting pathspec>=0.9.0 (from black>=21->black[jupyter]>=21->jupyter_black)
  Downloading pathspec-0.12.1-py3-none-any.whl.metadata (21 kB)
Collecting tokenize-rt>=3.2.0 (from black[jupyter]>=21->jupyter_black)
  Downloading tokenize_rt-6.1.0-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading jupyter_black-0.4.0-py3-none-any.whl (7.6 kB)
Downloading black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_

In [2]:
# Load the passenger list. PassengerId is consumed as the index while reading
# and then discarded in favour of a fresh positional index.
csv_path = "/kaggle/input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv"
df = pd.read_csv(csv_path, index_col="PassengerId").reset_index(drop=True)

## 1. Первичный анализ

In [3]:
def get_data_info(data):
    """Show a quick overview of ``data`` in the notebook.

    Displays the first 40 rows, then the summary statistics for all columns
    (``describe(include="all")``), and finally calls ``data.info()``.
    """
    first_rows = data.head(40)
    display(first_rows)

    summary = data.describe(include="all")
    display(summary)

    data.info()


get_data_info(df)

Unnamed: 0,Country,Firstname,Lastname,Sex,Age,Category,Survived
0,Sweden,ARVID KALLE,AADLI,M,62,P,0
1,Estonia,LEA,AALISTE,F,22,C,0
2,Estonia,AIRI,AAVASTE,F,21,C,0
3,Sweden,JURI,AAVIK,M,53,C,0
4,Sweden,BRITTA ELISABET,AHLSTROM,F,55,P,0
5,Sweden,GERD INGA MAGNHILD,AHLSTROM,F,71,P,0
6,Sweden,HJALMAR,AHLSTROM,M,60,P,0
7,Estonia,PILLE,AHMAN,F,18,P,0
8,Estonia,TORMI,AINSALU,M,30,C,0
9,Sweden,ANNA MARIA,ALDRIN,F,63,P,0


Unnamed: 0,Country,Firstname,Lastname,Sex,Age,Category,Survived
count,989,989,989,989,989.0,989,989.0
unique,16,849,774,2,,2,
top,Sweden,ANDRES,ANDERSSON,M,,P,
freq,550,8,15,503,,796,
mean,,,,,44.575329,,0.138524
std,,,,,17.235146,,0.345624
min,,,,,0.0,,0.0
25%,,,,,30.0,,0.0
50%,,,,,44.0,,0.0
75%,,,,,59.0,,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Country    989 non-null    object
 1   Firstname  989 non-null    object
 2   Lastname   989 non-null    object
 3   Sex        989 non-null    object
 4   Age        989 non-null    int64 
 5   Category   989 non-null    object
 6   Survived   989 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 54.2+ KB


In [4]:
# Missing-value check: share of NaN per column, in percent, rounded to whole
# numbers and rendered as a colour-graded table.
missing_pct = (df.isna().mean() * 100).round()
pd.DataFrame(missing_pct).style.background_gradient("coolwarm")

Unnamed: 0,0
Country,0.0
Firstname,0.0
Lastname,0.0
Sex,0.0
Age,0.0
Category,0.0
Survived,0.0


### Вывод:
1. Категориальные признаки Country, Sex, Category необходимо закодировать
2. Колонку с возрастом можем сделать категориальной
3. Колонки с именем и фамилией не информативны, удалим их
4. Пропусков нет

## 2. Подготовка данных

### 2.1. Добавим категориальный столбец для возраста

In [5]:
def age_type(line):
    """Map a single age value to a coarse age-group label.

    Args:
        line: One scalar age value (despite the parameter name, this is a
            single value, not a row).

    Returns:
        str: One of the following labels:

        * ``"None"``   - the age is missing (``None``/NaN) or equal to 0
        * ``"Old"``    - age > 64
        * ``"Middle"`` - 30 < age <= 64
        * ``"Young"``  - 17 < age <= 30
        * ``"Child"``  - 2 < age <= 17
        * ``"Baby"``   - everything else (non-zero age <= 2)
    """
    # NaN is truthy, so a bare `not line` check lets it through; every
    # comparison below is then False and the missing age would be labelled
    # "Baby". Test for missing values explicitly.
    # NOTE(review): 0 is still treated as "unknown age" (kept from the
    # original truthiness check) - confirm 0 is not a real infant age.
    if pd.isna(line) or not line:
        return "None"

    if line > 64:
        return "Old"

    if line > 30:
        return "Middle"

    if line > 17:
        return "Young"

    if line > 2:
        return "Child"

    return "Baby"


# Derive the categorical age-group column from the numeric Age column.
df["AgeType"] = df["Age"].map(age_type)

### Большинство пассажиров было из Швеции и Эстонии, оставим эти страны, остальные объединим под одним типом

In [6]:
# Keep Estonia and Sweden as their own categories; every other country is
# collapsed into a single "Others" bucket.
major_countries = ["Estonia", "Sweden"]
df["CountryType"] = df["Country"].where(
    df["Country"].isin(major_countries), "Others"
)

### 2.2. Обозначим признаки и цель прогнозирования

In [7]:
# Model inputs: every column except the label, the raw country (replaced by
# CountryType) and the passenger names.
dropped_columns = ["Survived", "Country", "Firstname", "Lastname"]
features = df.drop(columns=dropped_columns)

# Prediction target.
target = df["Survived"]

### 2.3. Разобьём данные на обучающую и тестовую выборки

In [8]:
# Hold out 30% of the rows for testing; stratify on the Survived label so
# both parts keep the same class ratio.
split_parts = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=RS,
    stratify=target,
)
X_train, X_test, y_train, y_test = split_parts

### 2.4. Кодирование данных

In [9]:
# Columns that get one-hot encoded vs. standardised.
categorical_features = ["Sex", "Category", "AgeType", "CountryType"]
numeric_features = ["Age"]

# One-hot encoding of the categorical columns; categories not seen during
# fit are ignored at transform time instead of raising.
one_hot_step = (
    "ohe",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    categorical_features,
)
# Standardisation of the numeric columns.
scaling_step = ("scaling", StandardScaler(), numeric_features)

column_transformer = ColumnTransformer(
    transformers=[one_hot_step, scaling_step],
    remainder="passthrough",  # columns not listed above are passed through as-is
)

## 3. Обучение модели
Обучаем ансамбль моделей: knn, gradient boosting, logistic regression, random forest. Для logistic regression и random forest используем веса классов

In [10]:
def _make_pipeline(classifier, step_names=("preprocessor", "classifier")):
    """Chain the shared ``column_transformer`` with ``classifier``.

    ``step_names`` gives the names of the preprocessing step and the
    estimator step, in that order.
    """
    prep_name, clf_name = step_names
    return Pipeline(
        steps=[
            (prep_name, column_transformer),
            (clf_name, classifier),
        ]
    )


# Base models: each one is the same preprocessing followed by a classifier.
knn = _make_pipeline(KNeighborsClassifier())

gb = _make_pipeline(GradientBoostingClassifier(random_state=RS))

lr = _make_pipeline(LogisticRegression(random_state=RS, class_weight="balanced"))

# The random forest uses its own step names and hand-set class weights.
rf = _make_pipeline(
    RandomForestClassifier(
        class_weight={0: 0.423, 1: 0.589},
        random_state=RS,
        n_estimators=500,
        n_jobs=-1,
    ),
    step_names=("ohe_and_scaling", "BalanceRF"),
)

# Soft voting over the four pipelines.
ensemble = VotingClassifier(
    estimators=[("rf", rf), ("lr", lr), ("gb", gb), ("knn", knn)],
    voting="soft",
)

## 4. Проверка на тестовой выборке

In [11]:
# Fit the voting ensemble on the training part.
model = ensemble.fit(X=X_train, y=y_train)

# Score the hold-out part: accuracy of the predicted labels.
y_preds = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_preds)

acc

0.8585858585858586