In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")
import random

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [3]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [4]:
train, X_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')  
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1:]

y_train_ = y_train.astype(bool).astype(int)

In [5]:
def split_feature(df, feature, new_features, sep):
    df[new_features] = df[feature].str.split(sep, expand=True)
    return df

In [6]:
def drop_features(df, features):
    df.drop(features, axis=1, inplace=True)
    return df

In [7]:
def cast_feature(df, feature, cast):
    df[feature] = df[feature].astype(cast)
    return df

In [8]:
X_test = split_feature(X_test, 'PassengerId', ['GroupId', 'IdWithinGroup'], '_')
X_train = split_feature(X_train, 'PassengerId', ['GroupId', 'IdWithinGroup'], '_')

X_test = split_feature(X_test, 'Cabin', ['Deck', 'Num', 'Side'], '/')
X_train = split_feature(X_train, 'Cabin', ['Deck', 'Num', 'Side'], '/')

X_test = drop_features(X_test, ['Name', 'PassengerId', 'Cabin', 'VIP', 'Num'])
X_train = drop_features(X_train, ['Name', 'PassengerId', 'Cabin', 'VIP', 'Num'])

X_test = cast_feature(X_test, 'GroupId', 'float')
X_train = cast_feature(X_train, 'GroupId', 'float')

In [9]:
X_train.isnull().sum()

HomePlanet       201
CryoSleep        217
Destination      182
Age              179
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
GroupId            0
IdWithinGroup      0
Deck             199
Side             199
dtype: int64

In [10]:
home_planet_deck = X_train.groupby(['HomePlanet', 'Deck']).size().unstack().fillna(0)
home_planet_deck

Deck,A,B,C,D,E,F,G,T
HomePlanet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Earth,0.0,0.0,0.0,0.0,395.0,1614.0,2498.0,0.0
Europa,252.0,766.0,734.0,186.0,128.0,0.0,0.0,4.0
Mars,0.0,0.0,0.0,282.0,330.0,1110.0,0.0,0.0


In [11]:
def impute_cryo_sleep(df):
    df.loc[
        ((df['RoomService'] == 0.0) | df['RoomService'].isnull()) & 
        ((df['FoodCourt'] == 0.0) | df['FoodCourt'].isnull()) & 
        ((df['ShoppingMall'] == 0.0) | df['ShoppingMall'].isnull()) & 
        ((df['Spa'] == 0.0) | df['Spa'].isnull()) &
        ((df['VRDeck'] == 0.0) | df['VRDeck'].isnull()) &
        (df['CryoSleep'].isnull()), 
        'CryoSleep'
    ] = True
    
    df.loc[
        ((df['RoomService'] > 0.0) | 
        (df['FoodCourt'] > 0.0) | 
        (df['ShoppingMall'] > 0.0) | 
        (df['Spa'] > 0.0) |
        (df['VRDeck'] > 0.0)) & (df['CryoSleep'].isnull()), 
        'CryoSleep'
    ] = False
    return df

X_train = impute_cryo_sleep(X_train)
X_test = impute_cryo_sleep(X_test)

In [12]:
def impute_home_planet_by_deck(df):
    df.loc[
        (df['Deck'] == 'G') & (df['HomePlanet'].isnull()), 
        'HomePlanet'
    ] = 'Earth'
    
    europa_decks = ['A', 'B', 'C', 'T']
    df.loc[
        (df['Deck'].isin(europa_decks)) & (df['HomePlanet'].isnull()), 
        'HomePlanet'
    ] = 'Europa'
    df.loc[
        (df['Deck'] == 'F') & (df['HomePlanet'].isnull()), 
        'HomePlanet'
    ] = 'Mars'

    return df

X_train = impute_home_planet_by_deck(X_train)
X_test = impute_home_planet_by_deck(X_test)

In [13]:
home_planet_deck = X_train.groupby(['HomePlanet', 'Deck']).size().unstack().fillna(0)

earth = home_planet_deck.loc['Earth']
earth_proba = list(earth / sum(earth))

europa = home_planet_deck.loc['Europa']
europa_proba = list(europa / sum(europa))

mars = home_planet_deck.loc['Mars']
mars_proba = list(mars / sum(mars))

decks = X_train['Deck'].unique()
deck_values = sorted(decks[~pd.isnull(decks)]) #['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
planet_proba = dict(zip(['Earth', 'Mars', 'Europa'], [earth_proba, mars_proba, europa_proba]))

In [14]:
np.random.seed(240304)

In [15]:
def impute_deck_by_home_planet(df):
    for planet in planet_proba.keys():
        planet_null_decks_shape = df.loc[(df['HomePlanet'] == planet) & (df['Deck'].isnull()), 'Deck'].shape[0]
        df.loc[(df['HomePlanet'] == planet) & (df['Deck'].isnull()), 'Deck'] = np.random.choice(deck_values, planet_null_decks_shape, p=planet_proba[planet]) 
    return df
               
X_train = impute_deck_by_home_planet(X_train)
X_test = impute_deck_by_home_planet(X_test)

In [16]:
def impute_age_by_planet(df):
    for planet in ['Europa', 'Earth', 'Mars']:
        planet_median = df[df['HomePlanet'] == planet]['Age'].median()
        df.loc[(df["Age"].isnull()) & (df["HomePlanet"] == planet),"Age"] = planet_median
    return df

X_train = impute_age_by_planet(X_train)
X_test = impute_age_by_planet(X_test)

In [17]:
X_train.isnull().sum()

HomePlanet        39
CryoSleep          0
Destination      182
Age                0
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
GroupId            0
IdWithinGroup      0
Deck               6
Side             199
dtype: int64

In [18]:
def impute_usluga_by_age(df):
    uniq_age = df['Age'].unique()
    uslugi = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for age in uniq_age:
        for usluga in uslugi:
            usluga_median = df[df['Age'] == age][usluga].median()
            df.loc[(df[usluga].isnull()) & (df['Age'] == age), usluga] = usluga_median
    return df

X_train = impute_usluga_by_age(X_train)
X_test = impute_usluga_by_age(X_test)

In [19]:
X_train.isnull().sum()

HomePlanet        39
CryoSleep          0
Destination      182
Age                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
GroupId            0
IdWithinGroup      0
Deck               6
Side             199
dtype: int64

In [20]:
numerical_columns = X_train.describe().columns
categorical_columns = set(X_train.columns) - set(numerical_columns)

In [21]:
for col in numerical_columns:
    si = SimpleImputer(strategy='median')
    X_train[col] = si.fit_transform(X_train[col].values.reshape(-1, 1))
    X_test[col] = si.fit_transform(X_test[col].values.reshape(-1, 1))    

In [22]:
for col in categorical_columns:
    si = SimpleImputer(strategy='most_frequent')
    X_train[[col]] = si.fit_transform(X_train[[col]])

In [23]:
def log_transform_data(df):
    for col in numerical_columns[1:-1]:
        df[col] = np.log(1 + df[col])
    return df

X_train = log_transform_data(X_train)
X_test = log_transform_data(X_test)

In [24]:
X_train_ = pd.get_dummies(X_train)
X_test_ = pd.get_dummies(X_test)

In [None]:
best_acc = 0.0
best_model = None
while best_acc<0.81:
    seed = random.randint(0, 500000)
    X_train, X_test, y_train, y_test = train_test_split(X_train_, y_train_, test_size=0.25, random_state=seed)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # Преобразуйте DataFrame целевой переменной в массив NumPy
    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()
    
    # Преобразование массивов NumPy в тензоры PyTorch
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
    class TitanicClassifier(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(TitanicClassifier, self).__init__()
            self.fc1 = nn.Linear(input_size, hidden_size)
            self.relu = nn.ReLU()
            self.fc2 = nn.Linear(hidden_size, output_size)
            self.sigmoid = nn.Sigmoid()
    
        def forward(self, x):
            out = self.fc1(x)
            out = self.relu(out)
            out = self.fc2(out)
            out = self.sigmoid(out)
            return out
    # Инициализация модели, функции потерь и оптимизатора
    input_size = X_train.shape[1]
    hidden_size = 64
    output_size = 1
    model = TitanicClassifier(input_size, hidden_size, output_size)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Обучение модели

    for epoch in range(10, 150):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor.view(-1, 1))
        loss.backward()
        optimizer.step()
    
    # Оценка модели на тестовом наборе данных
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        predicted = (outputs > 0.5).float()
        accuracy = accuracy_score(y_test_tensor.numpy(), predicted.numpy())
        
    if accuracy > best_acc:
        best_acc = accuracy
        best_model = model
        print(f'Accuracy on test set: {accuracy} for num_epochs = {num_epochs} seed = {seed}')

Accuracy on test set: 0.7741490340386384 for num_epochs = 64 seed = 205282
Accuracy on test set: 0.7764489420423183 for num_epochs = 64 seed = 273780
Accuracy on test set: 0.781048758049678 for num_epochs = 64 seed = 285812
Accuracy on test set: 0.7842686292548298 for num_epochs = 64 seed = 330951
Accuracy on test set: 0.7888684452621895 for num_epochs = 64 seed = 294690
Accuracy on test set: 0.7930082796688133 for num_epochs = 64 seed = 56632
Accuracy on test set: 0.7994480220791168 for num_epochs = 64 seed = 419034


In [None]:
# Выполнение прямого прохода (forward pass) через модель для тестового набора данных
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    predicted = (outputs > 0.5).float()  # Преобразование вероятностей в бинарные значения (0 или 1)
# Создание DataFrame с предсказанными значениями
sub = pd.DataFrame()
sub['PassengerId'] = pd.read_csv('test.csv')['PassengerId']  # Предположим, что в вашем тестовом CSV файле есть столбец 'PassengerId'
sub['Transported'] = pd.Series(predicted.numpy().flatten()).astype(bool)  # Преобразование предсказанных значений в тип bool

# Сохранение предсказанных значений в CSV файл
# sub.to_csv('submission.csv', index=False)