In [1]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [2]:
# Training data; `train.info()` shows it already carries the `Transported` target.
train = pd.read_csv('train.csv')

# Test features, with a `Transported` column attached from sample_submission.csv
# by an inner join on PassengerId.
# NOTE(review): a sample-submission file presumably holds placeholder predictions,
# not ground truth - TODO confirm. If so, every metric computed against the
# `Transported` column of `test` further down is not a real performance estimate.
main_df = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv').merge(main_df, on="PassengerId", how="inner")

In [3]:
train.shape

(8693, 14)

In [4]:
test.shape

(4277, 14)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
 13  Transported   4277 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 438.7+ KB


In [7]:
def split_feature(df, feature, new_features, sep):
    """Split a string column on a separator and store the pieces as new columns.

    Args:
        df: DataFrame to modify; it is changed in place.
        feature: Name of the string column to split.
        new_features: Column names that receive the pieces, one per piece.
        sep: Separator passed to ``Series.str.split``.

    Returns:
        The same ``df`` object, now carrying the new columns.
    """
    pieces = df[feature].str.split(sep, expand=True)
    df[new_features] = pieces
    return df

In [8]:
def drop_features(df, features):
    """Remove the given columns from ``df`` in place.

    Args:
        df: DataFrame to modify.
        features: Column label or list of labels to drop.

    Returns:
        The same ``df`` object, without the dropped columns.
    """
    df.drop(columns=features, inplace=True)
    return df

In [9]:
def cast_feature(df, feature, cast):
    """Convert one column of ``df`` to another dtype, in place.

    Args:
        df: DataFrame to modify.
        feature: Name of the column to convert.
        cast: Target dtype, forwarded to ``Series.astype``.

    Returns:
        The same ``df`` object with the converted column.
    """
    converted = df[feature].astype(cast)
    df[feature] = converted
    return df

In [10]:
# Same four preparation steps for each frame, expressed as one pipeline per frame:
#   1. PassengerId "gggg_pp" -> GroupId, IdWithinGroup
#   2. Cabin "deck/num/side" -> Deck, Num, Side
#   3. drop the raw / unused columns
#   4. GroupId string -> float
test = (
    test
    .pipe(split_feature, 'PassengerId', ['GroupId', 'IdWithinGroup'], '_')
    .pipe(split_feature, 'Cabin', ['Deck', 'Num', 'Side'], '/')
    .pipe(drop_features, ['Name', 'PassengerId', 'Cabin', 'VIP', 'Num', 'Destination'])
    .pipe(cast_feature, 'GroupId', 'float')
)
train = (
    train
    .pipe(split_feature, 'PassengerId', ['GroupId', 'IdWithinGroup'], '_')
    .pipe(split_feature, 'Cabin', ['Deck', 'Num', 'Side'], '/')
    .pipe(drop_features, ['Name', 'PassengerId', 'Cabin', 'VIP', 'Num', 'Destination'])
    .pipe(cast_feature, 'GroupId', 'float')
)

In [11]:
train.isnull().sum()

HomePlanet       201
CryoSleep        217
Age              179
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
Transported        0
GroupId            0
IdWithinGroup      0
Deck             199
Side             199
dtype: int64

In [12]:
test.isnull().sum()

HomePlanet        87
CryoSleep         93
Age               91
RoomService       82
FoodCourt        106
ShoppingMall      98
Spa              101
VRDeck            80
Transported        0
GroupId            0
IdWithinGroup      0
Deck             100
Side             100
dtype: int64

In [13]:
# Replace boolean True/False values with 1/0 throughout both frames.
# In the `train` display that follows, CryoSleep appears as 0.0/1.0 and
# Transported as 0/1.
train = train.replace({True: 1, False: 0})
test = test.replace({True: 1, False: 0})

In [14]:
train

Unnamed: 0,HomePlanet,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupId,IdWithinGroup,Deck,Side
0,Europa,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0,1.0,01,B,P
1,Earth,0.0,24.0,109.0,9.0,25.0,549.0,44.0,1,2.0,01,F,S
2,Europa,0.0,58.0,43.0,3576.0,0.0,6715.0,49.0,0,3.0,01,A,S
3,Europa,0.0,33.0,0.0,1283.0,371.0,3329.0,193.0,0,3.0,02,A,S
4,Earth,0.0,16.0,303.0,70.0,151.0,565.0,2.0,1,4.0,01,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,0.0,41.0,0.0,6819.0,0.0,1643.0,74.0,0,9276.0,01,A,P
8689,Earth,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0,9278.0,01,G,S
8690,Earth,0.0,26.0,0.0,0.0,1872.0,1.0,0.0,1,9279.0,01,G,S
8691,Europa,0.0,32.0,0.0,1049.0,0.0,353.0,3235.0,0,9280.0,01,E,S


In [15]:
def impute_cryo_sleep(df):
    """Fill missing ``CryoSleep`` values from the passenger's spending, in place.

    A row with missing CryoSleep is set to True when every spend column is
    either 0 or missing, and to False when at least one spend column is
    greater than 0. Rows that already have a CryoSleep value are untouched.

    Args:
        df: DataFrame with the columns RoomService, FoodCourt, ShoppingMall,
            Spa, VRDeck and CryoSleep.

    Returns:
        None; ``df`` is modified in place.
    """
    spend = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']]
    unknown = df['CryoSleep'].isnull()

    # The two row sets below cannot overlap, so taking `unknown` once up front
    # selects exactly the same rows as re-checking it after the first write.
    nothing_spent = (spend.eq(0.0) | spend.isnull()).all(axis=1)
    something_spent = spend.gt(0.0).any(axis=1)

    df.loc[nothing_spent & unknown, 'CryoSleep'] = True
    df.loc[something_spent & unknown, 'CryoSleep'] = False
# Fill the missing CryoSleep values of both frames; each frame is edited in place.
impute_cryo_sleep(train)
impute_cryo_sleep(test)

In [16]:
train.isnull().sum()

HomePlanet       201
CryoSleep          0
Age              179
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
Transported        0
GroupId            0
IdWithinGroup      0
Deck             199
Side             199
dtype: int64

In [17]:
def impute_home_planet_by_deck(df):
    """Fill missing ``HomePlanet`` values from the cabin deck, in place.

    Rows with missing HomePlanet become 'Earth' on deck G and 'Europa' on
    decks A, B, C and T. Rows on any other deck, or with a missing deck,
    are left unchanged.

    Args:
        df: DataFrame with ``Deck`` and ``HomePlanet`` columns.

    Returns:
        None; ``df`` is modified in place.
    """
    decks_by_planet = (
        ('Earth', ['G']),
        ('Europa', ['A', 'B', 'C', 'T']),
    )
    for planet, planet_decks in decks_by_planet:
        unknown_planet = df['HomePlanet'].isnull()
        on_planet_deck = df['Deck'].isin(planet_decks)
        df.loc[on_planet_deck & unknown_planet, 'HomePlanet'] = planet

# Fill missing HomePlanet values from Deck in both frames (in place).
impute_home_planet_by_deck(train)
impute_home_planet_by_deck(test)

In [18]:
train.isnull().sum()

HomePlanet       109
CryoSleep          0
Age              179
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
Transported        0
GroupId            0
IdWithinGroup      0
Deck             199
Side             199
dtype: int64

In [19]:
# Count table: one row per HomePlanet, one column per Deck, built from `train`.
# Rows where either value is missing do not contribute to the counts.
home_planet_deck = train.groupby(['HomePlanet', 'Deck']).size().unstack().fillna(0)

# Per-planet deck probabilities; each list follows the column order of
# `home_planet_deck`.
earth = home_planet_deck.loc['Earth']
earth_proba = list(earth / sum(earth))

europa = home_planet_deck.loc['Europa']
europa_proba = list(europa / sum(europa))

mars = home_planet_deck.loc['Mars']
mars_proba = list(mars / sum(mars))

decks = train['Deck'].unique()
# Take the deck labels from the count table itself so they always line up,
# in length and order, with the probability lists above. Deriving them
# separately from train['Deck'] breaks np.random.choice(..., p=...) as soon as
# a deck occurs only on rows whose HomePlanet is missing.
deck_values = list(home_planet_deck.columns)
planet_proba = dict(zip(['Earth', 'Mars', 'Europa'], [earth_proba, mars_proba, europa_proba]))

# Seed NumPy's global RNG so the random deck draws are repeatable.
np.random.seed(2403)

def impute_deck_by_home_planet(df):
    """Fill missing ``Deck`` values with a random deck drawn per home planet, in place.

    For each planet in the module-level ``planet_proba`` mapping, rows of that
    planet with a missing Deck get a deck sampled from the module-level
    ``deck_values`` using that planet's probabilities. Sampling goes through
    ``np.random.choice``, i.e. NumPy's global random state. Rows whose
    HomePlanet is missing are not touched.

    Args:
        df: DataFrame with ``HomePlanet`` and ``Deck`` columns.

    Returns:
        None; ``df`` is modified in place.
    """
    for planet, deck_weights in planet_proba.items():
        needs_deck = (df['HomePlanet'] == planet) & df['Deck'].isnull()
        n_missing = int(needs_deck.sum())
        sampled = np.random.choice(deck_values, n_missing, p=deck_weights)
        df.loc[needs_deck, 'Deck'] = sampled
        
# Fill missing Deck values in both frames (in place). Both calls sample from
# the planet -> deck probabilities held in `planet_proba`, which were computed
# from `train`, and both consume the same global NumPy random stream, so the
# call order affects the drawn values.
impute_deck_by_home_planet(train)
impute_deck_by_home_planet(test)

In [20]:
train.isnull().sum()

HomePlanet       109
CryoSleep          0
Age              179
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
Transported        0
GroupId            0
IdWithinGroup      0
Deck               6
Side             199
dtype: int64

In [21]:
train[train['Deck'].isnull()]

Unnamed: 0,HomePlanet,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupId,IdWithinGroup,Deck,Side
1550,,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0,1645.0,1,,
1714,,0.0,67.0,143.0,68.0,496.0,182.0,9.0,1,1823.0,1,,
2227,,1.0,11.0,0.0,0.0,0.0,0.0,0.0,1,2384.0,7,,
3858,,1.0,40.0,0.0,0.0,0.0,0.0,0.0,1,4134.0,1,,
6267,,1.0,64.0,0.0,0.0,0.0,0.0,0.0,1,6634.0,2,,
7293,,0.0,21.0,1720.0,2337.0,0.0,1.0,335.0,0,7801.0,1,,


In [22]:
def impute_age_by_planet(df):
    """Fill missing ``Age`` values with the median age of the same home planet, in place.

    Medians are computed from the frame passed in, separately for 'Europa',
    'Earth' and 'Mars'. Rows with any other or missing HomePlanet keep their
    Age unchanged.

    Args:
        df: DataFrame with ``HomePlanet`` and ``Age`` columns.

    Returns:
        None; ``df`` is modified in place.
    """
    for planet in ('Europa', 'Earth', 'Mars'):
        from_planet = df['HomePlanet'] == planet
        median_age = df.loc[from_planet, 'Age'].median()
        age_unknown = df['Age'].isnull()
        df.loc[age_unknown & from_planet, 'Age'] = median_age
# Fill missing Age values in both frames (in place); each frame is filled
# with medians computed from that frame's own rows.
impute_age_by_planet(train)
impute_age_by_planet(test)

In [23]:
train.isnull().sum()

HomePlanet       109
CryoSleep          0
Age                0
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
Transported        0
GroupId            0
IdWithinGroup      0
Deck               6
Side             199
dtype: int64

In [24]:
def impute_usluga_by_age(df):
    """Fill missing service-spend values with the median spend of same-age passengers.

    ("usluga" is a transliteration of the Russian word for "service".)

    For each of RoomService, FoodCourt, ShoppingMall, Spa and VRDeck, a missing
    value is replaced by the median of that column over all rows with the same
    Age in the same frame. A value stays missing when the row's Age is missing
    or when no same-age row has a value for that column.

    The medians are computed with one grouped pass per service instead of a
    Python loop over every (age, service) pair; the filled values are the same.

    Args:
        df: DataFrame with an ``Age`` column and the five service columns.

    Returns:
        None; ``df`` is modified in place.
    """
    uslugi = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    same_age = df.groupby('Age')
    for usluga in uslugi:
        # Per-row median of this service among rows sharing the row's Age;
        # NaN is skipped by median, rows with a missing Age get NaN.
        age_median = same_age[usluga].transform('median')
        missing = df[usluga].isnull()
        df.loc[missing, usluga] = age_median[missing]

# Fill missing service-spend values in both frames (in place); each frame is
# filled with per-age medians computed from that frame's own rows.
impute_usluga_by_age(train)
impute_usluga_by_age(test)

In [25]:
train.isnull().sum()

HomePlanet       109
CryoSleep          0
Age                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Transported        0
GroupId            0
IdWithinGroup      0
Deck               6
Side             199
dtype: int64

In [26]:
# Discard rows where Deck and HomePlanet are BOTH missing; a row missing only
# one of the two is kept.
train = train.dropna(subset=['Deck', 'HomePlanet'], how='all')
test = test.dropna(subset=['Deck', 'HomePlanet'], how='all')

In [27]:
train[train['HomePlanet'].isnull()]

Unnamed: 0,HomePlanet,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupId,IdWithinGroup,Deck,Side
59,,1.0,33.0,0.0,0.0,0.0,0.0,0.0,1,64.0,02,E,S
186,,1.0,24.0,0.0,0.0,0.0,0.0,0.0,1,210.0,01,D,P
225,,0.0,18.0,313.0,1.0,691.0,283.0,0.0,0,242.0,01,F,S
291,,0.0,59.0,1018.0,0.0,209.0,0.0,0.0,0,321.0,01,F,S
365,,1.0,32.0,0.0,0.0,0.0,0.0,0.0,1,402.0,01,D,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8489,,1.0,23.0,0.0,0.0,0.0,0.0,0.0,1,9072.0,01,F,S
8515,,0.0,25.0,1258.0,0.0,22.0,19.0,0.0,0,9084.0,01,E,P
8613,,0.0,53.0,0.0,4017.0,0.0,13.0,3147.0,0,9194.0,01,E,S
8666,,0.0,38.0,28.0,1208.0,973.0,207.0,0.0,1,9248.0,01,F,S


In [28]:
# Discard every row that is still missing Side or HomePlanet.
# Rows are removed from `test` as well, so it no longer contains every
# passenger that was read from test.csv.
train = train.dropna(subset=['Side', 'HomePlanet'])
test = test.dropna(subset=['Side', 'HomePlanet'])

In [29]:
train.shape

(8391, 13)

In [30]:
train.isnull().sum()

HomePlanet       0
CryoSleep        0
Age              0
RoomService      0
FoodCourt        0
ShoppingMall     0
Spa              0
VRDeck           0
Transported      0
GroupId          0
IdWithinGroup    0
Deck             0
Side             0
dtype: int64

In [31]:
test.isnull().sum()

HomePlanet       0
CryoSleep        0
Age              0
RoomService      0
FoodCourt        0
ShoppingMall     0
Spa              0
VRDeck           0
Transported      0
GroupId          0
IdWithinGroup    0
Deck             0
Side             0
dtype: int64

In [32]:
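# Home planets seen on the decks that impute_home_planet_by_deck does not
# resolve (presumably noted from inspecting the data - TODO confirm):
# D = 'Mars', 'Europa'
# E = 'Earth', 'Mars', 'Europa'
# F = 'Earth', 'Mars'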

In [33]:
# Remove the three remaining string-valued columns so that only columns
# convertible to float are left for the model input.
train = train.drop(columns=['HomePlanet', 'Deck', 'Side'])
test = test.drop(columns=['HomePlanet', 'Deck', 'Side'])

In [34]:
# Feature matrices: every column except the target, converted to float32.
# Target vectors: the `Transported` column as a NumPy array.
X_train = train.drop(columns=['Transported']).to_numpy().astype('float32')
y_train = train['Transported'].to_numpy()
X_test = test.drop(columns=['Transported']).to_numpy().astype('float32')
y_test = test['Transported'].to_numpy()

In [35]:
y_train

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

In [36]:
X_train.shape

(8391, 9)

In [37]:
y_train.shape

(8391,)

In [38]:
X_test.shape

(4127, 9)

In [39]:
y_test.shape

(4127,)

In [65]:
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(1))

In [66]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [67]:
# Train for 10 epochs in mini-batches of 128, reporting metrics on
# (X_test, y_test) after every epoch.
# NOTE(review): y_test is the `Transported` column merged in from
# sample_submission.csv; if that file holds placeholder values rather than
# true labels (TODO confirm), the validation metrics are not meaningful.
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1c3aa547490>

In [68]:
# Evaluate on the test arrays; `score` is [loss, accuracy] in the order given
# to model.compile.
# NOTE(review): y_test originates from sample_submission.csv. If that file
# contains placeholder predictions rather than ground truth (TODO confirm),
# the accuracy printed here only measures agreement with those placeholders.
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.0
Test accuracy: 0.9714078307151794


In [64]:
# model2 = Sequential()
# model2.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
# model2.add(Dropout(0.2))
# model2.add(Dense(10))

In [45]:
# model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [46]:
# model2.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_test, y_test))

In [47]:
# score = model2.evaluate(X_test, y_test, verbose=0)
# print('Test loss:', score[0])
# print('Test accuracy:', score[1])

In [48]:
# model.save('model_accuracy_1_0.keras')
# model2.save('model_accuracy_0_97.keras')

In [49]:
# from keras.models import load_model

# model_o = load_model('model_accuracy_0_97.keras')

In [50]:
# score = model_o.evaluate(X_test, y_test, verbose=0)
# print('Test loss:', score[0])
# print('Test accuracy:', score[1])