In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score, precision_recall_curve, classification_report

from sklearn.feature_selection import SelectFromModel, GenericUnivariateSelect, mutual_info_classif
import sys
import keras
from keras.datasets import fashion_mnist
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import multilabel_confusion_matrix, classification_report,  ConfusionMatrixDisplay
import matplotlib.pyplot as plt
#np.set_printoptions(threshold=sys.maxsize)

  from pandas import MultiIndex, Int64Index


In [2]:
def some_prepare(X):
    X['group'] = X['PassengerId'].apply(lambda x: x[-2:])
    X.loc[X["Cabin"].isnull(), "Cabin"] = '-1/-1/-1'
    X[['cab1','cab2','cab3']] = pd.DataFrame(X['Cabin'].str.split('/').tolist(),
                                 columns = ['cab1','cab2','cab3'])
    X['CryoSleep'] = X['CryoSleep'].replace({True: 1, False : 0})
    X['VIP'] = X['VIP'].replace({True: 1, False : 0})
    X['NO_VR_SPA'] = ((X['Spa'] == 0 ) & (X['VRDeck'] == 0)).astype(int) 
    num_features = ['Age', 'RoomService', 'FoodCourt',  'ShoppingMall', 'Spa', 'VRDeck']
    return X

In [3]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame не содердит следующие колонки: %s" % cols_error)

In [4]:
obj_features = ['HomePlanet',  'Destination', 'cab1','cab3']
bin_features = ['CryoSleep', 'VIP', 'NO_VR_SPA']
num_features = ['Age', 'RoomService', 'FoodCourt','cab2',  'ShoppingMall', 'Spa', 'VRDeck']

In [5]:
all_feats = obj_features + num_features + bin_features

In [6]:
f_prep_pipeline = make_pipeline(
    ColumnSelector(columns=all_feats),
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            ColumnSelector(num_features),
            SimpleImputer(strategy="mean"),
            Normalizer()
        )),
        ("categorical_features", make_pipeline(
            ColumnSelector(obj_features),
            SimpleImputer(strategy="most_frequent"),
            OneHotEncoder(handle_unknown='ignore', drop='first')
        )),
        ("boolean_features", make_pipeline(
            ColumnSelector(bin_features),
            SimpleImputer(strategy="most_frequent")
            
        ))
    ])
)

In [7]:
df = pd.read_csv('train.csv')

In [8]:
df['Transported'] = df['Transported'].astype(int)

In [9]:
X_tr, X_val, y_tr, y_val = train_test_split(df.drop('Transported', axis=1), df['Transported'], random_state=7)

In [10]:
X_tr = some_prepare(X_tr)

In [11]:
X_val = some_prepare(X_val)

In [12]:
X_tr = f_prep_pipeline.fit_transform(X_tr)

In [13]:
X_val = f_prep_pipeline.transform(X_val)

In [14]:
X_val

<2174x24 sparse matrix of type '<class 'numpy.float64'>'
	with 17193 stored elements in Compressed Sparse Row format>

In [15]:
X_tr = X_tr.toarray() 

In [16]:
X_tr

array([[0.06942151, 0.        , 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.02330767, 0.02542655, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01967283, 0.        , 0.73527221, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06621591, 0.        , 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.00459822, 0.        , 0.20753289, ..., 0.        , 0.        ,
        0.        ],
       [0.01771059, 0.        , 0.        , ..., 1.        , 0.        ,
        1.        ]])

In [17]:
X_tr[0]

array([0.06942151, 0.        , 0.        , 0.99758742, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 1.        ,
       0.        , 1.        , 0.        , 1.        ])

In [28]:
y_tr

0       1
1       0
2       1
3       0
4       0
       ..
6514    1
6515    0
6516    1
6517    1
6518    1
Name: Transported, Length: 6519, dtype: int32

In [20]:
X_val = X_val.toarray() 

In [21]:
y_val.reset_index(drop=True, inplace=True)

In [22]:
y_tr.reset_index(drop=True, inplace=True)

In [23]:
model = Sequential([
  Dense(30, activation='sigmoid', input_shape=(24,)),  
  Dense(1, activation='sigmoid'),
])


model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

In [24]:
model.fit(    
  X_tr,

  y_tr,
  epochs=4,
  batch_size=32, validation_split=0.2
)

# Evaluate the model.
model.evaluate( 
  X_val,
  y_val
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[0.0, 0.4944802224636078]

In [25]:
preds = model.predict(X_val)



In [26]:
preds.T

array([[3.9873029e-09, 2.3473656e-09, 3.2299463e-09, ..., 3.4095127e-09,
        3.4468091e-09, 3.0903893e-09]], dtype=float32)

In [27]:
y_val

0       0
1       0
2       1
3       0
4       1
       ..
2169    1
2170    1
2171    0
2172    1
2173    0
Name: Transported, Length: 2174, dtype: int32