In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer, OrdinalEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.isotonic import IsotonicRegression
from sklearn.model_selection import KFold

In [3]:
data = pd.read_csv('train.csv')
X, y = data.drop('Survived', axis=1), data['Survived']

In [4]:
features_to_keep = ['SibSp', 'Pclass', 'Parch', 'Survived']
categorical_features = ['Embarked', 'Sex']
binned_features = ['Age', 'Fare']

class TicketCountTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

        
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            return X.groupby('Ticket')['Ticket'].transform('count').values.reshape(-1, 1)
        raise ValueError("No dataframe.")
        
    def get_feature_names_out(self, input_features=None):
        return np.array(['ticket_count'])


In [5]:
class IsotonicallyCalibratedModel(ClassifierMixin):
    def __init__(self, base_estimator=None, cv: int = 5):
        if base_estimator is None:
            raise ValueError('No base estimator set')
        self.cv = cv
        self.estimator = base_estimator
        self.pipelines = None
        self.cutoff = 0.5
        
    def fit(self, X=None, y=None):
        if not isinstance(X, pd.DataFrame): 
            raise ValueError('Ты недоумок1')
        if not isinstance(y, pd.DataFrame):
            raise ValueError('Ты недоумок2')
        self.classes_ = np.unique(y)

        if isinstance(self.cv, int):
            folds = KFold(n_splits=self.cv, shuffle=True).split(X, y)
        else:
            # Предполагаем, что cv - это итерируемый объект с индексами
            folds = self.cv
            
        self.pipelines = []
        for train_index, test_index in folds:
            estimator = clone(self.estimator)
            estimator.fit(X.iloc[train_index], y.iloc[train_index])
            
            # Получаем вероятности для положительного класса
            proba = estimator.predict_proba(X.iloc[test_index])
            proba = proba[:, 1]  # Берем вероятности положительного класса
            
            y_test = np.array(y.iloc[test_index]).reshape((-1, ))
            iso_reg = IsotonicRegression(out_of_bounds='clip').fit(proba, y_test)
            self.pipelines.append((estimator, iso_reg))
        self.is_fitted_ = True
        return self

    def predict_proba(self, X=None):
        preds = []
        for estimator, iso_reg in self.pipelines:
            proba = estimator.predict_proba(X)
            if proba.shape[1] == 2:
                proba = proba[:, 1]
            else:
                proba = proba[:, 0]
            pred = iso_reg.transform(proba)
            preds.append(pred)
            
        # Усредняем предсказания по всем пайплайнам
        trues = np.mean(preds, axis=0).astype(float)
        return (np.vstack([1 - trues, trues])).T
    
    def predict(self, X=None):
        return (self.predict_proba(X)[:, 1] > self.cutoff).astype(int)

In [10]:
class BayesianSurvivalClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, features_to_keep=None, categorical_features=None, 
                 binned_features=None, n_bins=5, network_structure=None):
        self.features_to_keep = features_to_keep or ['SibSp', 'Pclass', 'Parch']
        self.categorical_features = categorical_features or ['Embarked', 'Sex']
        self.binned_features = binned_features or ['Age', 'Fare']
        self.n_bins = n_bins
        self.network_structure = network_structure or [
            ('Pclass', 'Survived'),
            ('Sex', 'Survived'),
            ('Age', 'Survived'),
            ('Fare', 'Survived'),
            ('Embarked', 'Survived'),
            ('ticket_count', 'Survived'),
            ('SibSp', 'Survived'),
            ('Parch', 'Survived')
        ]
        self.preprocessor = self._create_preprocessor()
        self.model = None
        self.inference = None

    def _create_preprocessor(self):
            numeric_pipeline = Pipeline([
                ('imputer', SimpleImputer(strategy='median'))
            ])
    
            categorical_pipeline = Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OrdinalEncoder())
            ])
    
            binned_pipeline = Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('binner', KBinsDiscretizer(n_bins=self.n_bins, encode='ordinal', strategy='uniform'))
            ])
    
            return ColumnTransformer(
                transformers=[
                    ('binned', binned_pipeline, self.binned_features),
                    ('categorical', categorical_pipeline, self.categorical_features),
                    ('numeric', numeric_pipeline, self.features_to_keep),
                    ('ticket_count', TicketCountTransformer(), ['Ticket'])
                ],
                remainder='drop',
                verbose_feature_names_out=False
            )


    def fit(self, X, y):
        # Преобразуем признаки
        X_transformed = self.preprocessor.fit_transform(X)
        feature_names = self.preprocessor.get_feature_names_out()
        
        # Создаём DataFrame с преобразованными признаками
        df = pd.DataFrame(X_transformed, columns=feature_names)
        df['Survived'] = y.values
        
        # Инициализируем и обучаем байесовскую сеть
        self.model = BayesianNetwork(self.network_structure)
        self.model.fit(df, estimator=BayesianEstimator, prior_type="dirichlet", pseudo_counts=3)
        
        # Инициализируем инструмент для вывода
        self.is_fitted_ = True
        self.inference = VariableElimination(self.model)
        return self

        
    def predict(self, X):
        # Преобразуем признаки
        X_transformed = self.preprocessor.transform(X)
        feature_names = self.preprocessor.get_feature_names_out()
        df = pd.DataFrame(X_transformed, columns=feature_names)
        # Выполняем предсказание для каждого образца
        predictions = []
        for _, row in df.iterrows():
            evidence = row.to_dict()
            query = self.inference.map_query(variables=['Survived'], evidence=evidence)
            predictions.append(query['Survived'])
        
        return np.array(predictions)

    
    def predict_proba(self, X):
        X_transformed = self.preprocessor.transform(X)
        feature_names = self.preprocessor.get_feature_names_out()
        df = pd.DataFrame(X_transformed, columns=feature_names)
        
        probas = []
        not_probas = []
        for _, row in df.iterrows():
            evidence = row.to_dict()
            try:
                posterior = self.inference.query(variables=['Survived'], evidence=evidence)
                probas.append(posterior.values[0])
                not_probas.append(posterior.values[1])
            except:
                probas.append(0)
                not_probas.append(1)
        return np.vstack([np.array(probas), np.array(not_probas)]).T


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
t = IsotonicallyCalibratedModel(base_estimator=BayesianSurvivalClassifier(), cv=5)
t.fit(X_train, pd.DataFrame({'Survived':y_train}))

<__main__.IsotonicallyCalibratedModel at 0x17849cd6580>

In [None]:
balanced_accuracy_score(t.predict(X_test), y_test)

0.7827284105131413

0.7827284105131413