In [1]:
!pip install --upgrade pip
# Install wittgenstein library
!pip install wittgenstein seaborn rulefit

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m83.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:03[0mm
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.2
    Uninstalling pip-24.2:
      Successfully uninstalled pip-24.2
Successfully installed pip-25.3


In [2]:
# Импорт необходимых библиотек
import pandas as pd
import numpy as np
import seaborn as sns
import wittgenstein as lw
from rulefit import RuleFit
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

# Загрузка датасета Titanic
df = sns.load_dataset('titanic')

# Удаление строк с пропущенными значениями
df = df.dropna()

del df['alive']

del df['alone']

# def df['adult_male']

def df['embark_town']

# Разделение признаков и целевой переменной
X = df.drop('survived', axis=1)
y = df['survived']

# Определение категориальных и числовых столбцов
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Преобразование категориальных признаков с помощью OneHotEncoder
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
    ],
    remainder='passthrough'  # Оставляем числовые столбцы без изменений
)

# Преобразование признаков
X_enc = column_transformer.fit_transform(X)
feature_names = column_transformer.get_feature_names_out()

# Преобразование в DataFrame
X_enc_df = pd.DataFrame(X_enc, columns=feature_names)

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(
    X_enc_df, y, test_size=0.2, stratify=y, random_state=42
)


SyntaxError: expected '(' (1448839717.py, line 23)

In [30]:
# Объединение признаков и целевой переменной для RIPPER
train_df = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
test_df = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

# Инициализация и обучение модели RIPPER
ripper_clf = lw.RIPPER(random_state=42)
ripper_clf.fit(train_df, class_feat='survived', pos_class=1)

# Предсказание на тестовой выборке
y_pred_ripper = ripper_clf.predict(test_df)

# Расчет метрик
auc_ripper = roc_auc_score(y_test, y_pred_ripper)
acc_ripper = accuracy_score(y_test, y_pred_ripper)
f1_ripper = f1_score(y_test, y_pred_ripper)

# Вывод правил
print("Правила RIPPER:\n")
print(ripper_clf.ruleset_)


Правила RIPPER:

[[cat__who_man=0.0^cat__deck_C=0.0^cat__class_Third=0.0] V [cat__who_man=0.0^remainder__parch=0]]


In [31]:
# Инициализация и обучение модели Дерева решений
dt_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_clf.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred_dt = dt_clf.predict(X_test)

# Расчет метрик
auc_dt = roc_auc_score(y_test, y_pred_dt)
acc_dt = accuracy_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

# Экспорт правил дерева решений
tree_rules = export_text(dt_clf, feature_names=list(X_train.columns))
print("Правила Дерева решений:\n")
print(tree_rules)


Правила Дерева решений:

|--- remainder__adult_male <= 0.50
|   |--- remainder__fare <= 10.48
|   |   |--- class: 0
|   |--- remainder__fare >  10.48
|   |   |--- remainder__fare <= 11.49
|   |   |   |--- class: 1
|   |   |--- remainder__fare >  11.49
|   |   |   |--- class: 1
|--- remainder__adult_male >  0.50
|   |--- remainder__age <= 43.50
|   |   |--- cat__deck_E <= 0.50
|   |   |   |--- class: 0
|   |   |--- cat__deck_E >  0.50
|   |   |   |--- class: 1
|   |--- remainder__age >  43.50
|   |   |--- remainder__age <= 47.50
|   |   |   |--- class: 0
|   |   |--- remainder__age >  47.50
|   |   |   |--- class: 0



In [32]:
# Инициализация и обучение модели RuleFit
rf_clf = RuleFit(
    tree_size=4,
    random_state=42,
    lin_standardise=False,
    #trim_quantile=0,
    rfmode='classify'
)
print()
rf_clf.fit(X_train.values, y_train.values, feature_names=X_train.columns)

# Предсказание на тестовой выборке
y_pred_rf = rf_clf.predict(X_test.values)

# Бинаризация предсказаний
y_pred_rf_binary = np.where(y_pred_rf > 0.5, 1, 0)

# Расчет метрик
auc_rf = roc_auc_score(y_test, y_pred_rf)
acc_rf = accuracy_score(y_test, y_pred_rf_binary)
f1_rf = f1_score(y_test, y_pred_rf_binary)

# Получение правил
rules = rf_clf.get_rules()
rules = rules[rules.coef != 0].sort_values(by="support", ascending=False)
print("Правила RuleFit:\n")
print(rules[['rule', 'coef', 'support']])




TypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.

In [34]:
X_train.values, y_train.values

(array([[1.0, 0.0, 1.0, ..., 0, 52.0, True],
        [0.0, 0.0, 1.0, ..., 0, 51.4792, False],
        [0.0, 0.0, 0.0, ..., 0, 76.7292, False],
        ...,
        [1.0, 0.0, 1.0, ..., 0, 38.5, True],
        [1.0, 0.0, 0.0, ..., 2, 110.8833, True],
        [0.0, 0.0, 0.0, ..., 1, 83.1583, False]], dtype=object),
 array([0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
        1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
        1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1]))

In [38]:
for [(list(X.values)[idx], idx) for idx, col_name in enumerate(X.columns)]

SyntaxError: cannot assign to list comprehension (2396622084.py, line 1)

In [41]:
X.values[0]

array([1, 'female', 38.0, 1, 0, 71.2833, 'C', 'First', 'woman', False,
       'C', 'Cherbourg'], dtype=object)

In [42]:
X.columns

Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class',
       'who', 'adult_male', 'deck', 'embark_town'],
      dtype='object')