In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("Titanic.csv")

data_cleaned = data.dropna()

non_numeric_cols = data_cleaned.select_dtypes(include=['object']).columns
non_numeric_cols_to_drop = [col for col in non_numeric_cols if col not in ['Sex', 'Embarked']]
data_cleaned = data_cleaned.drop(columns=non_numeric_cols_to_drop)

data_cleaned['Sex'] = data_cleaned['Sex'].map({'male': 0, 'female': 1})
embarked_mapping = {'C': 1, 'Q': 2, 'S': 3}
data_cleaned['Embarked'] = data_cleaned['Embarked'].map(embarked_mapping)

if 'PassengerId' in data_cleaned.columns:
    data_cleaned = data_cleaned.drop(columns=['PassengerId'])

original_len = len(data)
cleaned_len = len(data_cleaned)
percent_lost = (original_len - cleaned_len) / original_len * 100

print(f"Процент потерянных данных: {percent_lost:.2f}%")

Процент потерянных данных: 79.46%


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X = data_cleaned.drop(columns=['Survived'])
y = data_cleaned['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Точность модели: {accuracy:.4f}")

Точность модели: 0.8649


In [3]:
# Удалим Embarked и обучим модель заново
X_no_embarked = X.drop(columns=['Embarked'])
X_train_ne, X_test_ne, y_train_ne, y_test_ne = train_test_split(X_no_embarked, y, test_size=0.2, random_state=0)

model_no_embarked = LogisticRegression(max_iter=1000)
model_no_embarked.fit(X_train_ne, y_train_ne)
y_pred_ne = model_no_embarked.predict(X_test_ne)

accuracy_ne = accuracy_score(y_test_ne, y_pred_ne)
print(f"Точность без Embarked: {accuracy_ne:.4f}")
print(f"Разница в точности: {accuracy - accuracy_ne:.4f}")

Точность без Embarked: 0.8649
Разница в точности: 0.0000
