In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
# Настройка визуализаций
%matplotlib inline
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Вывод всех столбцов в DataFrame
pd.set_option('display.max_columns', None)

# Игнорирование warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
df_train = pd.read_csv('C:/Users/Miguel/My_DS_Projects/Titanic-ML-Analysis/data/train.csv')
df_test = pd.read_csv('C:/Users/Miguel/My_DS_Projects/Titanic-ML-Analysis/data/test.csv')
test_original = df_test.copy()

In [3]:
df_train
# Надо дропнуть Имя (возможно потом вытащим статус мисс ,мисер и тд.) Номер пассажира, билет и кабину (много НаН значений)
# Так же порт посадки 2 НаН Не забудь про них

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
df_y_train = df_train['Survived']

In [5]:
df_train = df_train.drop(['PassengerId','Survived','Name','Ticket','Cabin'],axis=1)

In [6]:
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch']

In [7]:
df_train = df_train.drop(['SibSp','Parch'], axis=1)

In [8]:
df_train['Age'] = df_train.groupby(['Sex', 'Pclass'])['Age'].transform(
    lambda x: x.fillna(x.median())
)

In [9]:
df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0])

In [10]:
num_col = df_train.select_dtypes(include= ['float','int']).columns
cat_col = df_train.select_dtypes(include= ['object']).columns

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_train,df_y_train, test_size=0.2)

In [12]:
preproc = ColumnTransformer([
    ('Scaler', StandardScaler(),num_col),
    ('ohe', OneHotEncoder(drop = 'first'),cat_col)
])
pipeline = Pipeline([
    ('preproc', preproc),
    ('model', LogisticRegression())
])

In [13]:
pipeline.fit(X_train,y_train)

In [14]:
pipeline.score(X_train,y_train)

0.8047752808988764

In [15]:
y_pred = pipeline.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       436
           1       0.76      0.72      0.74       276

    accuracy                           0.80       712
   macro avg       0.80      0.79      0.79       712
weighted avg       0.80      0.80      0.80       712



In [16]:
y_pred_test = pipeline.predict(X_test)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85       113
           1       0.78      0.68      0.73        66

    accuracy                           0.81       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.81      0.81      0.81       179



In [17]:
df_test = df_test.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)

In [18]:
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch']
df_test = df_test.drop(['SibSp','Parch'], axis=1)
df_test['Age'] = df_test.groupby(['Sex', 'Pclass'])['Age'].transform(
    lambda x: x.fillna(x.median())
)
df_test['Embarked'] = df_test['Embarked'].fillna(df_test['Embarked'].mode()[0])

In [19]:
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

In [21]:
# Предсказание на финальном test.csv
df_pred_final = pipeline.predict(df_test)
original_test_ids = test_original['PassengerId'].copy()
# Статистика предсказаний
print("Распределение предсказаний для финального test.csv:")
print(f"Выжило: {df_pred_final.sum()} пассажиров")
print(f"Погибло: {len(df_pred_final) - df_pred_final.sum()} пассажиров") 
print(f"Процент выживших: {df_pred_final.mean():.1%}")

# Для портфолио можно сохранить предсказания
final_predictions = pd.DataFrame({
    'PassengerId': original_test_ids,  # ID из исходного df_test
    'Survived': df_pred_final
})

Распределение предсказаний для финального test.csv:
Выжило: 156 пассажиров
Погибло: 262 пассажиров
Процент выживших: 37.3%
