In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier 
# Настройка визуализаций
%matplotlib inline
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Вывод всех столбцов в DataFrame
pd.set_option('display.max_columns', None)

# Игнорирование warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
df_train = pd.read_csv('C:/Users/Miguel/My_DS_Projects/Titanic-ML-Analysis/data/train.csv')
df_test = pd.read_csv('C:/Users/Miguel/My_DS_Projects/Titanic-ML-Analysis/data/test.csv')
test_original = df_test.copy()
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df_y_train = df_train['Survived']

In [4]:
df_train = df_train.drop(['PassengerId','Survived','Name','Ticket','Cabin'],axis=1)
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch']
df_train = df_train.drop(['SibSp','Parch'], axis=1)
df_train['Age'] = df_train.groupby(['Sex', 'Pclass'])['Age'].transform(
    lambda x: x.fillna(x.median())
)
df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0])

In [5]:
num_col = df_train.select_dtypes(include= ['float','int']).columns
cat_col = df_train.select_dtypes(include= ['object']).columns

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_train,df_y_train, test_size=0.2)

In [7]:
preproc = ColumnTransformer([
    ('Scaler', StandardScaler(),num_col),
    ('ohe', OneHotEncoder(drop = 'first'),cat_col)
])
pipeline = Pipeline([
    ('preproc', preproc),
    ('model', RandomForestClassifier(
        n_estimators= 100,  # count trees
        max_depth= 10, 
        min_samples_split= 20,
        min_samples_leaf= 10,
        random_state= 42
    ))
])

In [8]:
pipeline.fit(X_train,y_train)

In [9]:
pipeline.score(X_train,y_train)

0.8539325842696629

In [10]:
y_pred = pipeline.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       440
           1       0.89      0.71      0.79       272

    accuracy                           0.85       712
   macro avg       0.86      0.83      0.84       712
weighted avg       0.86      0.85      0.85       712



In [11]:
y_pred_test = pipeline.predict(X_test)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       109
           1       0.85      0.63      0.72        70

    accuracy                           0.81       179
   macro avg       0.82      0.78      0.79       179
weighted avg       0.82      0.81      0.80       179



In [12]:
df_test = df_test.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)

In [13]:
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch']
df_test = df_test.drop(['SibSp','Parch'], axis=1)
df_test['Age'] = df_test.groupby(['Sex', 'Pclass'])['Age'].transform(
    lambda x: x.fillna(x.median())
)
df_test['Embarked'] = df_test['Embarked'].fillna(df_test['Embarked'].mode()[0])
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

In [14]:
df_pred_rf = pipeline.predict(df_test)
print("Random Forest - предсказания для финального test.csv:")
print(f"Выжило: {df_pred_rf.sum()} пассажиров")
print(f"Погибло: {len(df_pred_rf) - df_pred_rf.sum()} пассажиров")
print(f"Процент выживших: {df_pred_rf.mean():.1%}")

Random Forest - предсказания для финального test.csv:
Выжило: 129 пассажиров
Погибло: 289 пассажиров
Процент выживших: 30.9%
