![](https://img5tv.cdnvideo.ru/webp/shared/files/202104/1_1300734.jpg)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
titanic_train = pd.read_csv(r'/kaggle/input/titanic/train.csv')
titanic_test = pd.read_csv(r'/kaggle/input/titanic/test.csv')
y_test = pd.read_csv(r'/kaggle/input/titanic/gender_submission.csv')

## EDA

In [3]:
print((titanic_train.isna().mean() * 100).round(2))
print((titanic_test.isna().mean() * 100).round(2))

PassengerId     0.00
Survived        0.00
Pclass          0.00
Name            0.00
Sex             0.00
Age            19.87
SibSp           0.00
Parch           0.00
Ticket          0.00
Fare            0.00
Cabin          77.10
Embarked        0.22
dtype: float64
PassengerId     0.00
Pclass          0.00
Name            0.00
Sex             0.00
Age            20.57
SibSp           0.00
Parch           0.00
Ticket          0.00
Fare            0.24
Cabin          78.23
Embarked        0.00
dtype: float64


In [4]:
data = pd.concat([titanic_train, titanic_test], axis=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


## PREPARE DATA FOR MODELING 

In [5]:
print('Train:', data.groupby(['Sex', 'Pclass'])['Age'].median())
data['Age'] = data['Age'].fillna(data.groupby(['Sex', 'Pclass'])['Age'].transform('median'))

Train: Sex     Pclass
female  1         36.0
        2         28.0
        3         22.0
male    1         42.0
        2         29.5
        3         25.0
Name: Age, dtype: float64


In [6]:
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].median())

In [7]:
(data.isna().mean() * 100).round(2)

PassengerId     0.00
Survived       31.93
Pclass          0.00
Name            0.00
Sex             0.00
Age             0.00
SibSp           0.00
Parch           0.00
Ticket          0.00
Fare            0.00
Cabin          77.46
Embarked        0.00
dtype: float64

Let's remove the feature of the passenger's cabin, since there are 77% of gaps in this column, it is difficult to imagine an analytical task in which the cabin number can be important. 77% of passes is a lot.

In [8]:
black_list = ['PassengerId', 'Survived', 'Cabin'] # delete this features 
dummies = ['Sex','Embarked', 'Pclass','SibSp','Parch']
continious = ['Age', 'Ticket', 'Fare','Name']
target = ['Survived']

In [9]:
data['Name'] = data['Name'].apply(lambda x: x.split(', ')[1].split('. ')[0])

In [10]:
data['Name'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

data['Ticket'] = le.fit_transform(data['Ticket'])
data['Name'] = le.fit_transform(data['Name'])

In [12]:
X = pd.concat([data[continious],pd.get_dummies(data[dummies])],axis=1)

In [13]:
X_train = X[:891]
X_test = X[891:]
y_train = data[target]['Survived'].dropna()
y_test = y_test['Survived']

## MODEL TRANING

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [15]:
params = {'n_estimators':[1,5,10,20,50,100],
          'max_depth':[5,10,15,20,30],
          'min_samples_leaf':[1,5,10,20],
          'random_state':[42]}

y_train = y_train.astype(int)
rf_grid = GridSearchCV(RandomForestClassifier(),param_grid=params,verbose=1,n_jobs=-1,cv=3)
rf_grid.fit(X_train,y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [5, 10, 15, 20, 30],
                         'min_samples_leaf': [1, 5, 10, 20],
                         'n_estimators': [1, 5, 10, 20, 50, 100],
                         'random_state': [42]},
             verbose=1)

In [16]:
from sklearn.linear_model import LogisticRegression
params = {'penalty':['l2'],
          'max_iter':[100,500,1000],
          'multi_class':['multinomial'],
          'n_jobs':[-1],
          'C':[0.01,0.1,0.5,1.0],
          'solver':['newton-cg','sag','saga','lbfgs']}
lr_grid = GridSearchCV(LogisticRegression(),param_grid=params,verbose=1,n_jobs=-1,cv=3)
lr_grid.fit(X_train,y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 0.5, 1.0],
                         'max_iter': [100, 500, 1000],
                         'multi_class': ['multinomial'], 'n_jobs': [-1],
                         'penalty': ['l2'],
                         'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']},
             verbose=1)

In [17]:
rf_pred = rf_grid.predict(X_test)
lr_pred = lr_grid.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score, f1_score,precision_score,recall_score
rf_pred = rf_grid.predict(X_test)
lr_pred = lr_grid.predict(X_test)
print (f"ACCURACY  : RF={accuracy_score(y_test,rf_pred):.3f}, LR={accuracy_score(y_test,lr_pred):.3f}")
print (f"F1-SCORE  : RF={f1_score(y_test,rf_pred,average='macro'):.3f}, LR={f1_score(y_test,lr_pred,average='macro'):.3f}")
print (f"PRECISION : RF={precision_score(y_test,rf_pred,average='macro'):.3f}, LR={precision_score(y_test,lr_pred,average='macro'):.3f}")
print (f"RECALL    : RF={recall_score(y_test,rf_pred,average='macro'):.3f}, LR={recall_score(y_test,lr_pred,average='macro'):.3f}")

ACCURACY  : RF=0.861, LR=0.967
F1-SCORE  : RF=0.848, LR=0.964
PRECISION : RF=0.853, LR=0.961
RECALL    : RF=0.844, LR=0.968


Logistic regression algorithm shows better perfomance accors all metrics => choose it

In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,lr_pred)

array([[256,  10],
       [  4, 148]])

In [20]:
lr_pred = lr_pred.astype(int)
accuracy_score(y_test, lr_pred)*100

96.65071770334929

### MODEL QUALITY - 96.65