In [107]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from skopt import gp_minimize

In [108]:
train = pd.read_csv('../data/titanic/train.csv')
test = pd.read_csv('../data/titanic/test.csv')

In [109]:
train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [201]:
X_train = train.drop(['PassengerId', 'Survived'], axis=1)
X_test = test.drop(['PassengerId'], axis=1)

## Features

In [202]:
def create_features(X):
    subs = {'female': 1, 'male': 0}
    X['Woman'] = X['Sex'].replace(subs)

    X['Age'] = X['Age'].fillna(X['Age'].mean())

    X['Fare'] = X['Fare'].fillna(X['Fare'].mean())

    X['Embarked'] = X['Embarked'].fillna('S')

    subs = {'S': 1, 'C': 2, 'Q':3}
    X['Harbor'] = X['Embarked'].replace(subs)

    def isVip(fare):
        if int(fare) >= 20:
            return 1
        return 0
    
    X['Vip'] = X['Fare'].apply(isVip)

    X['Kid'] = 1
    X['Kid'] = np.where(X['Age'] < 12, 1, 0)

    def getNameId(name):
        name_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6, 'Major': 7, 'Col': 7, 'Mlle': 8, 'Mme': 8, 'Don': 9, 'Lady': 10, 'Countess': 10, 'Jonkheer': 10, 'Sir': 9, 'Capt': 7, 'Ms': 2}

        for n in name.split(' '):
            for key, value in name_mapping.items():
                if key in n:
                    return value
        return 0

    X['Title'] = X['Name'].apply(getNameId)

    return X
    
X_train = create_features(X_train)
X_test = create_features(X_test)

  X['Woman'] = X['Sex'].replace(subs)
  X['Harbor'] = X['Embarked'].replace(subs)
  X['Woman'] = X['Sex'].replace(subs)
  X['Harbor'] = X['Embarked'].replace(subs)


In [203]:
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Woman', 'Harbor', 'Kid', 'Title', 'Vip']

X_train = X_train[features]
X_test = X_test[features]

y_train = train['Survived']

## Padronizando os dados

In [204]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Aplicando o Random Forest

In [205]:
model = RandomForestClassifier(criterion='entropy', n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1, random_state=0)

score = cross_val_score(model, X_train, y_train, cv=10)

score = np.mean(score)

f'Model RF score: {score.round(4)}%'

'Model RF score: 0.8429%'

## Modelo final

In [190]:
model.fit(X_train, y_train)

y_pred = model.predict(X_train)

ms = confusion_matrix(y_train, y_pred)

print(f'Matriz de confusão: {ms}')

score = model.score(X_train, y_train)

print(f'Score: {score}')

Matriz de confusão: [[538  11]
 [ 50 292]]
Score: 0.9315375982042648


In [191]:
y_pred = model.predict(X_test)

submission = pd.DataFrame(test['PassengerId'])
submission['Survived'] = y_pred

submission.to_csv('submission3.csv', index=False)