<h1 align="center">Predição de dados Titanic</h1>
<h3>Problema</h3>
<p>O problema consiste em identificar quantas pessoas sobreviveram ao naufragio do titanic fazendo previsoes e criar uma analise de dados para identificar todo o problema em volta do naufragio</p>
<br/>
<h3>Base de dados</h3>
<p>Os dados para analise sao os arquivos train.csv e test.csv</p>

In [375]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

In [377]:
#define variable and read data
train = pd.read_csv('./train.csv')

#select target variable
target = train['Survived']

#select predictors
train = train.drop(['Survived'], axis=1)

#transform sex male = 1 and female = 0
sex = pd.Series(np.where(train.Sex=='male', 1, 0), name='Sex')
train.drop(['Sex'], axis=1, inplace=True)
train = pd.concat([train, sex], axis=1)
#train = train.select_dtypes(exclude=['object'])
#train.isnull().sum()
#train.info()

In [379]:
#define train tes split
X_train, X_valid, y_train, y_valid = train_test_split(train, target, train_size=0.7, test_size=0.3, random_state=0)

In [381]:
# Select categorical columns
categorical = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and X_train[cname].dtype == 'object']

# Select numerical columns
numerical = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

In [383]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical),
        ('cat', categorical_transformer, categorical)
        
    ])

In [385]:
model = RandomForestClassifier(n_estimators=160, random_state=200)
# define pipeline and cross value and RandomForestClassifier
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])
#fit model
my_pipeline.fit(X_train, y_train)

# get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

cv_scores = cross_val_score(my_pipeline, X_train, y_train, cv=30, scoring='accuracy')
print("Cross validation accuracy: %f" % cv_scores.mean())

MAE: 0.26492537313432835
Cross validation accuracy: 0.711190


In [393]:
# new predict
titanic_pred = pd.read_csv('./test.csv')
result = my_pipeline.predict(titanic_pred)
result

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,