# Novos Desafios de Machine Learning

## Carrega Bibliotecas

In [1]:
## importar as librarys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

## Carrega Dataset

In [2]:
df = pd.read_csv('titanic.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [4]:
df.nunique().sort_values()

Survived         2
Sex              2
Pclass           3
Embarked         3
SibSp            7
Parch            8
Cabin           76
Age             79
Fare           169
Ticket         363
PassengerId    418
Name           418
dtype: int64

Algumas variáveis não tem poder preditivo algum, são irrelevantes para a predição do modelo.

In [5]:
df = df.drop(
    columns=['Cabin',
            'Ticket',
            'PassengerId',
            'Name'])

In [6]:
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [7]:
## transformando uma coluna em dummy

df.Sex = df.Sex.map({'male':0, 'female':1})

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,34.5,0,0,7.8292,Q
1,1,3,1,47.0,1,0,7.0,S
2,0,2,0,62.0,0,0,9.6875,Q
3,0,3,0,27.0,0,0,8.6625,S
4,1,3,1,22.0,1,1,12.2875,S


In [8]:
df.isna().mean()

Survived    0.000000
Pclass      0.000000
Sex         0.000000
Age         0.205742
SibSp       0.000000
Parch       0.000000
Fare        0.002392
Embarked    0.000000
dtype: float64

In [9]:
df.Fare.value_counts(1)

7.7500     0.050360
26.0000    0.045564
13.0000    0.040767
8.0500     0.040767
7.8958     0.026379
             ...   
7.8208     0.002398
8.5167     0.002398
78.8500    0.002398
52.0000    0.002398
22.3583    0.002398
Name: Fare, Length: 169, dtype: float64

## Modelo de Machine Learning

In [10]:
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [11]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

target = ['Survived']

In [12]:
X = df[features]

y = df[target]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [14]:
categorical = ['Pclass', 'SibSp', 'Parch', 'Embarked']

numerical = ['Age', 'Sex', 'Fare']

In [15]:
categorial_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ce.OneHotEncoder()),
])

numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

transformer = ColumnTransformer([
    ('categorical_transformer', categorial_pipe, categorical),
    ('numerical_transformer', numerical_pipe, numerical)
])

X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

In [16]:
logit = LogisticRegression()

In [17]:
logit.fit(X_train_transformed, y_train)

  y = column_or_1d(y, warn=True)


In [18]:
y_pred = logit.predict(X_test_transformed)

In [19]:
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1], dtype=int64)

In [20]:
y_test.values

array([[0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
    

In [21]:
from sklearn.metrics import(
    accuracy_score, 
    roc_auc_score, 
    f1_score, 
    precision_score,
    recall_score)

print(f'Acurácia: {accuracy_score(y_test, y_pred):.2f}')
print(f'ROC/AUC: {roc_auc_score(y_test, y_pred):.2f}')
print(f'F1-Score: {f1_score(y_test, y_pred):.2f}')
print(f'Precision: {precision_score(y_test, y_pred):.2f}')
print(f'Recall: {recall_score(y_test, y_pred):.2f}')

Acurácia: 1.00
ROC/AUC: 1.00
F1-Score: 1.00
Precision: 1.00
Recall: 1.00


Vamos lembrar de duas métricas:

Precision = TP/(TP+FP)

Recall = TP/(TP+FN)

## Checando a Probabilidade de Morte

In [22]:
## pegando os percentuais

logit.predict_proba(X_train_transformed)

array([[0.99206828, 0.00793172],
       [0.99128039, 0.00871961],
       [0.98974392, 0.01025608],
       [0.98884651, 0.01115349],
       [0.99212376, 0.00787624],
       [0.99252311, 0.00747689],
       [0.99089711, 0.00910289],
       [0.99020122, 0.00979878],
       [0.98978016, 0.01021984],
       [0.99250117, 0.00749883],
       [0.99251113, 0.00748887],
       [0.01554307, 0.98445693],
       [0.99198835, 0.00801165],
       [0.99098214, 0.00901786],
       [0.99026274, 0.00973726],
       [0.98969737, 0.01030263],
       [0.9925145 , 0.0074855 ],
       [0.99026274, 0.00973726],
       [0.99252702, 0.00747298],
       [0.9924974 , 0.0075026 ],
       [0.01968129, 0.98031871],
       [0.01849415, 0.98150585],
       [0.99251942, 0.00748058],
       [0.99211964, 0.00788036],
       [0.99211117, 0.00788883],
       [0.01740079, 0.98259921],
       [0.99250963, 0.00749037],
       [0.98966237, 0.01033763],
       [0.98940033, 0.01059967],
       [0.99183085, 0.00816915],
       [0.

Por definição, o Scikit-Learn considera que 50% a chance de morrer o valor mínimo, como nós queremos evitar morte a todo custo, então podemos alterar essa probabilidade padrão para quem sabe 30%.

In [25]:
y_pred = (logit.predict_proba(X_test_transformed)[:, 1] >= 0.3).astype('bool')

y_pred

array([False,  True, False, False,  True, False,  True, False,  True,
       False, False,  True,  True,  True, False, False,  True, False,
       False, False, False, False,  True, False,  True,  True,  True,
       False, False, False, False,  True,  True, False,  True, False,
        True, False,  True, False,  True,  True, False, False, False,
       False,  True,  True,  True, False, False,  True,  True, False,
       False,  True, False, False,  True, False, False, False,  True,
       False,  True, False, False, False, False, False, False, False,
       False,  True, False,  True, False,  True,  True, False, False,
        True, False,  True])

In [26]:
from sklearn.metrics import(
    accuracy_score, 
    roc_auc_score, 
    f1_score, 
    precision_score,
    recall_score)

print(f'Acurácia: {accuracy_score(y_test, y_pred):.2f}')
print(f'ROC/AUC: {roc_auc_score(y_test, y_pred):.2f}')
print(f'F1-Score: {f1_score(y_test, y_pred):.2f}')
print(f'Precision: {precision_score(y_test, y_pred):.2f}')
print(f'Recall: {recall_score(y_test, y_pred):.2f}')

Acurácia: 1.00
ROC/AUC: 1.00
F1-Score: 1.00
Precision: 1.00
Recall: 1.00
