## Machine Learning
### Regressão Logística
#### Prof. Neylson Crepalde
#### Lucas Cesar Fernandes Ferreira

---

#### Vamos estimar modelos de regressão logística para entender as chances de sobrevivência no Titanic a partir de atributos disponíveis.

In [14]:
# Importando os pacotes necessários
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from statsmodels.discrete.discrete_model import Logit
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [40]:
# Load the Titanic training set from GitHub and preview the first rows
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/rebeccabilbro/titanic/master/data/train.csv'
bd = pd.read_csv(TITANIC_CSV_URL)
bd.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Inspect the dtype pandas inferred for each column
bd.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [41]:
# Store passenger class as a categorical column instead of an integer one
bd = bd.astype({'Pclass': 'category'})

In [42]:
# Re-check the dtypes after the Pclass conversion
bd.dtypes

PassengerId       int64
Survived          int64
Pclass         category
Name             object
Sex              object
Age             float64
SibSp             int64
Parch             int64
Ticket           object
Fare            float64
Cabin            object
Embarked         object
dtype: object

In [43]:
# Preprocessing: one-hot encode the categorical columns, dropping the first
# level of each so the dummies are not collinear with the intercept column
# that is added to the design matrix later on.
# dtype is pinned to uint8 because pandas >= 2.0 emits boolean dummies by
# default; a design matrix mixing bool and float columns is cast to object
# dtype, which statsmodels' Logit rejects.
# NOTE(review): this also encodes Name, Ticket and Cabin (about 1.7k columns),
# although only a handful of columns are kept afterwards.
bd1 = pd.get_dummies(bd, drop_first=True, dtype='uint8')
bd1.columns

Index(['PassengerId', 'Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass_2',
       'Pclass_3', 'Name_Abbott, Mr. Rossmore Edward',
       'Name_Abbott, Mrs. Stanton (Rosa Hunt)',
       ...
       'Cabin_F G63', 'Cabin_F G73', 'Cabin_F2', 'Cabin_F33', 'Cabin_F38',
       'Cabin_F4', 'Cabin_G6', 'Cabin_T', 'Embarked_Q', 'Embarked_S'],
      dtype='object', length=1727)

In [44]:
# Keep only the modelling columns, then discard rows with missing values
model_cols = ['Age', 'Sex_male', 'Pclass_2', 'Pclass_3', 'Survived']
bd1 = bd1[model_cols].dropna()
bd1.shape

(714, 5)

In [52]:
# Build the response vector and the design matrix for estimation
#   y: Survived
#   X: Age, Sex, Pclass
feature_cols = ['Age', 'Sex_male', 'Pclass_2', 'Pclass_3']
y = bd1['Survived']
X = bd1[feature_cols]
X.shape

(714, 4)

In [53]:
# Add a constant column so the model estimates an intercept.
# assign() returns a new frame; writing the column directly into X, which is
# a selection taken from bd1, can raise pandas' SettingWithCopyWarning.
X = X.assign(intercept=1)
X.shape

(714, 5)

In [54]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [55]:
# Specify the logistic regression on the training split, then fit it
logit_spec = Logit(y_train, X_train)
model_v1 = logit_spec.fit()

Optimization terminated successfully.
         Current function value: 0.462052
         Iterations 6


In [56]:
# Print the text rendering of the fitted model's summary table
print(model_v1.summary2())

                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.320     
Dependent Variable: Survived         AIC:              471.1283  
Date:               2019-03-25 20:17 BIC:              492.1913  
No. Observations:   499              Log-Likelihood:   -230.56   
Df Model:           4                LL-Null:          -339.28   
Df Residuals:       494              LLR p-value:      6.7113e-46
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     6.0000                                       
------------------------------------------------------------------
              Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
------------------------------------------------------------------
Age          -0.0316    0.0090   -3.5110  0.0004  -0.0492  -0.0139
Sex_male     -2.5629    0.2455  -10.4413  0.0000  -3.0440  -2.0818
Pclass_2     -1.2833    0.3271   -3.9229  0.0001  -1.9245  -0.6422
Pclass_3     -2.3819    0.3261

In [58]:
# Exponentiate the fitted coefficients to read them as odds ratios
np.exp(model_v1.params)

Age           0.968918
Sex_male      0.077081
Pclass_2      0.277114
Pclass_3      0.092376
intercept    36.844502
dtype: float64

In [59]:
def get_oddsperc(x):
    """Convert log-odds coefficients into percentage changes in the odds.

    Parameters
    ----------
    x : scalar, array-like or pandas Series
        Coefficient(s) on the log-odds scale.

    Returns
    -------
    Same kind of object numpy returns for ``x``
        ``(exp(x) - 1) * 100`` computed element-wise; a pandas Series keeps
        its index.
    """
    # expm1 evaluates exp(x) - 1 directly, avoiding the cancellation error the
    # explicit subtraction suffers when a coefficient is very close to zero.
    return np.expm1(x) * 100

In [60]:
# Express each fitted coefficient as a percentage change in the odds
get_oddsperc(model_v1.params)

Age            -3.108190
Sex_male      -92.291929
Pclass_2      -72.288630
Pclass_3      -90.762383
intercept    3584.450154
dtype: float64

In [66]:
# Predict my own probability of survival
# NOTE(review): the keys are listed in the same order as the columns of X;
# presumably that matters because the model was fit without a formula — confirm.
lucas_features = {'Age': 54, 'Sex_male': 1, 'Pclass_2': 0, 'Pclass_3': 1, 'intercept': 1}
lucas = pd.DataFrame(lucas_features, index=[0])
print(model_v1.predict(lucas))

0    0.045514
dtype: float64
