## Importing necessary libraries

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import metrics

## Importing dataset

In [11]:
# Load the combined Train + Test table produced by the EDA step (missing values already imputed);
# the first CSV column is used as the row index
raw_combined_path = "../Data/raw_combined.csv"
raw_combined = pd.read_csv(raw_combined_path, index_col=0)

In [15]:
# Preview the first five rows of the combined dataset
raw_combined.head(5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Train
0,22.0,NO,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,1
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,1
2,26.0,NO,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,1
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,1
4,35.0,NO,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,1


## Selecting predictors and target variable

In [18]:
# Candidate explanatory columns (one per line so later removals are easy to review)
predictors = [
    "Age",
    "Cabin",
    "Embarked",
    "Fare",
    "Parch",
    "Pclass",
    "Sex",
    "SibSp",
]
# Column holding the outcome to be modelled
target = ["Survived"]

## Encoding categorical variables

In [20]:
# Check the dtype of each candidate predictor: object-dtype (string) columns
# must be converted to numbers before they can be fed to the model
raw_combined.loc[:, predictors].dtypes

Age         float64
Cabin        object
Embarked     object
Fare        float64
Parch         int64
Pclass        int64
Sex          object
SibSp         int64
dtype: object

In [21]:
# Generate dataframe only with predictors.
# Take an explicit copy: the columns of this frame are overwritten by the encoding
# step, and writing into a bare slice of raw_combined triggers pandas'
# SettingWithCopyWarning (and makes it ambiguous whether raw_combined is modified).
label_df = raw_combined[predictors].copy()

In [23]:
# Initialize label encoder (a single instance; it is re-fitted with fit_transform
# for each categorical column in the encoding step)
label_encoder = LabelEncoder()

In [24]:
# Encode only categorical (object-dtype) variables as integer codes.
# NOTE(review): LabelEncoder assigns arbitrary integer codes, so the regression formula
# will treat Cabin / Embarked / Sex as plain numeric columns - consider one-hot
# encoding for the multi-level ones.
categorical_cols = label_df.select_dtypes(include="object").columns
# Build a new frame with assign() instead of writing column-by-column into label_df:
# this never writes into a slice of raw_combined (no SettingWithCopyWarning) and the
# cell is safe to re-run (a second run simply finds no object columns left).
label_df = label_df.assign(
    **{col: label_encoder.fit_transform(label_df[col]) for col in categorical_cols}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [25]:
# Explore resulting df: Cabin, Embarked and Sex now hold integer codes
# instead of their original string labels
label_df

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,Pclass,Sex,SibSp
0,22.0,185,3,7.2500,0,3,1,1
1,38.0,106,0,71.2833,0,1,0,1
2,26.0,185,3,7.9250,0,3,0,0
3,35.0,70,3,53.1000,0,1,0,1
4,35.0,185,3,8.0500,0,3,1,0
...,...,...,...,...,...,...,...,...
413,28.0,185,3,8.0500,0,3,1,0
414,39.0,64,0,108.9000,0,1,0,0
415,38.5,185,3,7.2500,0,3,1,0
416,28.0,185,3,8.0500,0,3,1,0


## Splitting into Train and Test

In [29]:
# Hold out 20% of the rows flagged Train == 1 as an internal test set;
# the fixed random_state keeps the split reproducible
is_train_row = raw_combined["Train"] == 1
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    label_df.loc[is_train_row],
    raw_combined.loc[is_train_row, "Survived"],
    test_size=0.2,
    random_state=232323,
)

## Modelling

In [31]:
# Recall the candidate predictors that enter the full model
predictors

['Age', 'Cabin', 'Embarked', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp']

### Full Model

In [43]:
# Assemble the modelling frame: target and encoded predictors side by side
train_frame = pd.concat([y_train, x_train], axis=1)
# Specify the full model with all eight candidate predictors.
# NOTE(review): smf.ols is ordinary least squares on the 0/1 target (a linear
# probability model), not a logistic regression, despite the `log_reg` names;
# smf.logit would be the logistic counterpart - confirm which is intended.
log_reg = smf.ols(
    formula="Survived ~ Age + Cabin + Embarked + Fare + Parch + Pclass + Sex + SibSp",
    data=train_frame,
)
# Estimate the coefficients
log_reg_fit = log_reg.fit()

In [44]:
# Print the fitted model's summary table (fit statistics and coefficient estimates)
print(log_reg_fit.summary())

                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.403
Model:                            OLS   Adj. R-squared:                  0.396
Method:                 Least Squares   F-statistic:                     59.20
Date:                Tue, 07 Apr 2020   Prob (F-statistic):           1.16e-73
Time:                        15:25:25   Log-Likelihood:                -316.80
No. Observations:                 712   AIC:                             651.6
Df Residuals:                     703   BIC:                             692.7
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3725      0.094     14.658      0.0

**Observations:**
* This first, full model shows a very low R-squared (0.403)
* Several variables appear to be insignificant to the model (high p value), such as Cabin, Embarked, Fare, and Parch

#### Evaluating the model in Train Set

In [52]:
# Score the training rows with the fitted model (continuous values)
train_scores = log_reg_fit.predict(x_train)
# Threshold at 0.5: scores below become 0, scores at or above become 1
y_hat_train = train_scores.mask(train_scores < 0.5, 0).mask(train_scores >= 0.5, 1)
# Per-class precision / recall / F1 on the Train split
print(metrics.classification_report(y_train, y_hat_train))

              precision    recall  f1-score   support

         0.0       0.82      0.86      0.84       432
         1.0       0.76      0.71      0.74       280

    accuracy                           0.80       712
   macro avg       0.79      0.78      0.79       712
weighted avg       0.80      0.80      0.80       712



#### Evaluating the model in Test

In [53]:
# Score the held-out rows with the fitted model (continuous values)
test_scores = log_reg_fit.predict(x_test)
# Threshold at 0.5: scores below become 0, scores at or above become 1
y_hat_test = test_scores.mask(test_scores < 0.5, 0).mask(test_scores >= 0.5, 1)
# Per-class precision / recall / F1 on the Test split
print(metrics.classification_report(y_test, y_hat_test))

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85       117
         1.0       0.71      0.71      0.71        62

    accuracy                           0.80       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



### Removing Cabin from predictors

In [57]:
# Assemble the modelling frame: target and encoded predictors side by side
train_frame = pd.concat([y_train, x_train], axis=1)
# Specify the model without Cabin (note: smf.ols is ordinary least squares,
# not a logistic regression, despite the `log_reg` names)
log_reg = smf.ols(
    formula="Survived ~ Age + Embarked + Fare + Parch + Pclass + Sex + SibSp",
    data=train_frame,
)
# Estimate the coefficients and show the summary table
log_reg_fit = log_reg.fit()
print(log_reg_fit.summary())

                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.403
Model:                            OLS   Adj. R-squared:                  0.397
Method:                 Least Squares   F-statistic:                     67.75
Date:                Tue, 07 Apr 2020   Prob (F-statistic):           1.36e-74
Time:                        15:32:05   Log-Likelihood:                -316.81
No. Observations:                 712   AIC:                             649.6
Df Residuals:                     704   BIC:                             686.2
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3677      0.082     16.676      0.0

**Observations:**
* AIC dropped, meaning this is a better model than the last one
* But R squared is still very low

#### Evaluating the model in Train Set

In [58]:
# Score the training rows with the fitted model (continuous values)
train_scores = log_reg_fit.predict(x_train)
# Threshold at 0.5: scores below become 0, scores at or above become 1
y_hat_train = train_scores.mask(train_scores < 0.5, 0).mask(train_scores >= 0.5, 1)
# Per-class precision / recall / F1 on the Train split
print(metrics.classification_report(y_train, y_hat_train))

              precision    recall  f1-score   support

         0.0       0.82      0.86      0.84       432
         1.0       0.76      0.71      0.74       280

    accuracy                           0.80       712
   macro avg       0.79      0.78      0.79       712
weighted avg       0.80      0.80      0.80       712



#### Evaluating the model in Test Set

In [59]:
# Score the held-out rows with the fitted model (continuous values)
test_scores = log_reg_fit.predict(x_test)
# Threshold at 0.5: scores below become 0, scores at or above become 1
y_hat_test = test_scores.mask(test_scores < 0.5, 0).mask(test_scores >= 0.5, 1)
# Per-class precision / recall / F1 on the Test split
print(metrics.classification_report(y_test, y_hat_test))

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85       117
         1.0       0.71      0.71      0.71        62

    accuracy                           0.80       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



### Removing Parch from predictors

In [60]:
# Assemble the modelling frame: target and encoded predictors side by side
train_frame = pd.concat([y_train, x_train], axis=1)
# Specify the model without Cabin and Parch (note: smf.ols is ordinary least
# squares, not a logistic regression, despite the `log_reg` names)
log_reg = smf.ols(
    formula="Survived ~ Age + Embarked + Fare + Pclass + Sex + SibSp",
    data=train_frame,
)
# Estimate the coefficients and show the summary table
log_reg_fit = log_reg.fit()
print(log_reg_fit.summary())

                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.402
Model:                            OLS   Adj. R-squared:                  0.397
Method:                 Least Squares   F-statistic:                     79.13
Date:                Tue, 07 Apr 2020   Prob (F-statistic):           1.54e-75
Time:                        15:32:27   Log-Likelihood:                -316.86
No. Observations:                 712   AIC:                             647.7
Df Residuals:                     705   BIC:                             679.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3656      0.082     16.710      0.0

**Observations:**
* AIC dropped again, meaning this is a better model than the last one
* But R squared also decreased slightly

#### Evaluating the model in Train Set

In [61]:
# Score the training rows with the fitted model (continuous values)
train_scores = log_reg_fit.predict(x_train)
# Threshold at 0.5: scores below become 0, scores at or above become 1
y_hat_train = train_scores.mask(train_scores < 0.5, 0).mask(train_scores >= 0.5, 1)
# Per-class precision / recall / F1 on the Train split
print(metrics.classification_report(y_train, y_hat_train))

              precision    recall  f1-score   support

         0.0       0.82      0.85      0.84       432
         1.0       0.76      0.71      0.74       280

    accuracy                           0.80       712
   macro avg       0.79      0.78      0.79       712
weighted avg       0.80      0.80      0.80       712



#### Evaluating the model in Test Set

In [63]:
# Score the held-out rows with the fitted model (continuous values)
test_scores = log_reg_fit.predict(x_test)
# Threshold at 0.5: scores below become 0, scores at or above become 1
y_hat_test = test_scores.mask(test_scores < 0.5, 0).mask(test_scores >= 0.5, 1)
# Per-class precision / recall / F1 on the Test split
print(metrics.classification_report(y_test, y_hat_test))

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85       117
         1.0       0.71      0.71      0.71        62

    accuracy                           0.80       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



### Removing Fare from predictors

In [65]:
# Assemble the modelling frame: target and encoded predictors side by side
train_frame = pd.concat([y_train, x_train], axis=1)
# Specify the model without Cabin, Parch and Fare (note: smf.ols is ordinary
# least squares, not a logistic regression, despite the `log_reg` names)
log_reg = smf.ols(
    formula="Survived ~ Age + Embarked + Pclass + Sex + SibSp",
    data=train_frame,
)
# Estimate the coefficients and show the summary table
log_reg_fit = log_reg.fit()
print(log_reg_fit.summary())

                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.402
Model:                            OLS   Adj. R-squared:                  0.397
Method:                 Least Squares   F-statistic:                     94.81
Date:                Tue, 07 Apr 2020   Prob (F-statistic):           2.29e-76
Time:                        15:38:56   Log-Likelihood:                -317.29
No. Observations:                 712   AIC:                             646.6
Df Residuals:                     706   BIC:                             674.0
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.4063      0.069     20.480      0.0

**Observations:**
* AIC dropped again, meaning this is a better model than the last one
* But R squared also decreased slightly

#### Evaluating the model in Train

In [66]:
# Score the training rows with the fitted model (continuous values)
train_scores = log_reg_fit.predict(x_train)
# Threshold at 0.5: scores below become 0, scores at or above become 1
y_hat_train = train_scores.mask(train_scores < 0.5, 0).mask(train_scores >= 0.5, 1)
# Per-class precision / recall / F1 on the Train split
print(metrics.classification_report(y_train, y_hat_train))

              precision    recall  f1-score   support

         0.0       0.82      0.86      0.84       432
         1.0       0.76      0.71      0.74       280

    accuracy                           0.80       712
   macro avg       0.79      0.78      0.79       712
weighted avg       0.80      0.80      0.80       712



#### Evaluating the model in Test

In [68]:
# Score the held-out rows with the fitted model (continuous values)
test_scores = log_reg_fit.predict(x_test)
# Threshold at 0.5: scores below become 0, scores at or above become 1
y_hat_test = test_scores.mask(test_scores < 0.5, 0).mask(test_scores >= 0.5, 1)
# Per-class precision / recall / F1 on the Test split
print(metrics.classification_report(y_test, y_hat_test))

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85       117
         1.0       0.71      0.73      0.72        62

    accuracy                           0.80       179
   macro avg       0.78      0.79      0.78       179
weighted avg       0.81      0.80      0.80       179



### Removing Embarked from predictors

In [69]:
# Assemble the modelling frame: target and encoded predictors side by side
train_frame = pd.concat([y_train, x_train], axis=1)
# Specify the model with only Age, Pclass, Sex and SibSp (note: smf.ols is ordinary
# least squares, not a logistic regression, despite the `log_reg` names)
log_reg = smf.ols(
    formula="Survived ~ Age + Pclass + Sex + SibSp",
    data=train_frame,
)
# Estimate the coefficients and show the summary table
log_reg_fit = log_reg.fit()
print(log_reg_fit.summary())

                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.399
Model:                            OLS   Adj. R-squared:                  0.396
Method:                 Least Squares   F-statistic:                     117.3
Date:                Tue, 07 Apr 2020   Prob (F-statistic):           9.93e-77
Time:                        15:41:07   Log-Likelihood:                -318.92
No. Observations:                 712   AIC:                             647.8
Df Residuals:                     707   BIC:                             670.7
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3786      0.067     20.565      0.0

**Observations:**
* AIC increased from the last model, meaning that even though Embarked is not significant at the 5% level, removing it from the model causes more "damage" than keeping it
* Additionally, R-squared dropped to 0.399

### Choosing the final model

In [71]:
# Assemble the modelling frame: target and encoded predictors side by side
train_frame = pd.concat([y_train, x_train], axis=1)
# Re-fit the chosen specification (Age, Embarked, Pclass, Sex, SibSp) so that
# log_reg_fit holds the final model for the cells that follow.
# NOTE(review): smf.ols is ordinary least squares on the 0/1 target, not a logistic
# regression, despite the `log_reg` names - confirm which is intended.
log_reg = smf.ols(
    formula="Survived ~ Age + Embarked + Pclass + Sex + SibSp",
    data=train_frame,
)
# Estimate the coefficients and show the summary table
log_reg_fit = log_reg.fit()
print(log_reg_fit.summary())

                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.402
Model:                            OLS   Adj. R-squared:                  0.397
Method:                 Least Squares   F-statistic:                     94.81
Date:                Tue, 07 Apr 2020   Prob (F-statistic):           2.29e-76
Time:                        15:42:58   Log-Likelihood:                -317.29
No. Observations:                 712   AIC:                             646.6
Df Residuals:                     706   BIC:                             674.0
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.4063      0.069     20.480      0.0

### Final Model's performance

In [72]:
# Score the held-out rows with the final model (continuous values)
test_scores = log_reg_fit.predict(x_test)
# Threshold at 0.5: scores below become 0, scores at or above become 1
y_hat_test = test_scores.mask(test_scores < 0.5, 0).mask(test_scores >= 0.5, 1)
# Per-class precision / recall / F1 on the Test split
print(metrics.classification_report(y_test, y_hat_test))

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85       117
         1.0       0.71      0.73      0.72        62

    accuracy                           0.80       179
   macro avg       0.78      0.79      0.78       179
weighted avg       0.81      0.80      0.80       179



In [73]:
# Overall accuracy of the thresholded predictions on the held-out split
metrics.accuracy_score(y_test, y_hat_test)

0.8044692737430168

## Submitting scores

In [88]:
# Keep the encoded predictors of the rows flagged Train == 0 (the rows to be scored)
is_test_row = raw_combined["Train"] == 0
x = label_df.loc[is_test_row]

In [89]:
# Predict on Test Data: score the Train == 0 rows held in `x` with the
# most recently fitted model (continuous values, thresholded in the next step)
y_hat = log_reg_fit.predict(x)

In [90]:
# Threshold the continuous scores at 0.5: below becomes 0, at or above becomes 1
is_positive = y_hat >= 0.5
y_hat = y_hat.mask(y_hat < 0.5, 0).mask(is_positive, 1)

In [92]:
# Convert into a single-column dataframe named "Survived".
# Build it from the raw values plus the original index: passing a Series together
# with `columns=` makes pandas *select* columns by label when the Series has a name,
# which would silently yield an all-NaN "Survived" column if the name ever differed.
y_hat = pd.DataFrame(np.asarray(y_hat), index=y_hat.index, columns=["Survived"])

In [97]:
# Pair each Train == 0 passenger id with its predicted label (cast to integer)
test_passenger_ids = raw_combined.loc[raw_combined["Train"] == 0, "PassengerId"]
predicted_labels = y_hat.astype(int)
result = pd.concat([test_passenger_ids, predicted_labels], axis=1)

In [98]:
# Preview the first rows of the submission table (PassengerId, Survived)
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [99]:
# Save file: write the submission table to CSV without the index column.
# NOTE(review): the file name says "logistic_regression" but the fitted model
# comes from smf.ols - confirm the intended model / file name.
result.to_csv("../Submissions/logistic_regression.csv", index = False)