In [1]:
import math
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Get data
df_train = pd.read_csv('titanic/train.csv')
df_test = pd.read_csv('titanic/test.csv')

# 1. Pre-processing

### Only include relevant columns
- Discarded PassengerId since it does not correlate with survival
- Discared Name since I don't believe someone's name would correlate with their survival
- Discarded Ticket because the data is messy, inconsistent, and incomplete
- Discared Cabin since the data is too sparse

### Remove rows where age does not exist
- Remove these rows because age is correlated with survival rate so we don't want to train on incomplete data samples

In [3]:
# remove columns with incomplete data for correlated features
df_train.drop(df_train[np.isnan(df_train['Age'])].index, inplace=True)

### Keep correlated numerical data
- Age is correlated with survival because children were prioritized for the lifeboats
- Fare is correlated with survival because wealthier individuals (who had a higher fare) were given priority on lifeboats

Additionally, we replace any missing values with the mean for that column

In [4]:
numerical_columns = ['Age', 'Fare']
numerical_data = df_train[numerical_columns]

# replace empty values with mean
for column in numerical_data:
    numerical_data[column].fillna(value=numerical_data[column].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical_data[column].fillna(value=numerical_data[column].mean(), inplace=True)


### Convert numerical features to categorical features
- We convert Pclass, SibSp, and Parch to categorical features since they are discrete values that only assume a limited number of values.  When we convert these numerical features to categorical features and one-hot encode them, they contribute more value in the model training

In [5]:
# convert discrete numerical features to categorical features
numerical_to_categorical_cols = ['Pclass', 'SibSp', 'Parch']
for col in numerical_to_categorical_cols:
    df_train[col] = df_train[col].astype('category',copy=False)

### One-hot encode categorical features
In addition to the numberical features we converted to categorical features above, we one-hot encode other relevant categorical features:

- Sex: this is important since women were given priority on lifeboats
- Embarked: this might have determined where their room was located on the boat (might have filled the boat from bottom to top or vice versa) and thus determined how close they were to lifeboats

In [6]:
# separate out the categorical data and one-hot encode
categorical_columns = ['Sex', 'Embarked'] + numerical_to_categorical_cols
categorical_data = df_train[categorical_columns]
categorical_data = pd.get_dummies(categorical_data)  # one-hot encode

In [7]:
# combine numerical and categorical data
X_train = pd.concat([numerical_data, categorical_data], axis=1)

# 2. Logistic Regression

In [8]:
y_train = df_train['Survived']
log_reg = LogisticRegression().fit(X_train, y_train)
print(log_reg.score(X_train, y_train))

0.8179271708683473


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# 3. Test

In [9]:
numerical_columns = ['Age', 'Fare']
numerical_data = df_test[numerical_columns]

# replace empty values with mean
for column in numerical_data:
    numerical_data[column].fillna(value=numerical_data[column].mean(), inplace=True)
    
# convert discrete numerical features to categorical features
numerical_to_categorical_cols = ['Pclass', 'SibSp', 'Parch']
for col in numerical_to_categorical_cols:
    df_test[col] = df_test[col].astype('category', copy=False)
    
# separate out the categorical data and one-hot encode
categorical_columns = ['Sex', 'Embarked'] + numerical_to_categorical_cols
categorical_data = df_test[categorical_columns]
categorical_data = pd.get_dummies(categorical_data)  # one-hot encode

# combine numerical and categorical data
X_test = pd.concat([numerical_data, categorical_data], axis=1)

# Remove one-hot encoded columns in testing dataset that are not in training dataset
X_test.drop(columns=[col for col in X_test if col not in X_train], inplace=True)

y_pred = log_reg.predict(X_test)
y_pred_final = pd.DataFrame(data = y_pred, index = df_test['PassengerId'], columns = ['Survived'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical_data[column].fillna(value=numerical_data[column].mean(), inplace=True)


In [10]:
# create CSV with predictions
y_pred_final.to_csv('titanic/predictions.csv')