In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import re

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)


In [19]:
def get_title(name):
    title_list = ["Mr.", "Mrs.", "Miss.", "Master.", "Don.", "Rev.", "Dr.", "Mme.", "Ms.", 
        "Major.", "Lady.", "Sir.", "Mlle.", "Col.", "Capt.", "Countess.", "Jonkheer.", "Dona."]
    
    for title in title_list:
        if title in name:
            return title
    return None

In [20]:
def get_floor(cabin):
    """Returns a tuple of cabin floor and the number of the cabin"""
    if cabin == "null": return ["null",0]

    data = [re.findall("[a-zA-Z]+",cabin)[-1]]

    num = re.findall("\d+",cabin)
    if num:
        data.append(int(num[-1]))
    else:
        data.append(0)

    return data

    

In [21]:
def process(df):
    
    ## Filling null values. Starting with Cabin feature, then age and embarked
    df["Cabin"].fillna("null", inplace=True)

    ## Filled age with 0 as the coefficient multiplying with 0 is going to have no effect, so as
    ## to make it so the age is not known, it won't impact the predictions (it might though)
    df["Age"].fillna(0,inplace=True)
    df["Embarked"].fillna("null",inplace=True)
    df["Fare"].fillna(0,inplace=True)

    ## Adding features to split up cabin into discrete and continuous features
    result_series = df["Cabin"].apply(lambda cabin: get_floor(cabin))
    df["Cabin Floor"] = result_series.str[0]
    df["Cabin Room"] = result_series.str[1]
    
    ## Going to add a new feature to represent the title of people, there might be an 
    ## important detail to predict survavibility
    df["Title"] = df["Name"].apply(lambda name: get_title(name))

    ## Dropping name afterwards as it is not going to help us
    df.drop("Cabin", axis=1, inplace=True)
    df.drop("Name", axis=1, inplace=True)
    df.drop(["PassengerId", "Ticket"], axis=1, inplace=True)

    
    return df

In [22]:
df_train = process(df_train)
df_test = process(df_test)


In [23]:
df_train.loc[df_train.isnull().any(axis=1),:]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin Floor,Cabin Room,Title


In [24]:
y_train = df_train["Survived"]
x_train = df_train.drop("Survived",axis=1)
x_train = pd.get_dummies(x_train, dtype=int)

x_test = pd.get_dummies(df_test, dtype=int)
x_test


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin Room,Sex_female,Sex_male,Embarked_C,Embarked_Q,...,Cabin Floor_null,Title_Col.,Title_Dona.,Title_Dr.,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_Ms.,Title_Rev.
0,3,34.5,0,0,7.8292,0,0,1,0,1,...,1,0,0,0,0,0,1,0,0,0
1,3,47.0,1,0,7.0000,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,2,62.0,0,0,9.6875,0,0,1,0,1,...,1,0,0,0,0,0,1,0,0,0
3,3,27.0,0,0,8.6625,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
4,3,22.0,1,1,12.2875,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0.0,0,0,8.0500,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
414,1,39.0,0,0,108.9000,105,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
415,3,38.5,0,0,7.2500,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
416,3,0.0,0,0,8.0500,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0


In [25]:
x_train.columns.difference(x_test.columns)

Index(['Cabin Floor_T', 'Embarked_null', 'Title_Capt.', 'Title_Countess.',
       'Title_Don.', 'Title_Jonkheer.', 'Title_Lady.', 'Title_Major.',
       'Title_Mlle.', 'Title_Mme.', 'Title_Sir.'],
      dtype='object')

In [29]:
x_train.


33858

In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)
pred = model.predict(x_test)