In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from copy import copy as copy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

Data description:

    Survived - Survival (0 = No; 1 = Yes)
    Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
    Name - Name
    Sex - Sex
    Age - Age
    SibSp - Number of Siblings/Spouses Aboard
    Parch - Number of Parents/Children Aboard
    Ticket - Ticket Number
    Fare - Passenger Fare
    Cabin - Cabin
    Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

In [2]:
# Read the CSV-format passenger data into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="titanic_data.csv")
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


We assume:

Variables of interest: X = ['Pclass', 'Sex', 'Age', 'Parch', 'Fare', 'Embarked']
Target variable to be predicted: y = ['Survived']

Some values in the variables of interest cannot be used by the model as they are and must be modified:
1) NaN values must be either replaced or removed.
2) String values must be encoded as numerical values in new columns.


In [3]:
# Replace NaN values in the 'Age' column with the mean age.
# NOTE(review): the mean is taken over every row, including rows that are
# later held out as the test sample.
mean_Age = df["Age"].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on
# `df.Age`: an in-place call on a column selection is chained assignment,
# which triggers a FutureWarning in recent pandas and leaves `df` untouched
# when Copy-on-Write is enabled.
df["Age"] = df["Age"].fillna(mean_Age)

In [4]:
# Add numerically encoded copies of the string-valued 'Sex' and 'Embarked'
# columns. The same encoder object is re-fitted for each column in turn, so
# after this cell `le` holds the fit for 'Embarked' (the last column encoded).
le = LabelEncoder()
for encoded_col, raw_col in (('Sex_idx', "Sex"), ('Embarked_idx', "Embarked")):
    df[encoded_col] = le.fit_transform(df[raw_col])

In [5]:
# Rows whose original 'Embarked' value is missing. Using this mask avoids
# hard-coding the integer that the encoder happened to give to NaN.
embarked_missing = df["Embarked"].isna()

# Most frequent encoded port (mode), computed only over rows that actually
# have a port so the placeholder code for NaN cannot win.
mode_Embarked_idx = int(df.loc[~embarked_missing, "Embarked_idx"].mode().iloc[0])

# Overwrite the placeholder code of the missing rows with that mode.
# A .loc assignment on `df` itself is used instead of
# `df.Embarked_idx.replace(..., inplace=True)`: the latter is a chained
# in-place call that recent pandas warns about and that does not modify `df`
# when Copy-on-Write is enabled.
df.loc[embarked_missing, "Embarked_idx"] = mode_Embarked_idx

Now every variable of interest is correctly pre-processed and ready to be used by the model for prediction.

In [6]:
# Display the DataFrame after preprocessing, now including the imputed 'Age'
# values and the new 'Sex_idx' / 'Embarked_idx' encoded columns.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_idx,Embarked_idx
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,0,2
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S,1,2
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S,0,2
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,,S,0,2
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C,1,0


In [7]:
# Feature matrix: the pre-processed variables of interest.
X = df.loc[:, ['Pclass', 'Age', 'Parch', 'Fare', 'Sex_idx', 'Embarked_idx']]
# Target: kept as a one-column DataFrame (not a Series).
y = df.loc[:, ['Survived']]

# Randomly hold out 10% of the rows for testing; the remaining 90% are used
# for training. No seed is set, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [8]:
# Fit a logistic-regression classifier on the training sample.
# The target is handed over as a 1-D array of the 'Survived' column, since
# y_train itself is a one-column DataFrame.
model = LogisticRegression()
model.fit(X_train, y_train["Survived"].to_numpy())

In [9]:
# Put the model's predictions next to the *true* values, just for comparison.
# `assign` returns a new frame rather than writing a column into the frame
# handed back by train_test_split, which avoids SettingWithCopyWarning and
# keeps the cell safe to re-run.
y_test = y_test.assign(Survived_pred=model.predict(X_test))
y_test

Unnamed: 0,Survived,Survived_pred
566,0,0
338,1,0
660,1,0
605,0,0
735,0,0
...,...,...
492,0,0
93,0,0
271,1,0
267,1,0


In [10]:
# Mean accuracy of the fitted model on the training sample itself
# (labels passed as the 1-D 'Survived' column).
model.score(X_train, y_train["Survived"])

0.7927590511860175

In [11]:
# Accuracy on the test sample: the number of rows whose prediction equals the
# true label, divided by the number of test rows.
score_test = int((y_test["Survived_pred"] == y_test["Survived"]).sum()) / len(y_test)
score_test

0.8

The model is ~80% correct on the test sample. Of course, this percentage changes in every run because the training sample (90%) is chosen randomly.