In [5]:
import pandas as pd
import requests

# !wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/m0c_logistic_regression.py

# Open our dataset from file
dataset = pd.read_csv("titanic.csv", index_col=False, sep=",", header=0)

# Fill missing cabin information with 'Unknown'.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on dataset["Cabin"] operates on an intermediate
# object, and pandas warns that this will no longer update `dataset` in 3.0.
dataset["Cabin"] = dataset["Cabin"].fillna("Unknown")

# Remove rows missing Age information
dataset.dropna(subset=["Age"], inplace=True)

# Remove the Name, PassengerId, and Ticket fields
# This is optional; it makes it easier to read our print-outs
dataset.drop(["PassengerId", "Name", "Ticket"], axis=1, inplace=True)

# Show the first rows of the cleaned frame as the cell output
dataset.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset["Cabin"].fillna("Unknown", inplace=True)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,Unknown,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,Unknown,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,Unknown,S


In [6]:
# url = "https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/m0c_logistic_regression.py"
# response = requests.get(url)

# # Save the content to a file
# with open("logistic_regression.py", "w") as f:
#     f.write(response.text)

## About Our Model

We'll use a model training type known as Logistic Regression, which predicts who survived the Titanic disaster.


For this exercise, you don't need to understand logistic regression. We placed the implementation outside this notebook in a method called `train_logistic_regression`. If you're curious, you can read this method in our GitHub repository.

The `train_logistic_regression` method:

1. Accepts our data frame and a list of features to include in the model.
2. Trains the model.
3. Returns a number that states how well the model performs as it predicts passenger survival. **Smaller numbers are better.**

## Numerical Only

Let's create a model that uses only the numerical features.

First, we'll use `Pclass` as an ordinal feature, rather than a one-hot encoded categorical feature.

In [9]:
# The training helper lives outside the notebook, in m0c_logistic_regression.py
from m0c_logistic_regression import train_logistic_regression

# Baseline model: numerical columns only. Pclass is passed as a single
# column here (ordinal), not as one-hot encoded columns.
features = ["Age", "Pclass", "SibSp", "Parch", "Fare"] 
loss_numerical_only = train_logistic_regression(dataset, features)

# Report the value returned by the helper for this feature set
print(f"Numerical-Only, Log-Loss (cost): {loss_numerical_only}")

Numerical-Only, Log-Loss (cost): 0.6121682789483452


We have our starting point. Let's see if categorical features will improve the model.

## Binary Categorical Features

Categorical features with only two potential values - for example, `0` or `1` - can be encoded in a single column.

We'll convert `Sex` values into `IsFemale` - a `0` for male and `1` for female - and include that in our model.

In [12]:
# Swap male / female with numerical values
# We can do this because there are only two categories.
# Series.map builds the new numeric column directly from the lookup table;
# the previous Series.replace call on this string column triggered a pandas
# warning (presumably the deprecation of implicit downcasting in replace).
dataset["IsFemale"] = dataset["Sex"].map({"male": 0, "female": 1})


# Run and test the model, also using IsFemale this time
features = ["Age", "Pclass", "SibSp", "Parch", "Fare", "IsFemale"]
loss_binary_categoricals = train_logistic_regression(dataset, features)



  dataset["IsFemale"] = dataset.Sex.replace({'male':0, 'female':1})


In [13]:
# Report the log-loss returned for the model that also uses IsFemale
print(f"\nNumerical + Sex, Log-Loss (cost): {loss_binary_categoricals}")


Numerical + Sex, Log-Loss (cost): 0.4707118586956134


Our loss (error) decreased! This model performs better than the previous model.

## One-Hot Encoding

Ticket class (`Pclass`) is an Ordinal feature. Its potential values (1, 2 & 3) have an order, and they have equal spacing. However, this even spacing might be incorrect. In stories about the Titanic, the third-class passengers were treated much worse than those in 1st and 2nd class.

Let's convert `Pclass` into a categorical feature with one-hot encoding:

In [15]:
# List the distinct values found in the "Pclass" column. The f-string is the
# cell's last expression, so the notebook displays it as the cell output.
f"Possible values for PClass: {dataset['Pclass'].unique()}"



'Possible values for PClass: [3 1 2]'

In [20]:
# One-hot encode Pclass with pandas. get_dummies swaps the Pclass column for
# Pclass_1 / Pclass_2 / Pclass_3, so the original column is appended again
# afterwards, for learning purposes.
dataset_with_one_hot = pd.get_dummies(
    dataset,
    columns=["Pclass"],
    drop_first=False,
    dtype=int,
).assign(Pclass=dataset["Pclass"])

# Show the first few rows as the cell output
dataset_with_one_hot.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,IsFemale,Pclass_1,Pclass_2,Pclass_3,Pclass
0,0,male,22.0,1,0,7.25,Unknown,S,0,0,0,1,3
1,1,female,38.0,1,0,71.2833,C85,C,1,1,0,0,1
2,1,female,26.0,0,0,7.925,Unknown,S,1,0,0,1,3
3,1,female,35.0,1,0,53.1,C123,S,1,1,0,0,1
4,0,male,35.0,0,0,8.05,Unknown,S,0,0,0,1,3


In [21]:
# Run and test the model, also using Pclass as a categorical feature this time:
# the single Pclass column is replaced by its three one-hot columns
features = ["Age", "SibSp", "Parch", "Fare", "IsFemale",
            "Pclass_1", "Pclass_2", "Pclass_3"]

# Train on the one-hot encoded frame, which holds the Pclass_* columns
loss_pclass_categorical = train_logistic_regression(dataset_with_one_hot, features)



In [22]:
# Report the log-loss returned for the model with one-hot encoded Pclass
print(f"\nNumerical, Sex, Categorical Pclass, Log-Loss (cost): {loss_pclass_categorical}")


Numerical, Sex, Categorical Pclass, Log-Loss (cost): 0.4717353503703535


This seems to have made things slightly worse!

Let's move on.


## Including Cabin

Recall that many passengers had `Cabin` information. `Cabin` is a categorical feature and should be a good predictor of survival, because people in lower cabins probably had little time to escape during the sinking.

Let's encode cabin using one-hot vectors, and include it in a model. This time, there are so many cabins that we won't print them all out. To practice printing them out, feel free to edit the code.

In [43]:
# One-hot encode both the Pclass and Cabin categories with pandas
dataset_with_one_hot = pd.get_dummies(
    dataset,
    columns=["Pclass", "Cabin"],
    drop_first=False,
    dtype=int,
)

# Collect the names of the generated one-hot columns for Cabin
cabin_column_names = [
    column
    for column in dataset_with_one_hot.columns
    if column.startswith("Cabin_")
]

# Print out how many cabins there were
print(len(cabin_column_names), "cabins found")



135 cabins found


In [44]:
# Display the name of every one-hot Cabin column (this is a long list)
cabin_column_names

['Cabin_A10',
 'Cabin_A16',
 'Cabin_A20',
 'Cabin_A23',
 'Cabin_A24',
 'Cabin_A26',
 'Cabin_A31',
 'Cabin_A34',
 'Cabin_A36',
 'Cabin_A5',
 'Cabin_A6',
 'Cabin_A7',
 'Cabin_B101',
 'Cabin_B18',
 'Cabin_B19',
 'Cabin_B20',
 'Cabin_B22',
 'Cabin_B28',
 'Cabin_B3',
 'Cabin_B30',
 'Cabin_B35',
 'Cabin_B37',
 'Cabin_B38',
 'Cabin_B39',
 'Cabin_B4',
 'Cabin_B41',
 'Cabin_B42',
 'Cabin_B49',
 'Cabin_B5',
 'Cabin_B50',
 'Cabin_B51 B53 B55',
 'Cabin_B57 B59 B63 B66',
 'Cabin_B58 B60',
 'Cabin_B69',
 'Cabin_B71',
 'Cabin_B73',
 'Cabin_B77',
 'Cabin_B79',
 'Cabin_B80',
 'Cabin_B82 B84',
 'Cabin_B86',
 'Cabin_B94',
 'Cabin_B96 B98',
 'Cabin_C101',
 'Cabin_C103',
 'Cabin_C104',
 'Cabin_C110',
 'Cabin_C111',
 'Cabin_C118',
 'Cabin_C123',
 'Cabin_C124',
 'Cabin_C125',
 'Cabin_C126',
 'Cabin_C148',
 'Cabin_C2',
 'Cabin_C22 C26',
 'Cabin_C23 C25 C27',
 'Cabin_C30',
 'Cabin_C32',
 'Cabin_C45',
 'Cabin_C46',
 'Cabin_C49',
 'Cabin_C50',
 'Cabin_C52',
 'Cabin_C54',
 'Cabin_C62 C64',
 'Cabin_C65',
 'Cabin_C

In [25]:
# Build the feature list: the features used so far, followed by every
# one-hot Cabin column
features = [
    "Age", "SibSp", "Parch", "Fare", "IsFemale",
    "Pclass_1", "Pclass_2", "Pclass_3",
    *cabin_column_names,
]

# Train on the one-hot encoded frame; the result is printed in the next cell
loss_cabin_categorical = train_logistic_regression(dataset_with_one_hot, features)



In [26]:
# Report the log-loss returned for the model that also uses the Cabin columns
print(f"\nNumerical, Sex, Categorical Pclass, Cabin, Log-Loss (cost): {loss_cabin_categorical}")


Numerical, Sex, Categorical Pclass, Cabin, Log-Loss (cost): 0.4600186822653977


## Improving Power

Including very large numbers of categorical classes (for example, 135 cabins) is often not the best way to train a model. This is because the model has only a few examples of each category class to learn from.

Sometimes, we can improve models if we simplify features. `Cabin` probably helped because it indicated the Titanic deck where people were probably located. Those in lower decks would have had their quarters flooded first. 

It might become simpler to use deck information, instead of categorizing people into Cabins. 


Let's simplify what we have run, replacing the 135 `Cabin` categories with a simpler `Deck` category that has only 9 values: A - G, T, and U (Unknown):


In [29]:
# We have cabin names, like A31, G45. The letter refers to the deck that
# the cabin was on. Extract just the deck and save it to a column.
# The .str accessor takes the first character of every value in one
# vectorised step; cabins filled in as "Unknown" therefore become deck "U".
dataset["Deck"] = dataset["Cabin"].str[0]


print("Decks: ", sorted(dataset.Deck.unique()))

Decks:  ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'U']


In [33]:
# Create one-hot vectors for:
# Pclass - the class of ticket. (This could be treated as ordinal or categorical)
# Deck - the deck that the cabin was on
# dtype=int produces 0/1 columns, consistent with the other get_dummies
# calls in this notebook.
dataset_with_one_hot = pd.get_dummies(dataset, columns=["Pclass", "Deck"], drop_first=False, dtype=int)

# Find the deck names
deck_of_cabin_column_names = [c for c in dataset_with_one_hot.columns if c.startswith("Deck_")]

In [34]:
# Display the names of the one-hot Deck columns
deck_of_cabin_column_names

['Deck_A',
 'Deck_B',
 'Deck_C',
 'Deck_D',
 'Deck_E',
 'Deck_F',
 'Deck_G',
 'Deck_T',
 'Deck_U']

In [35]:
# Feature list, grouped by origin: the numerical/binary columns, then the
# one-hot Pclass columns, then the one-hot Deck columns
features = (
    ["Age", "IsFemale", "SibSp", "Parch", "Fare"]
    + ["Pclass_1", "Pclass_2", "Pclass_3"]
    + ["Deck_A", "Deck_B", "Deck_C", "Deck_D",
       "Deck_E", "Deck_F", "Deck_G", "Deck_U", "Deck_T"]
)

# Train with Deck standing in for the individual Cabin columns
loss_deck = train_logistic_regression(dataset_with_one_hot, features)

print(f"\nSimplifying Cabin Into Deck, Log-Loss (cost): {loss_deck}")


Simplifying Cabin Into Deck, Log-Loss (cost): 0.4588208415238718


In [36]:
# Use a dataframe to create a comparison table of metrics.
# Each row pairs a model description with the log-loss computed for it in
# the cells above.
loss_comparison = [
    ["Numeric Features Only", loss_numerical_only],
    ["Adding Sex as Binary", loss_binary_categoricals],
    ["Treating Pclass as Categorical", loss_pclass_categorical],
    ["Using Cabin as Categorical", loss_cabin_categorical],
    ["Using Deck rather than Cabin", loss_deck],
]

# pandas is imported as `pd` in this notebook; the bare name `pandas` is not
# defined and would raise NameError on a fresh kernel.
pd.DataFrame(loss_comparison, columns=["Dataset", "Log-Loss (Low is better)"])

Unnamed: 0,Dataset,Log-Loss (Low is better)
0,Numeric Features Only,0.612168
1,Adding Sex as Binary,0.470712
2,Treating Pclass as Categorical,0.471735
3,Using Cabin as Categorical,0.460019
4,Using Deck rather than Cabin,0.458821
