In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from sklearn.decomposition import PCA
import joblib

# Initialize the data

In [9]:
def initialize_data():
    np.random.seed(101)


    # Initialize the data
    df = pd.read_csv("./titanic.csv")
    print(df["Fare"].min())
    print(df["Fare"].max())

    # df.set_index("PassengerId", inplace=True) # For debugging
    df.drop("PassengerId", axis=1, inplace=True)

    # Survived: Label
    # Pclass: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
    # Age: Age in years
    # Name: The name of the passenger
    # Sex: male/female
    # SibSp: no. of siblings / spouses aboard the Titanic
    # Parch: no. of parents / children aboard the Titanic
    # Ticket: Ticket number
    # Fare: Passenger fare
    # Cabin: Cabin number
    # Embarked: Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

    # Three columns have missing values: Age, Cabin, Embarked
    # Cabin has too many missing values to be useful, so we drop it -
    # df.drop("Cabin", axis=1, inplace=True)

    # Embarked has only 2 missing values so we can drop those rows
    df.dropna(subset=["Embarked"], inplace=True)

    # Age has 177 missing values, this is a lot, so we train a model to predict the age based on the other features
    # This model will be a simple linear regression model (see below)


    for row in df.itertuples():
        pass
        # Inspect the name column to extract the title of the passenger
        # This will be a new feature
        # name = row.Name
        # title = name.split(",")[1].split(".")[0].strip()
        # df.at[row.Index, "Title"] = title

        # Inspect the name column to extract the surname of the passenger
        # This will be a new feature
        # name = row.Name
        # surname = name.split(",")[0].strip()
        # df.at[row.Index, "Surname"] = surname

        # If the passenger has a cabin number, extract the cabin prefix
        # This will be a new feature
        # cabin = row.Cabin
        # if not pd.isnull(cabin):
        #     if len(cabin.split(" ")) > 1: # Some have multiple cabins => take the first one
        #         cabin = cabin.split(" ")[0]
        #         df.at[row.Index, "Cabin"] = 'Multiple Cabin'
        #     else:
        #         df.at[row.Index, "Cabin"] = 'Cabin'
            
        #     cabin_prefix = cabin[0]
        #     # df.at[row.Index, "CabinClass"] = cabin_prefix
        # else:
        #     # df.at[row.Index, "CabinClass"] = "XXX"
        #     df.at[row.Index, "Cabin"] = "No Cabin"

    # Now we can drop the Name column
    df.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)

    return df
    
df = initialize_data()

0.0
512.3292


## Fit an age prediciton model

In [3]:
def get_age_model(df):
  # Train a simple model to predict the age based on the other features
  # Use a random forest regressor to predict the age
  # Prepare the data
  dfAge = df.dropna(subset=["Age"])
  yage = dfAge["Age"]
  Xage = dfAge.drop(["Survived", "Age"], axis=1, inplace=True)

  # Convert the categorical features to numerical
  le = preprocessing.LabelEncoder()
  Xage = dfAge.apply(le.fit_transform)

  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(Xage, yage, test_size = 0.25)

  # Initialize the random forest regressor
  rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

  # Train the model
  rf.fit(X_train, y_train)

  # Test the model
  predictions = rf.predict(X_test)
  errors = abs(predictions - y_test)
  print('Mean Absolute Error:', round(np.mean(errors), 2), 'years.')
  
  return rf

age_model = get_age_model(df)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Mean Absolute Error: 10.13 years.


## Fill null values

In [4]:
def fix_null_values(df, age_model):
  # Replace the null values in the age column by predicting it using the random forest regressor
  # Prepare the data
  dfAge = df[df["Age"].isnull()]
  Xage = dfAge.drop(["Survived", "Age"], axis=1, inplace=True)

  # Convert the categorical features to numerical
  le = preprocessing.LabelEncoder()
  Xage = dfAge.apply(le.fit_transform)

  # Predict the age
  predictions = age_model.predict(Xage)

  # Replace the null values in the age column by the predicted values
  df.loc[df["Age"].isnull(), "Age"] = predictions

  # Round to the nearest integer
  df["Age"] = df["Age"].round().astype(int)

  # Train a kNN model to predict the survival of the passengers
  # Prepare the data
  y = df["Survived"]
  X = df.drop(["Survived"], axis=1)

  # Convert the categorical features to numerical
  le = preprocessing.LabelEncoder()
  X = df.apply(le.fit_transform)

  # Apply PCA to reduce the number of features
  # pca = PCA(n_components=6)
  # X = pca.fit_transform(X)

  return X, y

X, y = fix_null_values(df, age_model)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [6]:
# Initialize the kNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train, y_train)

# Test the model
predictions = knn.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion matrix:")
print(confusion_matrix(y_test, predictions))
print("Classification report:")
print(classification_report(y_test, predictions))

Accuracy: 0.757847533632287
Confusion matrix:
[[122  27]
 [ 27  47]]
Classification report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       149
           1       0.64      0.64      0.64        74

    accuracy                           0.76       223
   macro avg       0.73      0.73      0.73       223
weighted avg       0.76      0.76      0.76       223



In [7]:
# Fit a random forest classifier to the data
# Initialize the random forest classifier
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)

# Train the model
rf.fit(X_train, y_train)

# Test the model
predictions = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion matrix:")
print(confusion_matrix(y_test, predictions))
print("Classification report:")
print(classification_report(y_test, predictions))

Accuracy: 1.0
Confusion matrix:
[[149   0]
 [  0  74]]
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       149
           1       1.00      1.00      1.00        74

    accuracy                           1.00       223
   macro avg       1.00      1.00      1.00       223
weighted avg       1.00      1.00      1.00       223



# Save the features

In [8]:
# Convert the categorical features to numerical
# le = preprocessing.LabelEncoder()

# Convert the categorical features to numerical
# Convert male to 0 and female to 1

def sexToInt(x):
    if x == "male":
        return 0
    elif x == "female":
        return 1
    else:
      raise Exception("Unsupported sex value: " + x)

def embarkedToInt(x):
    if x == "S":
        return 0
    elif x == "C":
        return 1
    elif x == "Q":
        return 2
    else:
      raise Exception("Unsupported embarked value: " + x)

df["Sex"] =  df["Sex"].apply(sexToInt)
df["Embarked"] =  df["Embarked"].apply(embarkedToInt)
# df = df.apply(le.fit_transform)
# Save the data
df.columns= df.columns.str.lower()
df.to_csv("./titanicCleaned.csv", index= False)