In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.linalg import svd
import matplotlib.patches as mpatches
from sklearn import model_selection

# Load the Titanic training set from the dataset folder next to this notebook
df = pd.read_csv('../dataset/train.csv')

# Keep the more descriptive name bound to the very same dataframe object
titanic_data = df

# Preview the first rows to recall what the columns look like
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


From our previous analysis, the fare price and the age were the only continuous variables that we identified in the dataset, and looking at the head() again, we can clearly see that this is the case again.

In our analysis, fare price is of the greatest interest (dependent variable), as it varies greatly and showed a tremendous number of outliers in the previous analysis. The attributes of interest (independent variables) in this case are passenger class (Pclass), sex (Sex), age (Age), number of siblings/spouses aboard (SibSp), number of parents/children aboard (Parch) and port of embarkation (Embarked).

To predict fare, Pclass is of interest because the passenger classes on the Titanic had certain price thresholds.

Age too, as typically young people would not be able to afford the vast sums of fare prices as older people would've had their whole lives to earn their fortunes, although exceptions to this might be people coming of age, who would've had their tickets bought by their relatives. 

Sex could also influence the fare prices as historically women weren't breadwinners and their tickets would then be paid for by a husband who could afford such fees. 

SibSp and Parch are also of interest because sibling/spouse and parent/child counts indicate family size: larger family groups might have received family discounts, or parents with many children might have spent less per child to keep the trip affordable.

Finally we have Embarked, because perhaps some ports, such as Queenstown, were busier than others, such as Cherbourg, which would increase competition for tickets on board the Titanic and thereby drive up the prices from those ports.

So what is the goal of this particular regression analysis? 

We simply want to predict future fare price values based on our independent variables mentioned above and we want to find the optimal regression model that generalizes the best to new instances of data.

In [23]:
# Remember how there were a lot of missing values in our previous analysis, let's check for that again.
# A single vectorised isna().sum() over the columns of interest replaces seven copy-pasted checks;
# the printed report is unchanged.
columns_to_check = ['Age', 'Pclass', 'Sex', 'Embarked', 'Parch', 'SibSp', 'Fare']
missing_counts = df[columns_to_check].isna().sum()

for column_name, n_missing in missing_counts.items():
    print(f"Missing values in '{column_name}' column: {n_missing}")

# Keep the individual counters available under their original names
# (same order as columns_to_check).
(missing_age, missing_pClass, missing_sex, missing_embarked,
 missing_parch, missing_sibSp, missing_fare) = missing_counts.to_numpy()

# First, let's drop these irrelevant columns, because they don't really add much value to our analysis
df_clean = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin','Survived'])

df_clean.head()


Missing values in 'Age' column: 177
Missing values in 'Pclass' column: 0
Missing values in 'Sex' column: 0
Missing values in 'Embarked' column: 2
Missing values in 'Parch' column: 0
Missing values in 'SibSp' column: 0
Missing values in 'Fare' column: 0


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [24]:
# Impute the gaps found above so that every row stays usable for the regression.
# Age: missing entries are replaced by the median age of the original data.
age_fill_value = df['Age'].median()

# Embarked: missing entries are replaced by the most common port (the mode), because a
# categorical column has no numeric median and we need it complete before encoding it
# numerically for the regression.
embarked_fill_value = df['Embarked'].mode()[0]

df_clean = df_clean.assign(
    Age=df_clean['Age'].fillna(age_fill_value),
    Embarked=df_clean['Embarked'].fillna(embarked_fill_value),
)

In [25]:
# Turn the string/categorical columns into numeric indicator columns via one-hot encoding.
# drop_first=True removes one level per variable, so e.g. Sex becomes a single Sex_male
# column (True for male, False for female) instead of two redundant columns.
categorical_columns = ['Sex', 'Pclass', 'Embarked']
df_clean = pd.get_dummies(df_clean, columns=categorical_columns, drop_first=True)

# Inspect the encoded frame
df_clean.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,22.0,1,0,7.25,True,False,True,False,True
1,38.0,1,0,71.2833,False,False,False,False,False
2,26.0,0,0,7.925,False,False,True,False,True
3,35.0,1,0,53.1,False,False,False,False,True
4,35.0,0,0,8.05,True,False,True,False,True


In [26]:
# Now finally, let's standardize the data
# StandardScaler z-scores every column: Z = (X - mu) / sigma, where mu and sigma are the
# mean and standard deviation of that column.
scaler = StandardScaler()

# Fare is the regression target, so take it out before scaling: leaving it in would put a
# standardized copy of the target into the feature matrix and leak the answer to the model.
feature_frame = df_clean.drop(columns=['Fare'])

# Fit the scaler on the features only. The name df_clean is kept because the following
# cell builds the design matrix from it.
df_clean = scaler.fit_transform(feature_frame)


In [None]:
# Add an offset column so the model can learn an intercept: with a leading column of
# ones, y = Xw absorbs the bias term b into the first weight.
# np.ones((df_clean.shape[0], 1)) creates a column of ones with one row per observation and
# np.hstack() places it to the left of the scaled data held in df_clean.
# NOTE(review): X must contain predictors only - confirm df_clean does not still hold a
# Fare column at this point, otherwise the target leaks into the features.
X = np.hstack([np.ones((df_clean.shape[0], 1)), df_clean])
# Fare column is our target variable (taken from the original, unscaled dataframe)
y = df['Fare'].values

# Now let's set up cross validation with 10 folds
K = 10  # Number of folds
# Fix the shuffle seed so the fold assignment, and every error estimate derived from it,
# is identical on each Restart & Run All.
kf = model_selection.KFold(K, shuffle=True, random_state=42)

In [None]:
# Candidate regularization strengths: one value per power of ten from 10^-5 up to 10^8,
# the same grid used in the examples we've looked through.
lambda_exponents = np.arange(-5, 9)
lambdas = 10.0 ** lambda_exponents