In [758]:
import sys

sys.path.insert(0, "../")
import helpers
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

##### 1. Selecting most optimal solution with lazypredict

In [759]:
# Set random seed to ensure reproducibility of results
seed = 47
helpers.set_random_seed(seed=seed)

In [760]:
# Load train and test data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [761]:
# Show first 5 rows of train data
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [762]:
# Check missing values in train data
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [763]:
# With help of lazypredict library evaluating work of different solutions
helpers.lazy_predict(
    train_df=train_df.drop(
        columns=["PassengerId", "Name", "Cabin", "Ticket", "Embarked"]
    )
)

  3%|▎         | 1/29 [00:00<00:06,  4.25it/s]

 21%|██        | 6/29 [00:00<00:01, 16.58it/s]

CategoricalNB model failed to execute
Negative values in data passed to CategoricalNB (input X)


 97%|█████████▋| 28/29 [00:01<00:00, 17.78it/s]

StackingClassifier model failed to execute
StackingClassifier.__init__() missing 1 required positional argument: 'estimators'
[LightGBM] [Info] Number of positive: 306, number of negative: 495
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 801, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382022 -> initscore=-0.480973
[LightGBM] [Info] Start training from score -0.480973


100%|██████████| 29/29 [00:01<00:00, 16.78it/s]






Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LogisticRegression,0.87,0.87,0.87,0.87,0.02
LinearSVC,0.83,0.83,0.83,0.83,0.04
LinearDiscriminantAnalysis,0.83,0.83,0.83,0.83,0.02
CalibratedClassifierCV,0.83,0.83,0.83,0.83,0.07
RidgeClassifierCV,0.83,0.83,0.83,0.83,0.03
RidgeClassifier,0.83,0.83,0.83,0.83,0.04
NuSVC,0.83,0.83,0.83,0.83,0.04
SGDClassifier,0.82,0.82,0.82,0.82,0.02
RandomForestClassifier,0.82,0.82,0.82,0.82,0.27
AdaBoostClassifier,0.82,0.82,0.82,0.82,0.24


##### 2. Feature Engineering

In [764]:
# Display proportion of survived passengers considering their Pclass
train_df.groupby(["Pclass"]).Survived.mean().to_frame()
# Observation: Better Pclass results in higher survival rate

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.63
2,0.47
3,0.24


In [765]:
# Create a pivot table with Pclass as columns and Embarked as index
train_df.pivot_table(index="Embarked", columns="Pclass", aggfunc="size", fill_value=0)

Pclass,1,2,3
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,85,17,66
Q,2,3,72
S,127,164,353


In [766]:
# Display proportion of survived passengers considering their Embarked place
train_df.groupby(["Embarked"]).Survived.mean().to_frame()
# Observation: Being Embarked at point C results in the highest chances of survival. However, an assumption could be made
# that this is happening because of an increased number of passengers in the first Pclass.
# Therefore, the Embarked column will be skipped.

Unnamed: 0_level_0,Survived
Embarked,Unnamed: 1_level_1
C,0.55
Q,0.39
S,0.34


In [767]:
# Define and apply function checking if ticket contains letters
get_ticket_prefix_func: callable = lambda x: (
    "With_Letters" if not x.isnumeric() else "Just_Number"
)
train_df["Ticket_Type"] = train_df["Ticket"].apply(get_ticket_prefix_func)
# Display proportion of survived passengers considering their Ticket_Type
train_df.groupby(["Ticket_Type"]).Survived.mean()
# Observation: Seems like passengers with standard tickets containing only numbers have pretty much the
# same chance to survive as other passengers from crew or special groups having tickets with letters,
# so Ticket_Type column will not be used during training

Ticket_Type
Just_Number    0.38
With_Letters   0.38
Name: Survived, dtype: float64

In [768]:
# Define and apply function exctracting Title from Name
extract_title_func: callable = lambda x: x.split(",")[1].split(".")[0].strip()
train_df["Title"] = train_df["Name"].apply(extract_title_func)
test_df["Title"] = test_df["Name"].apply(extract_title_func)
# Display proportion of survived passengers considering their Title
train_df.groupby(["Title"]).Survived.mean().to_frame()
# Observation: For some people with certain Titles seems like there is really no chance for survival,
# at the same time some persons have surprisingly high chances to survive, so Name column will be transformed to
# column Person_Type based on Title and according to Age with Sex

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Capt,0.0
Col,0.5
Don,0.0
Dr,0.43
Jonkheer,0.0
Lady,1.0
Major,0.5
Master,0.57
Miss,0.7
Mlle,1.0


In [769]:
# Define mapping Title to Person_Type:
title_to_person_type = {
    **{
        key: "Adult Male"
        for key in [
            "Mr",
            "Dr",
            "Jonkheer",
            "Major",
            "Col",
            "Sir",
            "Capt",
            "Don",
            "Rev",
        ]
    },
    **{key: "Young Male" for key in ["Master"]},
    **{key: "Young Female" for key in ["Miss", "Mlle", "Ms"]},
    **{key: "Adult Female" for key in ["Mrs", "Mme", "Lady", "the Countess", "Dona"]},
}

# Apply mapping function for train and test data
train_df["Person_Type"] = train_df["Title"].map(title_to_person_type)
test_df["Person_Type"] = test_df["Title"].map(title_to_person_type)

In [770]:
# Display proportion of survived passengers considering newly created column Person_Type
train_df.groupby(["Person_Type"]).Survived.mean().to_frame()
# Observation: New column Person_Type has certain correlation with survival rate
# which will allow to skip column Sex and Age as well

Unnamed: 0_level_0,Survived
Person_Type,Unnamed: 1_level_1
Adult Female,0.8
Adult Male,0.16
Young Female,0.7
Young Male,0.57


In [771]:
# Display proportion of survived passengers considering SibSp - sum of siblings and spouces
train_df.groupby(["SibSp"]).Survived.mean().to_frame()

Unnamed: 0_level_0,Survived
SibSp,Unnamed: 1_level_1
0,0.35
1,0.54
2,0.46
3,0.25
4,0.17
5,0.0
8,0.0


In [772]:
# Display proportion of survived passengers considering Parch - sum of parents and childs
train_df.groupby(["Parch"]).Survived.mean().to_frame()

Unnamed: 0_level_0,Survived
Parch,Unnamed: 1_level_1
0,0.34
1,0.55
2,0.5
3,0.6
4,0.0
5,0.2
6,0.0


In [773]:
# Observation: SibSp and Parch columns have certain effect on survival rate, having few relatives
# increases survival rate, while having big family or being solo reduces chances for survival.
# So new column containing Family_Size will be created instead of SibSp and Parch
train_df["Family_Size"] = train_df["SibSp"] + train_df["Parch"] + 1
test_df["Family_Size"] = test_df["SibSp"] + test_df["Parch"] + 1
# Display proportion of survived passengers considering Family_Size
train_df.groupby(["Family_Size"]).Survived.mean().to_frame()

Unnamed: 0_level_0,Survived
Family_Size,Unnamed: 1_level_1
1,0.3
2,0.55
3,0.58
4,0.72
5,0.2
6,0.14
7,0.33
8,0.0
11,0.0


In [774]:
# Observation: Seems to be good to make 4 groups for Family_Size column based on similiar chances for survival
# with following sizes [1] - Solo, [2, 3] - Small, [4] - Medium, [5, 11] - Big
labels = ["Solo", "Small", "Medium", "Big"]
train_df["Family_Size"] = pd.cut(
    x=train_df.Family_Size, bins=[0, 1, 3, 4, 11], labels=labels
)
test_df["Family_Size"] = pd.cut(
    x=test_df.Family_Size, bins=[0, 1, 3, 4, 11], labels=labels
)
# Display proportion of survived passengers considering updated Family_Size
train_df.groupby(["Family_Size"]).Survived.mean().to_frame()

Unnamed: 0_level_0,Survived
Family_Size,Unnamed: 1_level_1
Solo,0.3
Small,0.56
Medium,0.72
Big,0.16


##### 3. Model Evalution

In [775]:
categorical_columns = ["Person_Type", "Family_Size"]
numerical_columns = ["Pclass"]
x_train = train_df[categorical_columns + numerical_columns]
y_train = train_df["Survived"]
x_train.head()

Unnamed: 0,Person_Type,Family_Size,Pclass
0,Adult Male,Small,3
1,Adult Female,Small,1
2,Young Female,Solo,3
3,Adult Female,Small,1
4,Adult Male,Solo,3


In [776]:
# One-hot encoding of categorical columns
x_train = pd.get_dummies(x_train, columns=categorical_columns)
x_train.head()

Unnamed: 0,Pclass,Person_Type_Adult Female,Person_Type_Adult Male,Person_Type_Young Female,Person_Type_Young Male,Family_Size_Solo,Family_Size_Small,Family_Size_Medium,Family_Size_Big
0,3,False,True,False,False,False,True,False,False
1,1,True,False,False,False,False,True,False,False
2,3,False,False,True,False,True,False,False,False
3,1,True,False,False,False,False,True,False,False
4,3,False,True,False,False,True,False,False,False


In [777]:
# Create validation data as part of train data
_, x_validation, _, y_validation = train_test_split(
    x_train, y_train, test_size=0.3, random_state=seed
)

In [778]:
# Initialize and train the Logistic Regression model
kwargs = {
    "C": 0.2,
    "penalty": "l1",
    "solver": "liblinear",
}
log_reg = LogisticRegression(**kwargs)
log_reg.fit(x_train, y_train)

In [779]:
# Make predictions for train data
y_pred = log_reg.predict(x_validation)
# Evaluate the model
accuracy = accuracy_score(y_validation, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# Perform cross-validation
scores = cross_val_score(log_reg, x_validation, y_validation, cv=5)
print(f"Mean cross-validation score: {scores.mean():.4f}")

Accuracy: 0.8284
Mean cross-validation score: 0.8247


In [781]:
# Make predictions for test data
x_test = test_df[categorical_columns + numerical_columns]
x_test = pd.get_dummies(x_test, columns=categorical_columns)
y_pred = log_reg.predict(x_test)
y_pred = pd.DataFrame(y_pred, index=test_df.PassengerId, columns=["Survived"])
y_pred.to_csv("submission.csv")
y_pred

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0
