In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [18]:
# Load the Titanic passenger table. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="Titanic-Dataset.csv")

# Preview the first five rows (five is the default for head()).
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
# Print each column's dtype and non-null count; columns with fewer non-null
# entries than the row count contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


I would like to start with a simple logistic regression (Survived is a binary target, and the model fitted below is `LogisticRegression`). To begin, I need to decide which features to use.

PassengerId, Name, and Ticket can be dropped first since they should have very little influence on survival. Cabin is dropped as well because most of its values are missing (only 204 of 891 rows are non-null).

In [20]:
# Remove identifier-like columns and the mostly-empty Cabin column.
# errors="ignore" makes this cell safe to re-run: because the result is
# assigned back to `df`, a second execution would otherwise raise KeyError
# when the columns are already gone.
df = df.drop(columns=["PassengerId", "Ticket", "Name", "Cabin"], errors="ignore")


I will use Sex, Pclass, Age, Fare, and Embarked as candidate features (SibSp and Parch stay in the DataFrame but are not used below).

In [21]:
# One-hot encode the categorical columns. drop_first=True removes the first
# level of each column (female, C), leaving Sex_male, Embarked_Q and Embarked_S.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise emit True/False booleans).
# NOTE: rows with a missing Embarked value get 0 in both Embarked_* columns,
# so they are indistinguishable from the dropped baseline level.
df = pd.get_dummies(df, columns=["Sex", "Embarked"], drop_first=True, dtype=int)


In [22]:
# Impute missing values with each column's median. Only Age has gaps in this
# file (714 of 891 non-null); Fare is already complete, so its fill changes
# nothing here.
# NOTE(review): the medians are computed on the full table, before the
# train/validation split further down, so validation rows contribute to them.
df = df.assign(
    Age=lambda frame: frame["Age"].fillna(frame["Age"].median()),
    Fare=lambda frame: frame["Fare"].fillna(frame["Fare"].median()),
)

In [23]:
# Preview the frame after dropping, encoding and imputing: the remaining
# columns are all numeric.
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [24]:
# Candidate predictors for the initial fits: passenger class, age, fare and
# the one-hot indicators. SibSp and Parch are not included in this list.
features = ["Pclass", "Age", "Fare", "Sex_male", "Embarked_Q", "Embarked_S"]

# Design matrix and binary target.
X, y = df[features], df["Survived"]


In [25]:
# Baseline fit on the full table using scikit-learn's defaults, apart from a
# higher iteration cap (max_iter=1000). fit() returns the estimator itself.
model = LogisticRegression(max_iter=1000).fit(X, y)

# Coefficients labelled by feature, ordered from most negative to most positive.
pd.Series(data=model.coef_[0], index=X.columns).sort_values(ascending=True)

Sex_male     -2.488470
Pclass       -1.131345
Embarked_S   -0.489260
Age          -0.032808
Embarked_Q   -0.002370
Fare          0.000039
dtype: float64

In [26]:
# L1-penalized fit on the same features. L1 can shrink coefficients to exactly
# zero, so the result is used as a rough feature screen.
# liblinear relies on a random number generator while fitting, so the seed is
# fixed to keep the coefficients reproducible across runs.
# NOTE(review): X is passed in its original units here (no scaling), so the
# penalty does not weigh features on different scales equally.
model_2 = LogisticRegression(penalty="l1", solver="liblinear", random_state=42)
model_2.fit(X, y)

# Coefficients in the original feature order.
pd.Series(model_2.coef_[0], index=X.columns)

Pclass       -1.080030
Age          -0.030276
Fare          0.000548
Sex_male     -2.503122
Embarked_Q    0.000000
Embarked_S   -0.434744
dtype: float64

In [27]:
# Reduced feature set: both Embarked indicators are removed. In the L1 fit above
# only Embarked_Q was shrunk to exactly zero; Embarked_S kept a non-zero weight.
# NOTE(review): confirm that dropping Embarked_S as well is intended.
features = ["Sex_male", "Pclass", "Age", "Fare"]
X, y = df[features], df["Survived"]

In [28]:
# Hold out 20% of the rows for validation. stratify=y keeps the Survived class
# proportions the same in both parts; random_state pins the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [29]:

# Standardize the features, then fit an L1-penalized logistic regression.
# Because the scaler lives inside the pipeline, its statistics are learned only
# from the data passed to fit().
# random_state is fixed because liblinear relies on a random number generator
# while fitting; without a seed the coefficients can vary between runs.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        penalty="l1",
        solver="liblinear",
        C=1.0,
        random_state=42,
    ))
])

In [30]:
# Fit the scaler and the classifier on the training split only; the fitted
# pipeline is displayed as the cell output.
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 LogisticRegression(penalty='l1', solver='liblinear'))])

In [31]:
# Hard class predictions on the validation split, scored by plain accuracy.
y_pred = pipe.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.7988826815642458

In [32]:

# Confusion matrix on the validation split: rows are the true classes,
# columns are the predicted classes.
confusion_matrix(y_true=y_val, y_pred=y_pred)

array([[94, 16],
       [20, 49]])

In [33]:
# Take the second column of predict_proba as the score for the positive class
# and use it to compute ROC AUC on the validation split.
y_prob = pipe.predict_proba(X_val)[:, 1]
roc_auc_score(y_true=y_val, y_score=y_prob)

0.8391304347826087

In [34]:
# Coefficients of the classifier step. They apply to the standardized inputs,
# so their magnitudes are not comparable with the unscaled fits earlier on.
coefs = pipe.named_steps["model"].coef_[0]
pd.Series(data=coefs, index=features).sort_values(ascending=True)

Sex_male   -1.243454
Pclass     -0.951269
Age        -0.433616
Fare        0.043914
dtype: float64

In [35]:
# Show the column order of the design matrix. A bare last expression is used
# instead of print() so the notebook renders it as the cell's output.
X.columns

Index(['Sex_male', 'Pclass', 'Age', 'Fare'], dtype='object')
