In [4]:
import pandas as pd

# Read the training CSV into a DataFrame; the trailing expression makes the
# notebook render the first five rows as a quick sanity check.
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Print dtypes and non-null counts per column to see which columns contain
# missing values before preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Preprocess data

In [None]:
def preprocess(data: pd.DataFrame, category_mapping: "dict | None" = None) -> tuple[pd.DataFrame, dict]:
    """Preprocess the passenger data for training or inference.

    Fills missing ``Age`` values with the column mean and missing ``Embarked``
    values with ``'S'``, drops ``PassengerId``, ``Name``, ``Ticket`` and
    ``Cabin``, and integer-encodes ``Sex`` and ``Embarked``.

    Args:
        data (pd.DataFrame): data to be preprocessed. It is not modified; the
            function works on a copy.
        category_mapping (dict | None): optional mappings previously returned
            by this function, shaped ``{'sex': {...}, 'embarked': {...}}``.
            When given, they are reused instead of being derived from ``data``
            so that new data is encoded with the same codes as the training
            data. Labels missing from a supplied mapping are encoded as NaN.

    Returns:
        tuple[pd.DataFrame, dict]: preprocessed data and the category mappings
        that were applied.
    """
    _data = data.copy()

    # fill missing values
    # NOTE(review): the Age mean is computed from the frame passed in, so it
    # differs between training and inference data.
    _data['Age'] = _data['Age'].fillna(_data['Age'].mean())
    _data['Embarked'] = _data['Embarked'].fillna('S')

    # drop columns that are not useful for prediction or have too many missing values
    _data = _data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

    # encode categorical columns
    if category_mapping is None:
        # Codes follow the order of first appearance in this frame, so they
        # depend on row order; pass the returned mapping back in when encoding
        # other data to keep the codes stable.
        sex_mapping = {label: idx for idx, label in enumerate(_data["Sex"].unique())}
        embarked_mapping = {label: idx for idx, label in enumerate(_data["Embarked"].unique())}
    else:
        # Copy so the caller's mapping object is never shared with the result.
        sex_mapping = dict(category_mapping['sex'])
        embarked_mapping = dict(category_mapping['embarked'])

    _data['Sex'] = _data['Sex'].map(sex_mapping)
    _data['Embarked'] = _data['Embarked'].map(embarked_mapping)

    return _data, {'sex': sex_mapping, 'embarked': embarked_mapping}

# Preprocess the raw frame, print the label -> integer code mappings that were
# used for Sex and Embarked, and preview the encoded result.
processed_df, category_mapping = preprocess(df)
print(category_mapping)
processed_df.head()

{'sex': {'male': 0, 'female': 1}, 'embarked': {'S': 0, 'C': 1, 'Q': 2}}


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,0
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.925,0
3,1,1,1,35.0,1,0,53.1,0
4,0,3,0,35.0,0,0,8.05,0


In [17]:
# save preprocessed data for reproducibility
# (index=False keeps the DataFrame row index out of the written CSV)
processed_df.to_csv('data/train_processed.csv', index=False)

In [15]:
def get_features_and_target(data: pd.DataFrame, target: str = 'Survived') -> tuple[pd.DataFrame, pd.Series]:
    """Split a frame into the feature matrix and the target column.

    Args:
        data (pd.DataFrame): frame holding both the feature columns and the
            target column. It is not modified.
        target (str): name of the label column. Defaults to ``'Survived'``.

    Returns:
        tuple[pd.DataFrame, pd.Series]: ``(X, y)`` where ``X`` is ``data``
        without the target column and ``y`` is the target column.

    Raises:
        KeyError: if ``target`` is not a column of ``data``.
    """
    X = data.drop(columns=[target])
    y = data[target]
    return X, y

# Separate the feature columns from the 'Survived' label and preview the
# feature matrix.
X, y = get_features_and_target(processed_df)
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,0
1,1,1,38.0,1,0,71.2833,1
2,3,1,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,3,0,35.0,0,0,8.05,0


In [18]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Random seed shared by the split below and the models trained later.
SEED = 0
# Fraction of rows held out as the test set.
TEST_SIZE = 0.2

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)

## Create dataset and train model

In [19]:
# create the LightGBM dataset
# The held-out Dataset is created with reference=train_data, which the LightGBM
# docs ask for when a Dataset is used for validation against that training set.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [None]:
# train the model
# Parameter reference: https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=0.1,
    num_leaves=31,
    seed=SEED,
)

# Run up to 1000 boosting rounds, stopping early once the validation score has
# not improved for 10 consecutive rounds.
# NOTE(review): the set monitored for early stopping is built from
# X_test / y_test, the same rows scored in the evaluation cell, so the reported
# accuracy may be optimistic. Consider a separate validation split.
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(10)],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[34]	valid_0's binary_logloss: 0.381405


In [30]:
# evaluate the model
# The booster trained with the binary objective predicts probabilities, so
# threshold them at 0.5 to obtain 0/1 class labels before scoring.
y_pred = [int(probability > 0.5) for probability in model.predict(X_test)]
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8324


## Hyperparameter tuning with GridSearchCV

In [37]:
from sklearn.model_selection import GridSearchCV

# Tuning guide: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
# 3 x 3 x 3 = 27 candidate combinations.
# NOTE(review): the tuning guide recommends num_leaves below 2^max_depth; with
# max_depth=3 a tree has at most 8 leaves, so the three num_leaves values are
# presumably equivalent there. Confirm and trim the grid if so.
param_grid = dict(
    num_leaves=[31, 50, 100],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
)

# Create the LightGBM model
lgb_model = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    seed=SEED,
)

# Exhaustive search with 5-fold cross-validation, scored on accuracy and run
# on all available cores.
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameters that produced it.
print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
0.8342164877376146
{'learning_rate': 0.1, 'max_depth': 3, 'num_leaves': 31}


In [38]:
# evaluate the model
# best_estimator_ is the estimator GridSearchCV refit with the best parameter
# combination found during the search.
best_model = grid_search.best_estimator_
# LGBMClassifier.predict already returns class labels (unlike Booster.predict,
# which returns probabilities), so no 0.5 thresholding is needed here.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8436


## Save model and mapping for inference

In [40]:
import os
import pickle

# Make sure the output directory exists; otherwise both writes below fail on a
# fresh checkout where models/ has not been created yet.
os.makedirs('models', exist_ok=True)

# NOTE(review): this persists `model` (the booster from lgb.train), not the
# grid-searched `best_model`. Confirm that is the intended v0 artifact.
model.save_model('models/model_v0.txt')

# Persist the label -> code mappings so inference can encode Sex / Embarked
# exactly as they were encoded for training.
with open('models/category_mapping.pkl', 'wb') as f:
    pickle.dump(category_mapping, f)