In [99]:
#### Dependencies ####
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [100]:
#### Loading the data ####
train_set = pd.read_csv("Data/train.csv")
test_set = pd.read_csv("Data/test.csv")

In [101]:
#### Checking the data ####
train_set.describe()
train_set.info()
train_set.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [102]:
#### Getting rid of useless column ####
def drop_columns(data, columns):
    for column in columns:
        data.drop(column, axis=1, inplace=True)
    return data

train_set = drop_columns(train_set, ["PassengerId","Name", "Ticket", "Fare", "Cabin"])
test_set = drop_columns(test_set, ["PassengerId","Name", "Ticket", "Fare", "Cabin"])

In [103]:
#### Checking the data ####
def missing_report(dataset):
    for feature in dataset.columns:
        if np.dtype(dataset[feature]) == "object":
            # categorical feature
            misses = dataset[feature].isnull().sum() + dataset[feature].isna().sum() + dataset[feature].values.tolist().count("")
            if misses > 0:
                print(f"{feature} has {dataset[feature].isnull().sum()} missing values")
        else:
            # numerical feature
            misses = dataset[feature].isnull().sum() + dataset[feature].isna().sum()
            if misses > 0:
                print(f"{feature} has {dataset[feature].isnull().sum()} missing values")


print("Train report : ")
missing_report(train_set)

print("Test report : ")
missing_report(test_set)

# Age has missing values, we need to take care of it

Train report : 
Age has 177 missing values
Embarked has 2 missing values
Test report : 
Age has 86 missing values


In [104]:
#### Filling missing values ####
# For embarked, we will drop the missing values
train_set.dropna(subset=["Embarked"], inplace=True)

# For Age, we will perform a tracked imputation by filling the missing values with the median
def track_impute(dataset, feature):
    # index containing missing values
    missing_index = dataset[dataset[feature].isna()].index
    dataset[feature + 'Missing'] = False
    dataset.loc[missing_index, feature + 'Missing'] = True
    dataset[feature].fillna(dataset[feature].median(), inplace=True)
    return dataset

train_set = track_impute(train_set, "Age")
test_set = track_impute(test_set, "Age")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[feature].fillna(dataset[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[feature].fillna(dataset[feature].median(), inplace=True)


In [105]:
#### Performing AdaBoost ####
# Basic Classifier to compare with AdaBoost
print("Basic Classifier Accuracy : ",sum(train_set["Survived"] == 0)/len(train_set))

Basic Classifier Accuracy :  0.6175478065241845


In [106]:
#### Preprocessing the data ####
# We need to convert the categorical features to numerical, here label encoding is enough for 2 levels
def one_hot_encode(dataset, features):
    for feature in features:
        dataset = pd.concat([dataset, pd.get_dummies(dataset[feature], prefix=feature)], axis=1)
        dataset.drop(feature, axis=1, inplace=True)
    return dataset

# train_set = hot_one_encode(train_set,["Sex","AgeMissing"])
train_set = one_hot_encode(train_set,["Pclass","Sex","AgeMissing","Embarked"])
test_set = one_hot_encode(test_set,["Pclass","Sex","AgeMissing","Embarked"])

# We need to scale the numerical data to have a better performance
def scale_data(dataset, features):
    scaler = StandardScaler()
    dataset[features] = scaler.fit_transform(dataset[features])
    return dataset

train_set = scale_data(train_set, ["Age"])
test_set = scale_data(test_set, ["Age"])

In [107]:
train_set.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,AgeMissing_False,AgeMissing_True,Embarked_C,Embarked_Q,Embarked_S
0,0,-0.563674,1,0,False,False,True,False,True,True,False,False,False,True
1,1,0.669217,1,0,True,False,False,True,False,True,False,True,False,False
2,1,-0.255451,0,0,False,False,True,True,False,True,False,False,False,True
3,1,0.43805,1,0,True,False,False,True,False,True,False,False,False,True
4,0,0.43805,0,0,False,False,True,False,True,True,False,False,False,True


In [108]:
### Building the model, we want to observe some results so we split the training set before complete evaluation ###
abc = AdaBoostClassifier(n_estimators=100)
X= train_set.drop("Survived", axis=1)
y = train_set["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
abc_model = abc.fit(X_train, y_train)

In [109]:
#### Predicting the test set ####
y_preds = abc_model.predict(X_test)
print("Accuracy : ", metrics.accuracy_score(y_test, y_preds)) # this is better than the basic classifier

Accuracy :  0.8033707865168539


In [110]:
#### Tuning the model ####
# We will perform a grid search to find the best hyperparameters
# param_grid = {'learning_rate': np.linspace(0.1, 1, 20), 'n_estimators': np.arange(20, 200, 20)}
# grid_search = GridSearchCV(estimator=abc, param_grid=param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# Get best learning rate
# best_learning_rate = grid_search.best_params_['learning_rate']
# best_n_estimators = grid_search.best_params_['n_estimators']

In [111]:
#### Rebuilding the model with the best hyperparameters ####
# lr = best_learning_rate # 0.7157894736842105
# nb_est = best_n_estimators # 80
lr = 0.7157894736842105
nb_est = 80
abc_tuned = AdaBoostClassifier(n_estimators=nb_est, learning_rate=lr)
abc_tuned_model = abc_tuned.fit(X_train, y_train)
print("Accuracy : ", metrics.accuracy_score(y_test, abc_tuned_model.predict(X_test))) # this is better than the basic classifier

Accuracy :  0.8033707865168539


In [112]:
#### Fitting the model on the whole training set ####
abc_model = abc_tuned_model.fit(X, y)

In [113]:
#### Predicting the test set and render it to the Kaggle submission format ####
titanic_preds = abc_model.predict(test_set)
submission = pd.DataFrame({"PassengerId": range(892,1310), "Survived": titanic_preds})
submission_path = "Data/titanic_submission.csv"

# Save the CSV file to the repository
with open(submission_path, "w") as file:
    submission.to_csv(file, index=False)

print("finished")

finished
