In [None]:
import numpy as np
import pandas as pd 
import pandas as pd
import scipy.stats as stats
from sklearn.utils import resample
from sklearn.preprocessing import MultiLabelBinarizer

# Read the training and test tables from CSV files in the working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# --- Preprocessing switches ---------------------------------------------
# DOWNSAMPLING: True  -> shrink both classes to the size of the smaller one
#               False -> oversample so both classes reach the larger size
# CABIN / AGE / FARE toggle the feature-engineering steps further down.
DOWNSAMPLING = False
CABIN = True
AGE = True
FARE = True

# Single seed for every random draw in this cell so Restart & Run All
# reproduces the same balanced training frame.
RANDOM_STATE = 42


if DOWNSAMPLING:
    sur = df_train[df_train.Survived == 1]
    die = df_train[df_train.Survived == 0]
    min_size = min(len(sur), len(die))
    # replace=False: sklearn's resample bootstraps (samples WITH replacement)
    # by default, which would duplicate some rows and silently drop others
    # instead of simply subsampling each class.
    sur_down = resample(sur, replace=False, n_samples=min_size, random_state=RANDOM_STATE)
    die_down = resample(die, replace=False, n_samples=min_size, random_state=RANDOM_STATE)
    df_train = pd.concat([sur_down, die_down])
else:
    # Size of the most frequent 'Survived' class; every class is topped up
    # to this many rows.
    max_size = df_train['Survived'].value_counts().max()
    lst = [df_train]
    for class_index, group in df_train.groupby('Survived'):
        # Draws 0 extra rows for the largest class, and the missing number
        # (with replacement) for the others.
        lst.append(
            group.sample(max_size - len(group), replace=True, random_state=RANDOM_STATE)
        )
    df_train = pd.concat(lst)

# Resampling leaves repeated / shuffled index labels behind; use a clean
# 0..n-1 index so later index-based alignment is unambiguous.
# NOTE(review): rows duplicated here can end up on both sides of any later
# train/validation split, which inflates validation scores.
df_train = df_train.reset_index(drop=True)


# Columns that are dropped from both frames once feature engineering is done.
removed = ['Name', 'Embarked', 'Ticket', 'SibSp', 'Parch']

if not CABIN:
    # Cabin is not used at all: schedule it for removal and drop the id
    # column from the training frame.
    removed.append('Cabin')
    df_train.drop(['PassengerId'], axis=1, inplace=True)
else:
    def extract_cabin_letters(cabin):
        """Return the first character of every whitespace-separated token in
        ``cabin``; a missing value yields an empty list."""
        return [] if pd.isna(cabin) else [token[0] for token in cabin.split()]

    df_train['CabinList'] = df_train['Cabin'].apply(extract_cabin_letters)
    df_test['CabinList'] = df_test['Cabin'].apply(extract_cabin_letters)

    # Fit the multi-hot encoder on the training letters only and reuse the
    # fitted classes for the test frame.
    mlb = MultiLabelBinarizer()
    train_one_hot = pd.DataFrame(
        mlb.fit_transform(df_train['CabinList']),
        columns=mlb.classes_,
        index=df_train.index,
    )
    test_one_hot = pd.DataFrame(
        mlb.transform(df_test['CabinList']),
        columns=mlb.classes_,
        index=df_test.index,
    )

    # Swap the raw cabin columns for the indicator columns. The training
    # frame also loses PassengerId here; the test frame keeps it.
    train_without_cabin = df_train.drop(columns=['CabinList', 'Cabin', 'PassengerId'])
    test_without_cabin = df_test.drop(columns=['CabinList', 'Cabin'])
    df_train = pd.concat([train_without_cabin, train_one_hot], axis=1)
    df_test = pd.concat([test_without_cabin, test_one_hot], axis=1)
    

if AGE:
    # Mean of the observed training ages. Series.mean() skips NaN and keeps
    # fractional ages intact (the previous int() cast truncated them, which
    # pulled the mean down).
    age_mean = df_train['Age'].mean()

    # Fill missing ages in both frames with the training mean so the test
    # frame is imputed with the same statistic the model was trained on.
    df_train['Age'] = df_train['Age'].fillna(age_mean)
    df_test['Age'] = df_test['Age'].fillna(age_mean)

if FARE:
    # Mean of the observed training fares. Series.mean() skips NaN and keeps
    # the fractional part of every fare (the previous int() cast truncated
    # each value before averaging).
    fare_mean = df_train['Fare'].mean()

    # Fill missing fares in both frames with the training mean.
    df_train['Fare'] = df_train['Fare'].fillna(fare_mean)
    df_test['Fare'] = df_test['Fare'].fillna(fare_mean)
else:
    # Fare is not used: schedule the column for removal.
    removed.append('Fare')


# Drop every column collected in `removed` from both frames.
df_train = df_train.drop(columns=removed)
df_test = df_test.drop(columns=removed)

# Encode Sex as 0/1. The mapping is applied to the 'Sex' column only, so a
# literal 'male'/'female' in any other column is never touched, and the
# result is a numeric column without relying on DataFrame.replace's
# deprecated implicit downcasting of object columns.
# Values other than 'male'/'female' become NaN.
SEX_CODES = {'male': 0, 'female': 1}
df_train['Sex'] = df_train['Sex'].map(SEX_CODES)
df_test['Sex'] = df_test['Sex'].map(SEX_CODES)

In [None]:
import tensorflow_decision_forests as tfdf
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Hold out 20% of the prepared training frame for validation.
# NOTE(review): if df_train was resampled with replacement earlier in the
# notebook, duplicated rows can land on both sides of this split and make
# the report below optimistic.
X = df_train.drop(['Survived'], axis=1)
y = df_train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-DF trains from a single frame that carries the label column.
train = pd.concat([X_train, y_train], axis=1)
print(train.columns)

# Alternative scikit-learn estimators (these use clf.fit(X_train, y_train)
# and clf.predict(X_test) directly instead of the TF-DF dataset calls below):
# clf = RandomForestClassifier(max_depth=2, random_state=0)
# clf = LogisticRegression(random_state=0, max_iter=1000)
# clf = svm.SVC()
clf = tfdf.keras.GradientBoostedTreesModel(
    verbose=0,  # Very few logs
    # Input features only: the label 'Survived' must not be listed here.
    features=[tfdf.keras.FeatureUsage(name=n) for n in X_train.columns],
    exclude_non_specified_features=True,  # Only use the features in "features"
    random_seed=1234,
)

# Train once, on the TF dataset built from the labelled frame.
clf.fit(
    tfdf.keras.pd_dataframe_to_tf_dataset(
        train,
        label="Survived")
)

# Predict through the same DataFrame -> tf.data conversion used for training.
# For a binary label TF-DF is expected to return the positive-class
# probability per row (shape (n, 1)) -- TODO confirm against the installed
# version. classification_report needs hard labels, so flatten and threshold.
y_prob = clf.predict(tfdf.keras.pd_dataframe_to_tf_dataset(X_test))
y_pred = (np.asarray(y_prob).ravel() >= 0.5).astype(int)
print(classification_report(y_test, y_pred))

In [None]:
# Build the submission file: one row per test passenger with a 0/1 label.
# Note: X_test / y_test / y_pred are rebound here and no longer refer to the
# validation split.
y_test = df_test['PassengerId']
X_test = df_test.drop('PassengerId', axis=1)

# The TF-DF model consumes a tf.data dataset, not a raw DataFrame. For a
# binary label it is expected to return positive-class probabilities
# (shape (n, 1)) -- TODO confirm against the installed version -- so flatten
# and threshold at 0.5 to obtain integer class labels.
y_prob = clf.predict(tfdf.keras.pd_dataframe_to_tf_dataset(X_test))
y_pred = (np.asarray(y_prob).ravel() >= 0.5).astype(int)

# Reuse the PassengerId index so ids and predictions line up row by row even
# if df_test does not carry a default 0..n-1 index.
result_df = pd.DataFrame({'Survived': y_pred}, index=y_test.index)

result = pd.concat([y_test, result_df], axis=1)
result.to_csv('prediction.csv', index=False)