In [75]:
from itertools import product
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from sklearn import svm
from sklearn.base import clone
from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import resample
# Ignore every warning category from here on.
simplefilter("ignore")

# Read the labelled (train) and unlabelled (test) CSV files from the working directory.
original_df_train, original_df_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [76]:
# Show column dtypes and non-null counts of the training frame.
original_df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
def train(original_df_train, original_df_test, DATA, CABIN, AGE, EMBARKED, SIBSP, PARCH, name, clf):
    """Build features from the training frame, fit ``clf`` and score it on a 20% hold-out.

    Args:
        original_df_train: Labelled passengers. The code reads the columns
            ``Survived``, ``PassengerId``, ``Name``, ``Ticket``, ``Sex``,
            ``Age``, ``SibSp``, ``Parch``, ``Cabin`` and ``Embarked``. A deep
            copy is taken, so the caller's frame is not modified.
        original_df_test: Not needed to compute the score and therefore not
            touched; kept so ``train`` and ``predict`` share one signature.
        DATA: Class-balancing mode. ``1`` draws ``min(class sizes)`` rows from
            each class with ``resample``; ``2`` appends rows sampled with
            replacement until both classes have the size of the larger one;
            any other value leaves the data as is.
        CABIN: If true, one-hot encode the first letter of every
            whitespace-separated cabin token; otherwise drop ``Cabin``.
        AGE: If true, fill missing ages with the median age of the training
            rows that share the same title parsed from ``Name``.
        EMBARKED: If true, encode ``Embarked`` as S=0, C=1, Q=2; otherwise
            drop the column.
        SIBSP: Keep (true) or drop (false) ``SibSp``.
        PARCH: Keep (true) or drop (false) ``Parch``.
        name: Label copied into the result.
        clf: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        dict with the keys ``dataConfig`` (flag summary string), ``name``,
        ``score`` (accuracy on the hold-out split) and ``model`` (``clf``).

    Raises:
        KeyError: If one of the columns listed above is missing.
    """
    seed = 42
    df_train = original_df_train.copy(deep=True)

    if DATA == 1:  # Downsampling
        sur = df_train[df_train.Survived == 1]
        die = df_train[df_train.Survived == 0]
        min_size = min(len(sur), len(die))
        # NOTE(review): `resample` is called without `replace=`; per the
        # scikit-learn docs it then samples WITH replacement, so rows may
        # repeat - confirm whether replace=False was intended.
        df_train = pd.concat([
            resample(sur, n_samples=min_size, random_state=seed),
            resample(die, n_samples=min_size, random_state=seed),
        ])
    elif DATA == 2:  # Upsampling
        max_size = df_train['Survived'].value_counts().max()
        # Seeded so that repeated runs of the same configuration give the same score.
        extra_rows = [
            group.sample(max_size - len(group), replace=True, random_state=seed)
            for _, group in df_train.groupby('Survived')
        ]
        df_train = pd.concat([df_train] + extra_rows)
    # Any other value: imbalanced data, used as is.

    # NOTE(review): balancing happens before train_test_split below, so copies
    # of one passenger can land on both sides of the split and inflate `score`.
    # Resampling duplicates index labels; give every row a unique label so the
    # index-aligned operations below are well defined.
    df_train = df_train.reset_index(drop=True)

    removed = ['Name', 'Ticket']
    if not SIBSP:
        removed.append('SibSp')
    if not PARCH:
        removed.append('Parch')

    if CABIN:
        # "C23 C25 C27" -> ['C', 'C', 'C']; missing cabin -> [] (all-zero row).
        cabin_letters = df_train['Cabin'].apply(
            lambda cabin: [] if pd.isna(cabin) else [token[0] for token in cabin.split()]
        )
        mlb = MultiLabelBinarizer()
        train_one_hot = pd.DataFrame(
            mlb.fit_transform(cabin_letters), columns=mlb.classes_, index=df_train.index
        )
        df_train = pd.concat([df_train.drop(columns=['Cabin']), train_one_hot], axis=1)
    else:
        removed.append('Cabin')

    if EMBARKED:
        # Column-scoped mapping: a frame-wide replace would also rewrite any
        # other cell that happens to equal 'S', 'C' or 'Q'.
        df_train['Embarked'] = df_train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    else:
        removed.append('Embarked')

    if AGE:
        # Raw string: '\.' in a plain string literal is an invalid escape sequence.
        titles = df_train['Name'].str.extract(r' ([A-Za-z]+)\. ', expand=False)
        title_age_median = df_train['Age'].groupby(titles).median()
        # Titles without a known median map to NaN, so those ages stay missing
        # and are handled by the mean imputer below.
        df_train['Age'] = df_train['Age'].fillna(titles.map(title_age_median))

    df_train = df_train.drop(columns=removed)
    df_train['Sex'] = df_train['Sex'].map({'male': 0, 'female': 1})

    features = df_train.drop(columns=['Survived', 'PassengerId'])
    X = SimpleImputer(missing_values=np.nan, strategy="mean").fit_transform(features)
    y = df_train['Survived']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return {
        "dataConfig": f"DATA:{DATA} CABIN:{CABIN} AGE:{AGE} EMBARKED:{EMBARKED} SIBSP:{SIBSP} PARCH:{PARCH}",
        "name": name,
        "score": accuracy_score(y_test, y_pred),
        "model": clf,
    }

In [None]:
def predict(original_df_train, original_df_test, DATA, CABIN, AGE, EMBARKED, SIBSP, PARCH, name, clf):
    """Predict ``Survived`` for the test frame and write the result to ``prediction.csv``.

    The training frame is preprocessed again with the given flags, because the
    cabin one-hot columns, the per-title age medians and the imputation means
    are all derived from it. The test frame is then encoded with exactly those
    training-derived values.

    Args:
        original_df_train: Labelled passengers (same columns as for ``train``).
            Deep-copied; the caller's frame is not modified.
        original_df_test: Unlabelled passengers with the same columns except
            ``Survived``. Deep-copied; the caller's frame is not modified.
        DATA: Class-balancing mode applied to the training frame (``1``
            downsample, ``2`` upsample, anything else none).
        CABIN: One-hot encode cabin letters (true) or drop ``Cabin`` (false).
        AGE: Fill missing ages with the training median age per title.
        EMBARKED: Encode ``Embarked`` as S=0, C=1, Q=2 (true) or drop it (false).
        SIBSP: Keep (true) or drop (false) ``SibSp``.
        PARCH: Keep (true) or drop (false) ``Parch``.
        name: Not used by this function; kept for signature parity with ``train``.
        clf: Already fitted estimator exposing ``predict`` - presumably the
            ``model`` returned by ``train`` for the same flags, since the
            feature columns depend on them.

    Returns:
        pandas.DataFrame with the columns ``PassengerId`` and ``Survived``;
        the same table is written to ``prediction.csv`` without the index.

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    seed = 42
    df_train = original_df_train.copy(deep=True)
    df_test = original_df_test.copy(deep=True)

    if DATA == 1:  # Downsampling
        sur = df_train[df_train.Survived == 1]
        die = df_train[df_train.Survived == 0]
        min_size = min(len(sur), len(die))
        # NOTE(review): `resample` is called without `replace=`; per the
        # scikit-learn docs it then samples WITH replacement - confirm intent.
        df_train = pd.concat([
            resample(sur, n_samples=min_size, random_state=seed),
            resample(die, n_samples=min_size, random_state=seed),
        ])
    elif DATA == 2:  # Upsampling
        max_size = df_train['Survived'].value_counts().max()
        # Seeded so that repeated calls derive the same medians and means.
        extra_rows = [
            group.sample(max_size - len(group), replace=True, random_state=seed)
            for _, group in df_train.groupby('Survived')
        ]
        df_train = pd.concat([df_train] + extra_rows)
    # Any other value: imbalanced data, used as is.

    # Resampling duplicates index labels; make them unique again so the
    # index-aligned operations below are well defined.
    df_train = df_train.reset_index(drop=True)

    removed = ['Name', 'Ticket']
    if not SIBSP:
        removed.append('SibSp')
    if not PARCH:
        removed.append('Parch')

    if CABIN:
        def extract_cabin_letters(cabin):
            # "C23 C25 C27" -> ['C', 'C', 'C']; missing cabin -> [] (all-zero row).
            if pd.isna(cabin):
                return []
            return [token[0] for token in cabin.split()]

        mlb = MultiLabelBinarizer()
        train_one_hot = pd.DataFrame(
            mlb.fit_transform(df_train['Cabin'].apply(extract_cabin_letters)),
            columns=mlb.classes_, index=df_train.index,
        )
        # The binarizer is only transformed (not refit) on the test cabins so
        # both frames get the same one-hot columns.
        test_one_hot = pd.DataFrame(
            mlb.transform(df_test['Cabin'].apply(extract_cabin_letters)),
            columns=mlb.classes_, index=df_test.index,
        )
        df_train = pd.concat([df_train.drop(columns=['Cabin']), train_one_hot], axis=1)
        df_test = pd.concat([df_test.drop(columns=['Cabin']), test_one_hot], axis=1)
    else:
        removed.append('Cabin')

    if EMBARKED:
        # One shared mapping: previously the test frame encoded 'Q' as 3 while
        # the training frame used 2, so the model saw an unknown port code.
        embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
        df_train['Embarked'] = df_train['Embarked'].map(embarked_codes)
        df_test['Embarked'] = df_test['Embarked'].map(embarked_codes)
    else:
        removed.append('Embarked')

    if AGE:
        # Raw string: '\.' in a plain string literal is an invalid escape sequence.
        title_pattern = r' ([A-Za-z]+)\. '
        train_titles = df_train['Name'].str.extract(title_pattern, expand=False)
        test_titles = df_test['Name'].str.extract(title_pattern, expand=False)
        # Medians come from the training rows only and are applied to both frames.
        title_age_median = df_train['Age'].groupby(train_titles).median()
        df_test['Age'] = df_test['Age'].fillna(test_titles.map(title_age_median))
        df_train['Age'] = df_train['Age'].fillna(train_titles.map(title_age_median))

    df_train = df_train.drop(columns=removed)
    df_test = df_test.drop(columns=removed)

    sex_codes = {'male': 0, 'female': 1}
    df_train['Sex'] = df_train['Sex'].map(sex_codes)
    df_test['Sex'] = df_test['Sex'].map(sex_codes)

    train_features = df_train.drop(columns=['Survived', 'PassengerId'])
    # Select the test columns in training order so feature positions line up.
    test_features = df_test[list(train_features.columns)]

    # Fit the imputer on the training features and reuse its means for the
    # test frame, instead of fitting a second imputer on the test data.
    imp = SimpleImputer(missing_values=np.nan, strategy="mean").fit(train_features)
    X_submission = imp.transform(test_features)

    y_pred = clf.predict(X_submission)

    result = pd.DataFrame({
        "PassengerId": df_test['PassengerId'],
        "Survived": y_pred
    })
    result.to_csv('prediction.csv', index=False)

    return result

In [None]:
# Preprocessing options explored by the grid search; every combination is
# passed to train() once per registered classifier.
DATA = [0, 1, 2]  # 1: downsampling, 2: upsampling, anything else: data as is
CABIN = [True, False]
AGE = [True, False]
EMBARKED = [True, False]
SIBSP = [True, False]
PARCH = [True, False]
cls = {
        "HistGradientBoostingClassifier": HistGradientBoostingClassifier(),
        # "RandomForestClassifier": RandomForestClassifier(max_depth=2, random_state=0),
        # "LogisticRegression": LogisticRegression(random_state=0, max_iter=1000),
        # "SVC": svm.SVC(kernel='linear'),
    }

result_data = []
# product() iterates the right-most option fastest, i.e. in the same order as
# six nested for-loops over DATA ... PARCH.
for data, cabin, age, embarked, sibsp, parch in product(DATA, CABIN, AGE, EMBARKED, SIBSP, PARCH):
    for name, clf in cls.items():
        print(data, cabin, age, embarked, sibsp, parch)
        # Use the classifier registered in `cls` (the loop variables used to be
        # ignored in favour of a hard-coded estimator). clone() yields a fresh
        # unfitted copy so each result keeps its own fitted model.
        result_data.append(
            train(original_df_train, original_df_test, data, cabin, age, embarked, sibsp, parch, name, clone(clf))
        )

# train(original_df_train, original_df_test, 1, True, True, True, True, True, "HistGradientBoostingClassifier", cls["HistGradientBoostingClassifier"])

0 True True True True True
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')
0 True True True True False
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare',
       'Embarked', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')
0 True True True False True
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'Parch', 'Fare',
       'Embarked', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')
0 True True True False False
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')
0 True True False True True
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')
0 True True False True False
Index(['PassengerId', 'Survived', 'Pclass', 'Sex'

In [79]:
# sns.heatmap(df_train.corr(), square=True)
# plt.show()

In [80]:
# Reorder the grid-search runs in place, highest hold-out accuracy first,
# then display the four best.
result_data[:] = sorted(result_data, key=lambda run: run["score"], reverse=True)
result_data[:4]

[{'dataConfig': 'DATA:1 CABIN:True AGE:True EMBARKED:False SIBSP:True PARCH:True',
  'name': 'HistGradientBoostingClassifier',
  'score': 0.9051094890510949,
  'model': HistGradientBoostingClassifier()},
 {'dataConfig': 'DATA:2 CABIN:False AGE:False EMBARKED:False SIBSP:True PARCH:True',
  'name': 'HistGradientBoostingClassifier',
  'score': 0.9,
  'model': HistGradientBoostingClassifier()},
 {'dataConfig': 'DATA:2 CABIN:False AGE:False EMBARKED:False SIBSP:True PARCH:False',
  'name': 'HistGradientBoostingClassifier',
  'score': 0.9,
  'model': HistGradientBoostingClassifier()},
 {'dataConfig': 'DATA:1 CABIN:False AGE:True EMBARKED:False SIBSP:False PARCH:True',
  'name': 'HistGradientBoostingClassifier',
  'score': 0.8978102189781022,
  'model': HistGradientBoostingClassifier()}]

In [81]:
# Classifiers compared on one fixed preprocessing configuration.
cls = {
        "HistGradientBoostingClassifier": HistGradientBoostingClassifier(),
        "RandomForestClassifier": RandomForestClassifier(max_depth=2, random_state=0),
        "LogisticRegression": LogisticRegression(random_state=0, max_iter=1000),
        "SVC": svm.SVC(kernel='linear'),
    }
result_clf = []
for name, clf in cls.items():
    print(name, clf)
    # Keyword arguments spell out which flag each literal belongs to.
    outcome = train(
        original_df_train,
        original_df_test,
        DATA=2,
        CABIN=True,
        AGE=False,
        EMBARKED=True,
        SIBSP=True,
        PARCH=False,
        name=name,
        clf=clf,
    )
    result_clf.append(outcome)

HistGradientBoostingClassifier HistGradientBoostingClassifier()
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare',
       'Embarked', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')
RandomForestClassifier RandomForestClassifier(max_depth=2, random_state=0)
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare',
       'Embarked', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')
LogisticRegression LogisticRegression(max_iter=1000, random_state=0)
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare',
       'Embarked', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')
SVC SVC(kernel='linear')
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare',
       'Embarked', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')


KeyboardInterrupt: 

In [None]:
# Reorder the classifier comparison in place, highest hold-out accuracy first,
# then display the four best.
result_clf[:] = sorted(result_clf, key=lambda run: run["score"], reverse=True)
result_clf[:4]

[{'dataConfig': 'DATA:2 CABIN:True AGE:False EMBARKED:True SIBSP:True PARCH:False',
  'name': 'HistGradientBoostingClassifier',
  'score': 0.8727272727272727,
  'model': HistGradientBoostingClassifier()},
 {'dataConfig': 'DATA:2 CABIN:True AGE:False EMBARKED:True SIBSP:True PARCH:False',
  'name': 'RandomForestClassifier',
  'score': 0.8181818181818182,
  'model': RandomForestClassifier(max_depth=2, random_state=0)},
 {'dataConfig': 'DATA:2 CABIN:True AGE:False EMBARKED:True SIBSP:True PARCH:False',
  'name': 'SVC',
  'score': 0.8181818181818182,
  'model': SVC(kernel='linear')},
 {'dataConfig': 'DATA:2 CABIN:True AGE:False EMBARKED:True SIBSP:True PARCH:False',
  'name': 'LogisticRegression',
  'score': 0.8045454545454546,
  'model': LogisticRegression(max_iter=1000, random_state=0)}]

NameError: name 'X_submission' is not defined