In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("train.csv")

# Preprocessing

## Age

나이 결측치 Median으로 대체

In [3]:
df['Age'] = df['Age'].fillna(df['Age'].median())

## PassengerID

Passenger ID를 group과 group 내 ID로 분리

In [4]:
def split_passenger(X):
    X_ = X.copy()
    X_[['Passenger_group', 'Passenger_id_in_group']] = X_['PassengerId'].str.split('_', 1, expand=True)
    X_['Passenger_group'] = pd.to_numeric(X_['Passenger_group'])
    X_['Passenger_id_in_group'] = pd.to_numeric(X_['Passenger_id_in_group'])
    return X_

In [5]:
after_split_passenger_df = split_passenger(df)

## Passenger_group 이용한 결측치 대체

In [6]:
def fill_missing_values_with_another_group_member(df):
    df_ = df.copy()
    fill_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
    for col in fill_cols:
        df_.sort_values(col, inplace=True)
        df_[col] = df_.groupby(['Passenger_group'])[col].ffill()
    return df_

In [7]:
df = fill_missing_values_with_another_group_member(after_split_passenger_df)

## Cabin

In [8]:
def split_cabin(X):
    X_ = X.copy()
    X_[['Cabin_deck', 'Cabin_num', 'Cabin_side']] = X_['Cabin'].str.split('/', 2, expand=True)
    X_['Cabin_num'] = pd.to_numeric(X_['Cabin_num'])
    X_ = X_.drop('Cabin', axis = 1)
    return X_

Cabin 칼럼을 각 요소에 따라 분리

In [9]:
df = split_cabin(df)

## Cabin_num binning

In [10]:
from optbinning import OptimalBinning
variable = "Cabin_num"
optb = OptimalBinning(name=variable, dtype="numerical", solver="cp")
x = df["Cabin_num"].values
y = df["Transported"].values
optb.fit(x, y)

OptimalBinning(name='Cabin_num')

In [11]:
x_transform_indices = optb.transform(x, metric="indices")
df["Cabin_num_bin"] = x_transform_indices

# Logistic regression

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [13]:
y = df['Transported']
drop_columns = ['Name', 'Cabin_num', 'PassengerId', 'Passenger_group', 'Passenger_id_in_group']
X = df.drop(['Transported'] + drop_columns, axis = 1)

In [14]:
numerical_features = [3, 5, 6, 7, 8, 9]
multicategorical_features = [0, 2, 10, 11, 12]
categorical_features = [1, 4]

multicategorical_preprocessing = Pipeline(
    [
        ('cat', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ])
categorical_preprocessing = Pipeline(
    [
        ('cat', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ])
numerical_preprocessing = Pipeline(
    [
        ('scaler', MinMaxScaler()),
        ('impute', SimpleImputer(strategy="median")),
    ])

preprocessing = ColumnTransformer(
                    [
                        ('multicatecorical', multicategorical_preprocessing,
                         multicategorical_features),
                        ('catecorical', categorical_preprocessing,
                         categorical_features),
                        ('numerical', numerical_preprocessing,
                         numerical_features),
                    ])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 50)

In [16]:
lor = LogisticRegression(C = 150, max_iter=300)

In [17]:
clf = Pipeline([("preprocessing", preprocessing),
                ("lor", lor)])

In [18]:
a = clf.fit(X_train, y_train)

In [19]:
clf.score(X_train, y_train)

0.7929130234698574

In [20]:
clf.score(X_test, y_test)

0.795768169273229

In [21]:
a = clf.fit(X, y)

In [22]:
test_df = pd.read_csv("test.csv")
test_df = split_passenger(test_df)

In [23]:
test_ids = test_df["PassengerId"]

In [24]:
concat_df = pd.concat([test_df, after_split_passenger_df])
filled_concat_df = fill_missing_values_with_another_group_member(concat_df)
test_df = filled_concat_df[filled_concat_df["PassengerId"].isin(test_ids)]

In [25]:
test_df_cp = test_df.copy()
test_df_cp = split_cabin(test_df_cp)
test_df_cp['Cabin_num_bin'] = pd.cut(test_df_cp['Cabin_num'], bins=optb.splits)
test_X = test_df_cp.drop(drop_columns, axis = 1)

In [26]:
predicted = clf.predict(test_X)

In [27]:
d = {"PassengerId" : test_df["PassengerId"], "Transported" : predicted}

In [28]:
export_df = pd.DataFrame(d)

In [29]:
#export_df.to_csv("submission.csv", index = False)