In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("train.csv")

# Preprocessing

In [3]:
def plot_cont(colname, except_zero=False, data=None):
    """Overlay the KDE of one column for transported vs. not-transported rows.

    Parameters
    ----------
    colname : str
        Name of the column whose distribution is drawn.
    except_zero : bool, default False
        When True, only rows with a strictly positive value in ``colname``
        are plotted.
    data : pandas.DataFrame, optional
        Frame holding ``colname`` and a ``Transported`` column. Defaults to
        the notebook-level ``df``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    frame = df if data is None else data
    if except_zero:
        frame = frame[frame[colname] > 0]

    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    # Attach the label to each curve instead of passing a positional list to
    # ``legend``: if one group draws nothing, the remaining curve still gets
    # the right name.
    for flag, label in ((True, 'Transported'), (False, 'Not Transported')):
        subset = frame.loc[frame['Transported'] == flag, colname]
        sns.kdeplot(x=subset, label=label, ax=ax)
    ax.legend()
    plt.show()

## Age

나이 결측치 Median으로 대체

In [4]:
# Replace missing ages with the median age of the loaded frame.
# NOTE(review): the median is taken over the whole frame, before the
# train/test split performed later in this notebook.
df = df.fillna({'Age': df['Age'].median()})

## Cabin

In [5]:
def split_cabin(X):
    """Split the ``Cabin`` column into deck, number and side columns.

    Each ``Cabin`` value is split on ``/`` into at most three parts, stored
    as ``Cabin_deck``, ``Cabin_num`` (converted to a numeric dtype) and
    ``Cabin_side``. Missing cabins yield missing values in all three columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a string ``Cabin`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` without ``Cabin`` and with the three new columns appended.
    """
    X_ = X.copy()
    # ``n`` has to be a keyword: pandas 2.0 made every ``str.split`` argument
    # after ``pat`` keyword-only, so the positional form raises TypeError.
    parts = X_['Cabin'].str.split('/', n=2, expand=True)
    # Pad to exactly three columns so the assignment below also works when no
    # row contains all three parts (e.g. every cabin is missing).
    parts = parts.reindex(columns=range(3))
    X_[['Cabin_deck', 'Cabin_num', 'Cabin_side']] = parts
    X_['Cabin_num'] = pd.to_numeric(X_['Cabin_num'])
    return X_.drop('Cabin', axis=1)

Cabin 칼럼을 각 요소에 따라 분리

In [6]:
# Replace the raw Cabin column by its three components.
df = df.pipe(split_cabin)

## PassengerId

Passenger ID를 group과 group 내 ID로 분리

In [7]:
def split_passenger(X):
    """Split ``PassengerId`` into a group number and an in-group number.

    Each ``PassengerId`` value is split once on ``_``; both parts are
    converted to numeric columns ``Passenger_group`` and
    ``Passenger_id_in_group``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a string ``PassengerId`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` without ``PassengerId`` and with the two new columns
        appended.
    """
    X_ = X.copy()
    # ``n`` has to be a keyword: pandas 2.0 made every ``str.split`` argument
    # after ``pat`` keyword-only, so the positional form raises TypeError.
    parts = X_['PassengerId'].str.split('_', n=1, expand=True)
    X_[['Passenger_group', 'Passenger_id_in_group']] = parts
    for column in ('Passenger_group', 'Passenger_id_in_group'):
        X_[column] = pd.to_numeric(X_[column])
    return X_.drop('PassengerId', axis=1)

In [8]:
# Replace PassengerId by its group / in-group components.
df = df.pipe(split_passenger)

# Logistic regression

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [10]:
# Columns that are not fed to the model (besides the target itself).
drop_columns = ['Name', 'Cabin_num', 'Passenger_group', 'Passenger_id_in_group']

# Target vector and feature matrix.
y = df['Transported']
X = df.drop(columns=['Transported', *drop_columns])

In [11]:
# Positional column indices into ``X``, grouped by the preprocessing each group gets.
# NOTE(review): these positions depend on the column order of ``X`` — confirm they
# still match whenever the columns dropped or added upstream change.
numerical_features = [3, 5, 6, 7, 8, 9]
multicategorical_features = [0, 2, 10, 11]
categorical_features = [1, 4]


def _dense_onehot_encoder():
    """Build a dense-output ``OneHotEncoder`` that ignores unseen categories.

    scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed the
    old name, so the new keyword is tried first with a fallback for older releases.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older scikit-learn: ``sparse_output`` is not a known argument.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Multi-valued categoricals: fill gaps with the most frequent value, then one-hot encode.
multicategorical_preprocessing = Pipeline(
    [
        ('cat', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        ('onehot', _dense_onehot_encoder()),
    ])
# Remaining categoricals: only fill gaps with the most frequent value (no encoding step).
categorical_preprocessing = Pipeline(
    [
        ('cat', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ])
# Numerical columns: min-max scale first, then fill the remaining gaps with the median.
numerical_preprocessing = Pipeline(
    [
        ('scaler', MinMaxScaler()),
        ('impute', SimpleImputer(strategy="median")),
    ])

# Route each column group through its own pipeline. Transformer names are kept
# exactly as they were so existing parameter paths keep working.
preprocessing = ColumnTransformer(
                    [
                        ('multicatecorical', multicategorical_preprocessing,
                         multicategorical_features),
                        ('catecorical', categorical_preprocessing,
                         categorical_features),
                        ('numerical', numerical_preprocessing,
                         numerical_features),
                    ])

In [12]:
# Hold out part of the labelled data for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 50)

In [13]:
# Logistic-regression classifier; C=50 and a 300-iteration cap are set explicitly.
lor = LogisticRegression(
    max_iter=300,
    C=50,
)

In [14]:
# Full model: column-wise preprocessing followed by the logistic regression.
model_steps = [
    ("preprocessing", preprocessing),
    ("lor", lor),
]
clf = Pipeline(steps=model_steps)

In [15]:
# Fit preprocessing and classifier on the training split. The return value is
# bound to ``a`` so the cell does not display it.
a = clf.fit(X_train, y_train)

In [16]:
# Score of the fitted pipeline on the training split.
clf.score(X_train, y_train)

0.7933732167510354

In [17]:
# Score of the fitted pipeline on the held-out split.
clf.score(X_test, y_test)

0.781508739650414

In [18]:
# Load the unlabelled data to predict on; the path is relative to the working directory.
test_df = pd.read_csv("test.csv")

In [19]:
# Apply the same column splits as for the training frame, working on a copy so
# ``test_df`` keeps its original columns.
test_df_cp = (
    test_df
    .copy()
    .pipe(split_cabin)
    .pipe(split_passenger)
)
# Drop the columns that were also excluded from the training features.
test_X = test_df_cp.drop(columns=drop_columns)

In [20]:
# Predict the target for every row of the prepared test features.
predicted = clf.predict(test_X)

In [21]:
# Pair each original passenger id with its prediction.
d = dict(
    PassengerId=test_df["PassengerId"],
    Transported=predicted,
)

In [22]:
# Two-column frame holding the predictions.
export_df = pd.DataFrame(data=d)

In [23]:
# Export is disabled; uncomment the next line to write the predictions to disk.
# export_df.to_csv("submission.csv", index = False)