In [20]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

train_df = pd.read_csv('E:\data\kaggle_tabular/train.csv')
test_df = pd.read_csv('E:\data\kaggle_tabular/test.csv')
submission = pd.read_csv('E:\data\kaggle_tabular/sample_submission.csv')

https://www.kaggle.com/hiro5299834/tps-apr-2021-pseudo-labeling-voting-ensemble

앞으로 해볼 것    
K-fold    
GridSearchCV    
best_columns를 구할 수 있는 게 있었던 거 같은데 - from sklearn.feature_selection import SelectFromModel    

# data 전처리

* Pclass - a proxy for socio-economic status (SES) where 1st = Upper, 2nd = Middle and 3rd = Lower.
* Sex - male and female.
* Age - fractional if it less than 1 and age estimation in the form of xx.5.
* SibSp - number of siblings / spouses aboard the Synthanic; siblings are brother, sister, stepbrother and stepsister and spouses are husband and wife (mistresses and fiancés were ignored).
* Parch - # of parents / children aboard the Synthanic; parents are mother and father; child are daughter, son, stepdaughter and stepson. Some children travelled only with a nanny, therefore Parch is 0 for them.
* Fare - the paassenger fare.
* Cabin - the cabin number.
* Emarked - port of embarkation where C is Cherbourg, Q is Queenstown and S is Southampton.
* Ticket - ticket number.
* Name - passengers name.
* Survived - target variable where 0 is not survived and 1 is survived.

* 결측값 채우기

In [21]:
print('Missing values per columns in train dataset')
for col in train_df.columns:
    temp_col = train_df[col].isnull().sum()
    print(f'{col}: {temp_col}')

Missing values per columns in train dataset
PassengerId: 0
Survived: 0
Pclass: 0
Name: 0
Sex: 0
Age: 3292
SibSp: 0
Parch: 0
Ticket: 4623
Fare: 134
Cabin: 67866
Embarked: 250


In [22]:
# col = 'Age' 평균 나이로 결측값 채우기
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())

In [23]:
# col = 'Ticket', 결측값 'X'로 채우기
train_df['Ticket'] = train_df['Ticket'].fillna('X').map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')
test_df['Ticket'] = test_df['Ticket'].fillna('X').map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')
train_df['Ticket'][:100]

0      X
1      X
2     CA
3     A.
4      X
      ..
95     X
96     X
97    PC
98     X
99     X
Name: Ticket, Length: 100, dtype: object

In [24]:
# col = 'Cabin', 결측값 'X'로 채우기
train_df['Cabin'] = train_df['Cabin'].fillna('X').map(lambda x: x[0].strip())
test_df['Cabin'] = test_df['Cabin'].fillna('X').map(lambda x: x[0].strip())
train_df['Cabin'][:100]

0     C
1     X
2     X
3     X
4     X
     ..
95    X
96    X
97    D
98    X
99    C
Name: Cabin, Length: 100, dtype: object

In [25]:
# col = 'Embarked', 결측값 'X'로 채우기
train_df['Embarked'] = train_df['Embarked'].fillna('X')
test_df['Embarked'] = test_df['Embarked'].fillna('X')
train_df['Embarked'][:100]

0     S
1     S
2     S
3     S
4     S
     ..
95    S
96    Q
97    Q
98    S
99    C
Name: Embarked, Length: 100, dtype: object

In [26]:
# col = 'Fare', 결측값 각 Pclass 마다 fare 중앙값을 넣는다.
fare_map =  train_df[['Fare', 'Pclass']].dropna().groupby('Pclass').median().to_dict()
# fare_map    # {'Fare': {1: 63.58, 2: 22.72, 3: 10.96}}
train_df['Fare'] = train_df['Fare'].fillna(train_df['Pclass'].map(fare_map['Fare']))
train_df['Fare'] = np.log1p(train_df['Fare'])
train_df['Fare'][:100]

0     3.337192
1     2.663750
2     4.280686
3     2.641910
4     2.170196
        ...   
95    3.503754
96    3.312730
97    4.884921
98    3.363149
99    5.633217
Name: Fare, Length: 100, dtype: float64

In [27]:
# col = 'Fare', 결측값 각 Pclass 마다 fare 중앙값을 넣는다.
fare_map =  test_df[['Fare', 'Pclass']].dropna().groupby('Pclass').median().to_dict()
# fare_map    # {'Fare': {1: 80.68, 2: 14.455, 3: 11.52}}
test_df['Fare'] = test_df['Fare'].fillna(test_df['Pclass'].map(fare_map['Fare']))
test_df['Fare'] = np.log1p(test_df['Fare'])
test_df['Fare'][:100]

0     4.159039
1     1.918392
2     3.686627
3     2.634045
4     3.328268
        ...   
95    2.249184
96    2.621766
97    1.969906
98    2.536866
99    3.469479
Name: Fare, Length: 100, dtype: float64

* 앞 이름만 가져오기

In [28]:
train_df['Name'] = train_df['Name'].map(lambda x: x.split(',')[0])
test_df['Name'] = test_df['Name'].map(lambda x: x.split(',')[0])
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,Oconnor,male,38.355472,2,0,X,3.337192,C,S
1,1,0,3,Bryan,male,38.355472,0,0,X,2.663750,X,S
2,2,0,3,Owens,male,0.330000,1,2,CA,4.280686,X,S
3,3,0,3,Kramer,male,19.000000,0,0,A.,2.641910,X,S
4,4,1,3,Bond,male,25.000000,0,0,X,2.170196,X,S
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,1,2,Bell,female,62.000000,0,0,PC,2.763800,D,C
99996,99996,0,2,Brown,male,66.000000,0,0,X,2.497329,X,S
99997,99997,0,3,Childress,male,37.000000,0,0,X,2.393339,X,S
99998,99998,0,3,Caughlin,male,51.000000,0,1,X,3.463233,X,S


* 'Survived' 컬럼을 맨 뒤로 보내기

In [29]:
train_df['y_target'] = train_df['Survived']
train_df = train_df.drop(['Survived'], axis=1)
train_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,y_target
0,0,1,Oconnor,male,38.355472,2,0,X,3.337192,C,S,1
1,1,3,Bryan,male,38.355472,0,0,X,2.663750,X,S,0
2,2,3,Owens,male,0.330000,1,2,CA,4.280686,X,S,0
3,3,3,Kramer,male,19.000000,0,0,A.,2.641910,X,S,0
4,4,3,Bond,male,25.000000,0,0,X,2.170196,X,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,2,Bell,female,62.000000,0,0,PC,2.763800,D,C,1
99996,99996,2,Brown,male,66.000000,0,0,X,2.497329,X,S,0
99997,99997,3,Childress,male,37.000000,0,0,X,2.393339,X,S,0
99998,99998,3,Caughlin,male,51.000000,0,1,X,3.463233,X,S,0


* 문자들을 숫자로 치환하기

In [30]:
def label_encoder(c):
    encoder = LabelEncoder()
    return encoder.fit_transform(c)

labels_col = ['Sex','Ticket']
label_encoded_df = train_df[labels_col].apply(label_encoder)
test_label_encoded_df = test_df[labels_col].apply(label_encoder)
# label_encoded_df

onehot_cols = ['Cabin', 'Embarked']
onehot_encoded_df = pd.get_dummies(train_df[onehot_cols])
test_onehot_encoded_df = pd.get_dummies(test_df[onehot_cols])
# onehot_encoded_df

numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare']
scaler = MinMaxScaler()
numerical_df = pd.DataFrame(scaler.fit_transform(train_df[numerical_cols]), columns=numerical_cols)
test_numerical_df = pd.DataFrame(scaler.fit_transform(test_df[numerical_cols]), columns=numerical_cols)
# numerical_df

etc_df = train_df['Pclass']
test_etc_df = test_df['Pclass']

target_df = train_df['y_target']


train_df = pd.concat([etc_df, numerical_df, label_encoded_df, onehot_encoded_df, target_df], axis=1)
test_df = pd.concat([test_etc_df, test_numerical_df, test_label_encoded_df, test_onehot_encoded_df], axis=1)

최종적으로 사용할 데이터 셋

In [31]:
train_df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Ticket,Cabin_A,Cabin_B,Cabin_C,...,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_X,Embarked_C,Embarked_Q,Embarked_S,Embarked_X,y_target
0,1,0.440353,0.250,0.000000,0.462375,1,49,0,0,1,...,0,0,0,0,0,0,0,1,0,1
1,3,0.440353,0.000,0.000000,0.351893,1,49,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,3,0.002876,0.125,0.222222,0.617161,1,14,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,3,0.217671,0.000,0.000000,0.348310,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,3,0.286700,0.000,0.000000,0.270923,1,49,0,0,0,...,0,0,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2,0.712379,0.000,0.000000,0.368307,0,21,0,0,0,...,0,0,0,0,0,1,0,0,0,1
99996,2,0.758399,0.000,0.000000,0.324591,1,49,0,0,0,...,0,0,0,0,1,0,0,1,0,0
99997,3,0.424758,0.000,0.000000,0.307531,1,49,0,0,0,...,0,0,0,0,1,0,0,1,0,0
99998,3,0.585826,0.000,0.111111,0.483053,1,49,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [32]:
test_df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Ticket,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_X,Embarked_C,Embarked_Q,Embarked_S,Embarked_X
0,3,0.233811,0.000,0.000000,0.634709,1,49,0,0,0,0,0,0,0,0,1,0,0,1,0
1,3,0.653979,0.000,0.000000,0.288706,0,49,0,0,0,0,0,0,0,0,1,0,0,1,0
2,1,0.233811,0.000,0.000000,0.561759,0,49,0,1,0,0,0,0,0,0,0,1,0,0,0
3,2,0.307958,0.000,0.000000,0.399218,1,49,0,0,0,0,0,0,0,0,1,0,0,1,0
4,1,0.209095,0.000,0.222222,0.506421,0,49,0,1,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3,0.332674,0.000,0.000000,0.364427,0,49,0,0,0,0,0,0,0,0,1,0,1,0,0
99996,1,0.728127,0.125,0.000000,0.646993,1,49,0,0,0,0,0,0,0,0,1,0,0,1,0
99997,3,0.579832,0.000,0.000000,0.374506,1,49,0,0,0,0,0,0,0,0,1,0,0,1,0
99998,1,0.604548,0.125,0.222222,0.521143,0,21,0,1,0,0,0,0,0,0,0,1,0,0,0


# 1. DATA

In [33]:
N_ESTIMATORS = 1000
N_SPLITS = 10
SEED = 2021
EARLY_STOPPING_ROUNDS = 20
VERBOSE = 1

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

In [34]:
# 훈련 데이터 셋
x = train_df.drop(['y_target'], axis=1)
y = train_df['y_target']
x = x.to_numpy()
y = y.to_numpy()
print(x.shape, y.shape)

(100000, 20) (100000,)


In [35]:
# 예측 데이터 셋
x_pred = test_df
x_pred = x_pred.to_numpy()
print(x_pred.shape)

(100000, 20)


In [36]:
# x_train, x_val, y_train, y_val = train_test_split(x, y, train_size=0.8, shuffle=True, random_state=42)
# print(x_train.shape, x_val.shape)   # (80000, 20) (20000, 20)
# print(y_train.shape, y_val.shape)   # (80000,) (20000,)

# 2. Model

In [38]:
import lightgbm as lgb

params = {
    'metric': 'binary_logloss',
    'n_estimators': N_ESTIMATORS,
    'objective': 'binary',
    'random_state': SEED,
    'learning_rate': 0.01,
    'min_child_samples': 150,
    'reg_alpha': 3e-5,
    'reg_lambda': 9e-2,
    'num_leaves': 20,
    'max_depth': 16,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 2,
    'max_bin': 240,
}

#KFold
for fold, (train_idx, valid_idx) in enumerate(skf.split(x, y)) :
    print(f"=====Fold {fold}=====")
    x_train = x[train_idx]
    x_val = x[valid_idx]
    y_train = y[train_idx]
    y_val = y[valid_idx]  
    # print(x_train.shape, x_val.shape)  # (90000, 20) (10000, 20)
    # print(y_train.shape, y_val.shape)  # (90000,) (10000,)
    

    model = lgb.LGBMClassifier(**params)
    model.fit(x_train, y_train,
            eval_set=[(x_val, y_val)],
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE)

    y_val_pred = model.predict(x_val)
    acc_score = accuracy_score(y_val, y_val_pred)
    print(f"===== ACCURACY SCORE {acc_score:.6f} =====")    # 0.776500 

acc_score = accuracy_score(y_val, y_val_pred)
print(f"===== ACCURACY SCORE {acc_score:.6f} =====")    # 0.776500 

ogloss: 0.474263
[436]	valid_0's binary_logloss: 0.474252
[437]	valid_0's binary_logloss: 0.474222
[438]	valid_0's binary_logloss: 0.474203
[439]	valid_0's binary_logloss: 0.474197
[440]	valid_0's binary_logloss: 0.474177
[441]	valid_0's binary_logloss: 0.474157
[442]	valid_0's binary_logloss: 0.474142
[443]	valid_0's binary_logloss: 0.474132
[444]	valid_0's binary_logloss: 0.474119
[445]	valid_0's binary_logloss: 0.474104
[446]	valid_0's binary_logloss: 0.474081
[447]	valid_0's binary_logloss: 0.474073
[448]	valid_0's binary_logloss: 0.474061
[449]	valid_0's binary_logloss: 0.474045
[450]	valid_0's binary_logloss: 0.474031
[451]	valid_0's binary_logloss: 0.474021
[452]	valid_0's binary_logloss: 0.47401
[453]	valid_0's binary_logloss: 0.474004
[454]	valid_0's binary_logloss: 0.473987
[455]	valid_0's binary_logloss: 0.473967
[456]	valid_0's binary_logloss: 0.473949
[457]	valid_0's binary_logloss: 0.473937
[458]	valid_0's binary_logloss: 0.473921
[459]	valid_0's binary_logloss: 0.473903


# 3. Predict

In [39]:
y_pred = model.predict(x_pred)
y_pred.shape

(100000,)

In [41]:
submission['Survived'] = y_pred
submission

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,1
2,100002,1
3,100003,0
4,100004,1
...,...,...
99995,199995,1
99996,199996,0
99997,199997,0
99998,199998,1


# 4.Submission

In [42]:
submission.to_csv("E:\\data\\kaggle_tabular\\submission_0425_lgbm.csv", index = False)
submission

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,1
2,100002,1
3,100003,0
4,100004,1
...,...,...
99995,199995,1
99996,199996,0
99997,199997,0
99998,199998,1


---
파일명 : submission_0425_lgbm.csv     
score : 0.80112