In [57]:
import pandas as pd

X_test = pd.read_csv("data/X_test.csv")
X_train = pd.read_csv("data/X_train.csv")
y_train = pd.read_csv("data/y_train.csv")

print(X_train.shape, X_test.shape, y_train.shape)
# 데이터 타입 확인
print(X_train.info())
print(X_test.info())
print(y_train.info())

# 결측치 확인
print(X_train.isnull().sum())
print(X_test.isnull().sum())
print(y_train.isnull().sum())

# 레이블(타겟) 확인
print(y_train['Reached.on.Time_Y.N'].value_counts())

n_X_train = X_train.select_dtypes(exclude = 'object').copy()
c_X_train = X_train.select_dtypes(include = 'object').copy()
n_X_test = X_test.select_dtypes(exclude = 'object').copy()
c_X_test = X_test.select_dtypes(include = 'object').copy()

# 수치형, 스케일링
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols = ['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Discount_offered', 'Weight_in_gms']
n_X_train[cols] = scaler.fit_transform(n_X_train[cols])
n_X_test[cols] = scaler.transform(n_X_test[cols])

# 범주형
c_X_train = pd.get_dummies(c_X_train)
c_X_test = pd.get_dummies(c_X_test)

# 분리한 데이터 합침
X_train = pd.concat([n_X_train, c_X_train], axis = 1)
X_test = pd.concat([n_X_test, c_X_test], axis = 1)
y_train = y_train['Reached.on.Time_Y.N']

# 검증 데이터 분리
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

# 모델링
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# 검증 평가
model1 = LogisticRegression()
model1.fit(X_tr, y_tr)
print('LR model score : ', model1.score(X_val, y_val))

model2 = KNeighborsClassifier()
model2.fit(X_tr, y_tr)
print('KNN model score : ', model2.score(X_val, y_val))

model3 = RandomForestClassifier(n_estimators=100)
model3.fit(X_tr, y_tr)
print('RF model score : ', model3.score(X_val, y_val))

model4 = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
model4.fit(X_tr, y_tr)
print('XGB model score : ', model4.score(X_val, y_val))

# roc-auc
from sklearn.metrics import roc_auc_score
pred = model3.predict(X_val)
print('roc_auc_score : ', roc_auc_score(y_val, pred))

# 정시 도착 여부(0)
pred_test = model3.predict_proba(X_test)
pred_test_prob = pd.DataFrame(pred_test[:,0], columns = ['gender'])
answer = pd.concat([X_test['ID'], pred_test_prob], axis = 1)

#answer.to_csv("test2.csv", index = False)

(8799, 11) (2200, 11) (8799, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8799 entries, 0 to 8798
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   8799 non-null   int64 
 1   Warehouse_block      8799 non-null   object
 2   Mode_of_Shipment     8799 non-null   object
 3   Customer_care_calls  8799 non-null   int64 
 4   Customer_rating      8799 non-null   int64 
 5   Cost_of_the_Product  8799 non-null   int64 
 6   Prior_purchases      8799 non-null   int64 
 7   Product_importance   8799 non-null   object
 8   Gender               8799 non-null   object
 9   Discount_offered     8799 non-null   int64 
 10  Weight_in_gms        8799 non-null   int64 
dtypes: int64(7), object(4)
memory usage: 756.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------

In [32]:
pred_test_prob = pd.DataFrame(pred_test[:,0], columns = ['gender'])
pred_test_prob

Unnamed: 0,gender
0,0.56
1,0.01
2,0.39
3,0.06
4,0.42
...,...
2195,0.29
2196,0.65
2197,0.43
2198,0.38


In [35]:
pd.concat([X_test['ID'], pred_test_prob], axis = 1)

Unnamed: 0,ID,gender
0,8800,0.56
1,8801,0.01
2,8802,0.39
3,8803,0.06
4,8804,0.42
...,...,...
2195,10995,0.29
2196,10996,0.65
2197,10997,0.43
2198,10998,0.38
