## 2.1 2회차 기출 유형

https://www.kaggle.com/code/agileteam/tutorial-t2-2-python/data

### 전자 상거래 배송 데이터 (Train.csv)

- 제품 배송 시간에 맞춰 배송되었는지 예측모델 만들기
- 학습용 데이터 (X_train, y_train)을 이용하여 배송 예측 모형을 만든 후, 이를 평가용 데이터(X_test)에 적용하여 얻은 예측값을 다음과 같은 형식의 CSV파일로 생성하시오(제출한 모델의 성능은 ROC-AUC 평가지표에 따라 채점)

- 성능이 우수한 예측모형을 구축하기 위해서는 적절한 데이터 전처리, 피처엔지니어링, 분류알고리즘, 하이퍼파라미터 튜닝, 모형 앙상블 등이 수반되어야 한다.
- 수험번호.csv파일이 만들어지도록 코드를 제출한다.
- 제출한 모델의 성능은 ROC-AUC형태로 읽어드린다.

In [1]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8799, 11), (2200, 11), (8799, 2), (2200, 2))

In [2]:
X_train.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
3999,4000,B,Ship,3,4,134,3,high,F,6,5680
9559,9560,F,Ship,4,3,173,3,medium,M,5,5331
2649,2650,B,Ship,2,1,192,3,high,M,46,3206
4843,4844,F,Ship,6,5,284,4,medium,M,8,5346
9601,9602,F,Flight,3,1,246,3,low,F,10,4707


In [3]:
y_train.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
3999,4000,0
9559,9560,1
2649,2650,1
4843,4844,1
9601,9602,1


In [20]:
X_train.describe()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
count,8799.0,8799.0,8799.0,8799.0,8799.0,8799.0,8799.0,8799.0,8799.0,8799.0
mean,2.327878,1.50733,4.050006,2.993295,210.087169,3.564723,1.342539,0.493579,13.510399,3616.20366
std,1.49581,0.761162,1.140903,1.411744,47.937049,1.520299,0.63249,0.499987,16.305299,1635.051213
min,0.0,0.0,2.0,1.0,96.0,2.0,0.0,0.0,1.0,1001.0
25%,1.0,1.0,3.0,2.0,169.0,3.0,1.0,0.0,4.0,1828.0
50%,2.0,2.0,4.0,3.0,214.0,3.0,1.0,0.0,7.0,4133.0
75%,4.0,2.0,5.0,4.0,250.0,4.0,2.0,1.0,10.0,5034.5
max,4.0,2.0,7.0,5.0,310.0,10.0,2.0,1.0,65.0,7846.0


In [4]:
X_train.isnull().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64

In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8799 entries, 3999 to 9332
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   8799 non-null   int64 
 1   Warehouse_block      8799 non-null   object
 2   Mode_of_Shipment     8799 non-null   object
 3   Customer_care_calls  8799 non-null   int64 
 4   Customer_rating      8799 non-null   int64 
 5   Cost_of_the_Product  8799 non-null   int64 
 6   Prior_purchases      8799 non-null   int64 
 7   Product_importance   8799 non-null   object
 8   Gender               8799 non-null   object
 9   Discount_offered     8799 non-null   int64 
 10  Weight_in_gms        8799 non-null   int64 
dtypes: int64(7), object(4)
memory usage: 824.9+ KB


In [6]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

encoder.fit(X_train['Warehouse_block'])
X_train['Warehouse_block'] = encoder.transform(X_train['Warehouse_block'])
X_test['Warehouse_block'] = encoder.transform(X_test['Warehouse_block'])

encoder.fit(X_train['Mode_of_Shipment'])
X_train['Mode_of_Shipment'] = encoder.transform(X_train['Mode_of_Shipment'])
X_test['Mode_of_Shipment'] = encoder.transform(X_test['Mode_of_Shipment'])

encoder.fit(X_train['Product_importance'])
X_train['Product_importance'] = encoder.transform(X_train['Product_importance'])
X_test['Product_importance'] = encoder.transform(X_test['Product_importance'])

encoder.fit(X_train['Gender'])
X_train['Gender'] = encoder.transform(X_train['Gender'])
X_test['Gender'] = encoder.transform(X_test['Gender'])

In [7]:
X_train.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
3999,4000,1,2,3,4,134,3,0,0,6,5680
9559,9560,4,2,4,3,173,3,2,1,5,5331
2649,2650,1,2,2,1,192,3,0,1,46,3206
4843,4844,4,2,6,5,284,4,2,1,8,5346
9601,9602,4,0,3,1,246,3,1,0,10,4707


In [8]:
X_train_id = X_train.pop('ID')
X_test_id = X_test.pop('ID')

In [9]:
from sklearn.model_selection import train_test_split
X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train, stratify=y_train['Reached.on.Time_Y.N'], random_state=42)

In [10]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_tra, y_tra['Reached.on.Time_Y.N'])

RandomForestClassifier()

In [11]:
pred_val = model.predict(X_val)

In [12]:
from sklearn.metrics import roc_auc_score
roc_auc_score(pred_val, y_val['Reached.on.Time_Y.N'])

0.6620972300734879

In [None]:
model.fit(X_train, y_train['Reached.on.Time_Y.N'])
pred_train = model.predict(X_train)
roc_auc_score(pred_train, y_train['Reached.on.Time_Y.N'])

In [None]:
pred = model.predict(X_test)

In [None]:
pred = pd.DataFrame({'ID': X_test_id, 'Reached.on.Time_Y.N': pred})

In [None]:
pred.to_csv('submission.csv', index=False)

## 2.2 공식 예제

https://www.kaggle.com/code/agileteam/t2-exercise-tutorial-baseline/notebook

### 백화점 고객의 1년간 구매 데이터

- 고객 3,500명에 대한 학습용 데이터를 이용하여 성별예측 모형을 만든 후,
- 이를 평가용 데이터에 적용하여 얻은 2,482명 고객의 성별 예측값(남자일 확률)을 다음과 같은 형식의 CSV 파일로 생성하시오.
- 0:여자, 1:남자

In [None]:
import pandas as pd
X_train = pd.read_csv("data/X_train.csv", encoding="euc-kr") # 구름 IDE환경에서는 encoding="euc-kr"가 없어도 됨
X_test = pd.read_csv("data/X_test.csv", encoding="euc-kr")
y_train = pd.read_csv("data/y_train.csv", encoding="euc-kr")

X_train.head()

In [None]:
y_train.head()

In [None]:
X_train.isnull().sum()

In [None]:
X_train['환불금액'] = X_train['환불금액'].fillna(0)
X_test['환불금액'] = X_test['환불금액'].fillna(0)

In [None]:
X_train.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

encoder.fit(X_train['주구매상품'])
X_train['주구매상품'] = encoder.transform(X_train['주구매상품'])
X_test['주구매상품'] = encoder.transform(X_test['주구매상품'])

encoder.fit(X_train['주구매지점'])
X_train['주구매지점'] = encoder.transform(X_train['주구매지점'])
X_test['주구매지점'] = encoder.transform(X_test['주구매지점'])

In [None]:
X_train_id = X_train.pop('cust_id')
X_test_id = X_test.pop('cust_id')

In [None]:
from sklearn.model_selection import train_test_split
X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train['gender'], stratify=y_train['gender'], random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, max_depth=5)
model.fit(X_tra, y_tra)

In [None]:
pred_val = model.predict(X_val)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(pred_val, y_val)

In [None]:
model.fit(X_train, y_train['gender'])
pred_train = model.predict(X_train)
roc_auc_score(pred_train, y_train['gender'])

In [None]:
pred = model.predict(X_test)
submission = pd.DataFrame({'cust_id' : X_test_id, 'gender':pred})
submission.to_csv('data/submission.csv', index=False)

## 3회차 기출 유형(1)

https://www.kaggle.com/code/agileteam/3rd-type2-3-2-baseline/notebook

- 여행 보험 패키지 상품을 구매할 확률 값을 구하시오
- 예측할 값(y): TravelInsurance (여행보험 패지지를 구매 했는지 여부 0:구매안함, 1:구매)
- 평가: roc-auc 평가지표
- data: t2-1-train.csv, t2-1-test.csv
- 제출형식
    - id,TravelInsurance

In [None]:
import pandas as pd
X_train = pd.read_csv('t2-1-train.csv')
X_test = pd.read_csv('t2-1-test.csv')

## 3회차 기출 유형(2)

https://www.kaggle.com/code/agileteam/t2-2-2-baseline-r2

- 대학원 입학 예측(회귀)
- 예측할 값(target): "Chance of Admit "
- 평가: r2
- data(3개): t2-2-X_train, t2-2-y_train, t2-2-X_test
- 제출 형식(Serial No.-> id, 예측 값 -> target)
- id,target

In [None]:
import pandas as pd
X_train = pd.read_csv('t2-2-X_train.csv')
X_test = pd.read_csv('t2-2-X_test.csv')
y_train = pd.read_csv('t2-2-y_train.csv')

X_train.head()

## T2-1. 타이타닉(Titanic) Simple Baseline

https://www.kaggle.com/code/agileteam/t2-1-titanic-simple-baseline/notebook

- 학습용 데이터 (X_train, y_train)을 이용하여 생존 예측 모형을 만든 후, 이를 평가용 데이터(X_test)에 적용하여 얻은 예측값을 다음과 같은 형식의 CSV파일로 생성하시오(제출한 모델의 성능은 accuracy 평가지표에 따라 채점)

In [None]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("titanic/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Survived', id_name='PassengerId')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
X_train.info()

In [None]:
X_train = X_train[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
X_test = X_test[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
X_train.info()

In [None]:
X_train.isnull().sum()

In [None]:
X_test.isnull().sum()

In [None]:
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].median())
X_test['Age'] = X_test['Age'].fillna(X_train['Age'].median())

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
X_train['Pclass'] = encoder.fit_transform(X_train['Pclass'])
X_train['Sex'] = encoder.fit_transform(X_train['Sex'])

X_test['Pclass'] = encoder.fit_transform(X_test['Pclass'])
X_test['Sex'] = encoder.fit_transform(X_test['Sex'])

In [None]:
X_train_id = X_train.pop('PassengerId')
X_test_id = X_test.pop('PassengerId')

In [None]:
from sklearn.model_selection import train_test_split
X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train[['Survived']], stratify=y_train[['Survived']], random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_tra, y_tra)

In [None]:
model.score(X_tra, y_tra)

In [None]:
model.score(X_val, y_val)

In [None]:
pred_val = model.predict(X_val)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(pred_val, y_val)

In [None]:
model.fit(X_train, y_train['Survived'])
model.score(X_train, y_train['Survived'])

In [None]:
pred = model.predict(X_test)
pred

In [None]:
submission = pd.DataFrame({'PassengerId': X_test_id, 'Survived': pred})
submission.head()

In [None]:
submission.to_csv('titanic/submission.csv', index=False)

## T2-5. Insurance Forecast

https://www.kaggle.com/code/agileteam/insurance-starter-tutorial/notebook

In [None]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("insurance.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='charges')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
X_train.info()

In [None]:
X_train.isnull().sum()

In [None]:
X_test.isnull().sum()

In [None]:
X_train = pd.get_dummies(X_train)
X_train

In [None]:
X_test = pd.get_dummies(X_test)

In [None]:
X_train_id = X_train.pop('id')
X_test_id = X_test.pop('id')

In [None]:
X_train.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train['charges'])

In [None]:
model.score(X_train, y_train['charges'])

In [None]:
pred_train = model.predict(X_train)

In [None]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(pred_train, y_train['charges']))

In [None]:
pred_test = model.predict(X_test)
np.sqrt(mean_squared_error(pred_test, y_test['charges']))

In [None]:
sub = pd.DataFrame({'id': X_test_id, 'charges': pred_test})
sub.to_csv('submission_insurance.csv', index=False)