# 고객별 백화점 구매기록 데이터셋

- 모델의 목표 : 개별 고객이 앞으로 매장에서 **다시 구매를 할지 여부**를 예측하는 것입니다.

**'고객이 구매를 할지 안 할지'** 즉, **재방문하여 구매할 가능성이 있는지**를 예측

모델이 예측하는 결과는 보통 "구매한다(1)" 또는 "구매하지 않는다(0)"의 두 가지 가능성 중 하나

## 데이터 링크

- 미혼(1) or 기혼(0) 고객을 예측
- 학습용 구매기록 데이터
    - https://drive.google.com/file/d/1tg41qXu02FK55bFa8P1Gx2URhwPUCoQQ/view?usp=sharing
- 학습용 정답 데이터
    - https://drive.google.com/file/d/1-9AcU9nAoO4SzSmqdCOYTvWNpA5Pdt3q/view?usp=sharing
- 테스트용 구매기록 데이터
    - https://drive.google.com/file/d/1-AwDfGlHm9rNtpnHIWOK96jBJYy3f2SZ/view?usp=sharing
- 제출 양식 데이터
    - https://drive.google.com/file/d/1-Qv7SlsY5Eu3bRR7Z0IUJyLkOO1Fl3y6/view?usp=sharing

## 데이터 학습

- pivot table 이용하자!!!!! 정확도 오른다!!!

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- 데이터 경로 변수

In [2]:
DATA_PATH = "/content/drive/MyDrive/멋쟁이사자차럼/data/"
DATA_PATH

'/content/drive/MyDrive/멋쟁이사자차럼/data/'

- 데이터 불러오기

In [3]:
import pandas as pd
import numpy as np
train_tr = pd.read_csv("/content/drive/MyDrive/멋쟁이사자차럼/data/store_train_transactions.csv")
# train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

In [None]:
from IPython.display import display, HTML
br_html = HTML("<br><br>")
display(HTML("<h1>학습용 구매기록 데이터</h1>"), train_tr.head(), br_html)
display(HTML("<h1>학습용 정답 데이터</h1>"), train_target.head(), br_html)
display(HTML("<h1>테스트용 구매기록 데이터</h1>"), test_tr.head(), br_html)
display(HTML("<h1>제출 양식 데이터</h1>"), submit.head(), br_html)

In [None]:
display(HTML("<h1>학습용 구매기록 데이터</h1>"))
display(train_tr.info(), br_html)

display(HTML("<h1>학습용 정답 데이터</h1>"))
display(train_target.info(), br_html)

display(HTML("<h1>테스트용 구매기록 데이터</h1>"))
display(test_tr.info(), br_html)

display(HTML("<h1>제출 양식 데이터</h1>"))
display(submit.info(), br_html)

In [None]:
train_target["target"].mean()

In [None]:
train_tr["ID"].nunique() , train_target.shape

In [None]:
train_tr.sort_values("ID").head(3)

# 특성 공학(Feature Engineering)

## 날짜 형식으로 변환

In [None]:
train_tr["구매일시"] = pd.to_datetime(train_tr["구매일시"])
test_tr["구매일시"] = pd.to_datetime(test_tr["구매일시"])

In [None]:
# max 최근 날짜 min 오래된 날짜
time_delta = train_tr["구매일시"].max() - train_tr["구매일시"].min()
time_delta

In [None]:
time_delta.days # 일만 뽑기

In [None]:
time_delta.total_seconds() # 초만 뽑기

## 새로 만든 feature와 병합할 고객ID로만 이루어진 데이터프레임 생성

In [None]:
train_ft = train_target[["ID"]]
test_ft = submit[["ID"]]

train_ft.shape, test_ft.shape

## 구매일시를 이용한 특성생성

0~6 요일
series... 가 뭔지 일단 알아야 할듯 호호
왜도, 첨도... 구매금액으로

In [None]:
agg_list = [
        # 컬럼명, 집계 방식
        ('내점일수','nunique'), #  고쳐야 된다고 한다.....
        ('구매주기', lambda x: int( (x.max() - x.min()).days / x.dt.date.nunique()) ),
        ('주말방문비율', lambda x: np.mean(x.dt.weekday>4)),
        ('봄_구매비율', lambda x: np.mean(x.dt.month.isin([3,4,5]))),
        ('여름_구매비율', lambda x: np.mean(x.dt.month.isin([6,7,8]))),
        ('가을_구매비율', lambda x: np.mean(x.dt.month.isin([9,10,11]))),
        ('겨울_구매비율', lambda x: np.mean(x.dt.month.isin([1,2,12]))),
        ('주구매요일', lambda x: x.dt.weekday.mode()[0]),
        ('일별평균구매건수', lambda x:  x.count() / x.dt.date.nunique() ),
        ('거래개월수', lambda x: x.dt.date.astype(str).str[:-3].nunique() ),
    ]

tmp = train_tr.groupby('ID')["구매일시"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left',on="ID")
train_ft.head()

In [None]:
tmp = test_tr.groupby('ID')["구매일시"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left',on="ID")
test_ft.head()

## 지점을 이용한 특성생성

In [None]:
agg_list = [
          ("방문지점수","nunique"),
          ('주구매지점', lambda x: x.mode()[0]),
    ]

tmp = train_tr.groupby('ID')["지점코드"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left',on="ID")
train_ft.head()

In [None]:
tmp = test_tr.groupby('ID')["지점코드"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left',on="ID")
test_ft.head()

## 브랜드코드를 이용한 특성생성

In [None]:
agg_list = [
             ('브랜드코드_nunique', 'nunique'),
             ]

tmp = train_tr.groupby('ID')["브랜드코드"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left',on="ID")
train_ft.head()

In [None]:
tmp = test_tr.groupby('ID')["브랜드코드"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left',on="ID")
test_ft.head()

## 중분류를 이용한 특성생성

In [None]:
agg_list = [
            ('중분류_nunique', 'nunique'),
            ('주구매_중분류', lambda x: x.mode()[0]),
            ]
tmp = train_tr.groupby('ID')["중분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left')
train_ft.head()

In [None]:
tmp = test_tr.groupby('ID')["중분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left')
test_ft.head()

## 대분류를 이용한 특성생성

In [None]:
agg_list = [
            ('대분류_nunique', 'nunique'),
            ('주구매_대분류', lambda x: x.mode()[0]),
            ]

tmp = train_tr.groupby('ID')["대분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left')
train_ft.head()

In [None]:
tmp = test_tr.groupby('ID')["대분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left')
test_ft.head()

## 구매가격을 이용한 특성생성

왜도, 첨도... 구매금액으로

In [None]:
agg_list = [
        ('총구매액','sum'),
        ('구매건수', 'count'),
        ('평균구매액', "mean"),
        ('최대구매액', 'max'),
        ('최소구매액',lambda x: x[x > 0].min() ) ,
        ('환불금액',lambda x: x[x < 0].sum() ) ,
        ('환불건수', lambda x: ( x < 0 ).sum() ),
        ('구매금액표준편차',lambda x: x[x>0].std() ),
    ]

tmp = train_tr.groupby('ID')["구매가격"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left')
train_ft.head()

In [None]:
tmp = test_tr.groupby('ID')["구매가격"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left')
test_ft.head()

## 강사님이 추가하신 피처 불러오기

In [11]:
import pandas as pd
import numpy as np

In [4]:
# train_ft.to_csv(f"{DATA_PATH}train_ft.csv",index= False)
# test_ft.to_csv(f"{DATA_PATH}test_ft.csv",index= False)

In [5]:
train_ft = pd.read_csv(f"{DATA_PATH}train_ft.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_ft.csv")

In [6]:
train_ft.head()
# test_ft.head()  # 5 rows × 26 columns

Unnamed: 0,ID,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,...,대분류_nunique,주구매_대분류,총구매액,구매건수,평균구매액,최대구매액,최소구매액,환불금액,환불건수,구매금액표준편차
0,train_0,16,23,0.25,0.05,0.25,0.4,0.3,3,1.818182,...,9,남성정장스포츠,4201200,20,210060.0,1236000,20000,-2517000,3,403660.245259
1,train_1,41,16,0.02381,0.357143,0.166667,0.357143,0.119048,3,2.0,...,12,영플라자,5043025,42,120072.02381,698000,5600,0,0,126592.626144
2,train_2,102,6,0.210526,0.464912,0.140351,0.175439,0.219298,0,2.035714,...,14,명품잡화,22135386,114,194170.052632,2770000,4400,-1072500,7,406609.265932
3,train_3,191,3,0.189573,0.379147,0.180095,0.236967,0.203791,3,2.293478,...,16,케주얼_구두_아동,35594762,211,168695.554502,1492000,10000,-2867800,14,236976.860907
4,train_4,55,11,0.258065,0.112903,0.612903,0.209677,0.064516,4,2.296296,...,8,명품잡화,3988866,62,64336.548387,1204000,7640,-6954400,13,308503.944915


In [7]:
 train_ft.columns

Index(['ID', '내점일수', '구매주기', '주말방문비율', '봄_구매비율', '여름_구매비율', '가을_구매비율',
       '겨울_구매비율', '주구매요일', '일별평균구매건수', '거래개월수', '방문지점수', '주구매지점',
       '브랜드코드_nunique', '중분류_nunique', '주구매_중분류', '대분류_nunique', '주구매_대분류',
       '총구매액', '구매건수', '평균구매액', '최대구매액', '최소구매액', '환불금액', '환불건수', '구매금액표준편차'],
      dtype='object')

## 내 피처 추가하기

In [8]:
# Pivot Table로 브랜드별 구매 건수 계산
pivot_brand_data = train_ft.pivot_table(index='ID',
                                        columns='주구매_중분류',
                                        values='총구매액',
                                        aggfunc='count',
                                        fill_value=0).reset_index()

# 피처명을 보기 좋게 변경 (예: 브랜드 5379는 '브랜드_5379_구매빈도'로 변경)
pivot_brand_data.columns = ['ID'] + ['주구매_중분류_총구매액' for col in pivot_brand_data.columns[1:]]

# 생성된 피처를 train_ft에 병합
train_ft = train_ft.merge(pivot_brand_data, how='left', on='ID')

# 테스트 데이터에서도 동일한 작업 수행
pivot_brand_data_test = test_ft.pivot_table(index='ID',
                                            columns='주구매_중분류',
                                            values='총구매액',
                                            aggfunc='count',
                                            fill_value=0).reset_index()

pivot_brand_data_test.columns = ['ID'] + ['주구매_중분류_총구매액' for col in pivot_brand_data_test.columns[1:]]
test_ft = test_ft.merge(pivot_brand_data_test, how='left', on='ID')

# 결과 확인
train_ft.head()

Unnamed: 0,ID,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,...,주구매_중분류_총구매액,주구매_중분류_총구매액.1,주구매_중분류_총구매액.2,주구매_중분류_총구매액.3,주구매_중분류_총구매액.4,주구매_중분류_총구매액.5,주구매_중분류_총구매액.6,주구매_중분류_총구매액.7,주구매_중분류_총구매액.8,주구매_중분류_총구매액.9
0,train_0,16,23,0.25,0.05,0.25,0.4,0.3,3,1.818182,...,0,0,0,0,0,0,0,0,0,0
1,train_1,41,16,0.02381,0.357143,0.166667,0.357143,0.119048,3,2.0,...,0,0,0,0,0,0,0,0,0,0
2,train_2,102,6,0.210526,0.464912,0.140351,0.175439,0.219298,0,2.035714,...,0,0,0,0,0,0,0,0,0,0
3,train_3,191,3,0.189573,0.379147,0.180095,0.236967,0.203791,3,2.293478,...,0,0,0,0,0,0,0,0,0,0
4,train_4,55,11,0.258065,0.112903,0.612903,0.209677,0.064516,4,2.296296,...,0,0,0,0,0,0,0,0,0,0


## 결측치 처리

In [9]:
train_ft.isnull().sum()

Unnamed: 0,0
ID,0
내점일수,0
구매주기,0
주말방문비율,0
봄_구매비율,0
...,...
주구매_중분류_총구매액,0
주구매_중분류_총구매액,0
주구매_중분류_총구매액,0
주구매_중분류_총구매액,0


In [10]:
test_ft.isnull().sum()

Unnamed: 0,0
ID,0
내점일수,0
구매주기,0
주말방문비율,0
봄_구매비율,0
...,...
주구매_중분류_총구매액,0
주구매_중분류_총구매액,0
주구매_중분류_총구매액,0
주구매_중분류_총구매액,0


In [11]:
train_ft = train_ft.fillna(0)
test_ft = test_ft.fillna(0)

train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## Feature Encoding

- ID 컬럼 제거

In [12]:
train_ft = train_ft.drop(columns="ID")
test_ft = test_ft.drop(columns="ID")

In [13]:
cols = train_ft.select_dtypes("object").columns.tolist()
cols

['주구매지점', '주구매_중분류', '주구매_대분류']

In [14]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown="ignore")

In [15]:
tmp = pd.DataFrame(
    enc.fit_transform(train_ft[cols]).toarray(),
    columns =  enc.get_feature_names_out()
)

train_ft = pd.concat([train_ft,tmp],axis=1).drop(columns=cols)
train_ft.head()

Unnamed: 0,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,거래개월수,...,주구매_대분류_여성캐주얼,주구매_대분류_여성캐쥬얼,주구매_대분류_영라이브,주구매_대분류_영어덜트캐쥬얼,주구매_대분류_영캐릭터,주구매_대분류_영플라자,주구매_대분류_잡화,주구매_대분류_잡화파트,주구매_대분류_케주얼_구두_아동,주구매_대분류_패션잡화
0,16,23,0.25,0.05,0.25,0.4,0.3,3,1.818182,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,41,16,0.02381,0.357143,0.166667,0.357143,0.119048,3,2.0,11,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,102,6,0.210526,0.464912,0.140351,0.175439,0.219298,0,2.035714,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,191,3,0.189573,0.379147,0.180095,0.236967,0.203791,3,2.293478,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,55,11,0.258065,0.112903,0.612903,0.209677,0.064516,4,2.296296,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
train_ft.columns

Index(['내점일수', '구매주기', '주말방문비율', '봄_구매비율', '여름_구매비율', '가을_구매비율', '겨울_구매비율',
       '주구매요일', '일별평균구매건수', '거래개월수',
       ...
       '주구매_대분류_여성캐주얼', '주구매_대분류_여성캐쥬얼', '주구매_대분류_영라이브', '주구매_대분류_영어덜트캐쥬얼',
       '주구매_대분류_영캐릭터', '주구매_대분류_영플라자', '주구매_대분류_잡화', '주구매_대분류_잡화파트',
       '주구매_대분류_케주얼_구두_아동', '주구매_대분류_패션잡화'],
      dtype='object', length=546)

In [17]:
tmp = pd.DataFrame(
    enc.transform(test_ft[cols]).toarray(),
    columns = enc.get_feature_names_out()
)
test_ft = pd.concat([test_ft,tmp],axis=1).drop(columns=cols)
test_ft.head()

Unnamed: 0,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,거래개월수,...,주구매_대분류_여성캐주얼,주구매_대분류_여성캐쥬얼,주구매_대분류_영라이브,주구매_대분류_영어덜트캐쥬얼,주구매_대분류_영캐릭터,주구매_대분류_영플라자,주구매_대분류_잡화,주구매_대분류_잡화파트,주구매_대분류_케주얼_구두_아동,주구매_대분류_패션잡화
0,7,32,0.571429,0.285714,0.285714,0.428571,0.0,1,1.4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,54,0.25,0.0,0.0,0.75,0.25,0,1.333333,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,8,0.0,0.0,0.6,0.4,0.0,0,2.5,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,28,16,0.03125,0.0625,0.46875,0.34375,0.125,3,2.0,7,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,19,29,0.47619,0.380952,0.285714,0.238095,0.095238,6,2.333333,4,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Scaling

In [18]:
from sklearn.preprocessing import MinMaxScaler

In [27]:
# 데이터프레임의 전체 정보 확인
print(train_ft.info())

# 데이터프레임의 첫 몇 행 확인
print(train_ft.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14940 entries, 0 to 14939
Columns: 546 entries, 내점일수 to 주구매_대분류_패션잡화
dtypes: float64(286), int64(260)
memory usage: 62.2 MB
None
   내점일수  구매주기    주말방문비율    봄_구매비율   여름_구매비율   가을_구매비율   겨울_구매비율  주구매요일  \
0    16    23  0.250000  0.050000  0.250000  0.400000  0.300000      3   
1    41    16  0.023810  0.357143  0.166667  0.357143  0.119048      3   
2   102     6  0.210526  0.464912  0.140351  0.175439  0.219298      0   
3   191     3  0.189573  0.379147  0.180095  0.236967  0.203791      3   
4    55    11  0.258065  0.112903  0.612903  0.209677  0.064516      4   

   일별평균구매건수  거래개월수  ...  주구매_대분류_여성캐주얼  주구매_대분류_여성캐쥬얼  주구매_대분류_영라이브  \
0  1.818182      7  ...            0.0            0.0           0.0   
1  2.000000     11  ...            0.0            0.0           0.0   
2  2.035714     12  ...            0.0            0.0           0.0   
3  2.293478     12  ...            0.0            0.0           0.0   
4  2.296296     10  .

In [19]:
scaler = MinMaxScaler()
train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
train_ft.head()

ValueError: Columns must be same length as key

In [28]:
# MinMaxScaler 설정
scaler = MinMaxScaler()

# 데이터 배치 크기 설정 (예: 5000개씩 처리)
batch_size = 5000

# 데이터프레임을 배치 단위로 나누어서 스케일링 적용
for start in range(0, len(train_ft), batch_size):
    end = min(start + batch_size, len(train_ft))
    train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])

# 결과 확인
train_ft.head()

  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[start:end, :] = scaler.fit_transform(train_ft.iloc[start:end, :])
  train_ft.iloc[

Unnamed: 0,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,거래개월수,...,주구매_대분류_여성캐주얼,주구매_대분류_여성캐쥬얼,주구매_대분류_영라이브,주구매_대분류_영어덜트캐쥬얼,주구매_대분류_영캐릭터,주구매_대분류_영플라자,주구매_대분류_잡화,주구매_대분류_잡화파트,주구매_대분류_케주얼_구두_아동,주구매_대분류_패션잡화
0,0.027778,0.130682,0.25,0.05,0.25,0.4,0.3,0.5,0.090909,0.545455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.074074,0.090909,0.02381,0.357143,0.166667,0.357143,0.119048,0.5,0.111111,0.909091,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.187037,0.034091,0.210526,0.464912,0.140351,0.175439,0.219298,0.0,0.115079,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.351852,0.017045,0.189573,0.379147,0.180095,0.236967,0.203791,0.5,0.14372,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.1,0.0625,0.258065,0.112903,0.612903,0.209677,0.064516,0.666667,0.144033,0.818182,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 정답 데이터

In [30]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


# 데이터 분리(Data Split)
- 학습데이터와 검증데이터 분리

In [31]:
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(train_ft, target, test_size=0.2, random_state=42)

x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((11952, 546), (2988, 546), (11952,), (2988,))

# 모델 학습(Train Model)


In [32]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=42)
model.fit(x_train,y_train)

In [33]:
pred = model.predict(x_valid)
pred[:5]

array([0., 1., 1., 1., 0.])

In [34]:
from sklearn.metrics import accuracy_score
accuracy_score(y_valid,pred)

0.6653279785809906

- 1차 : 0.6653279785809906

# 모델 평가(Model Evaluation)

- 전체 train 데이터를 다시 학습한다.

In [41]:
model = LogisticRegression(random_state=42)
model.fit(train_ft,target)

- 테스트 데이터 예측

In [42]:
pred = model.predict(test_ft)
pred

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


In [None]:
submit["target"] = pred
submit

In [None]:
submit.to_csv("신혜빈.csv",index=False)