In [16]:
import numpy as np
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import os
import psutil

pd.options.display.max_columns=1000
pd.options.display.max_rows=1000

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

# 전처리

In [17]:
# Directory that holds the competition CSV files.
path = "/home/sgh/yes/envs/DACON/dataset/click_ratio/"
# Read the three CSV files in one pass; each file stem matches its variable name.
train, test, sample_submission = (
    pl.read_csv(f"{path}{stem}.csv")
    for stem in ("train", "test", "sample_submission")
)

In [18]:
# Fetch system memory statistics.
mem = psutil.virtual_memory()
# Report the RAM that is still available.
available_memory = mem.available / (1024 ** 3)  # bytes -> GiB (divide by 1024 ** 3)
print("Available Memory:", available_memory, "GB")

Available Memory: 32.31309127807617 GB


In [4]:
# (rows, columns) of the raw train and test frames.
print(train.shape)
print(test.shape)

(28605391, 41)
(4538541, 40)


In [5]:
# Frequency of each value of the target column "Click".
label_counts = train["Click"].value_counts()
label_counts

Click,count
i64,u32
0,23035531
1,5569860


## Feature_Engineering

결측치

In [3]:
dfs = [train,test]
# For each frame: print it, then the percentage of null values in every column.
for df in dfs:
    print(df)
    for col in df.columns:
        # Null percentage = null count / row count * 100.
        missing_percentage = df[col].null_count() / len(df) * 100
        print(f"{col}: {missing_percentage:.2f}% missing")
    print('_____________')

shape: (28_605_391, 41)
┌────────────────┬───────┬─────────┬─────────┬───┬──────┬─────────┬─────┬─────────┐
│ ID             ┆ Click ┆ F01     ┆ F02     ┆ … ┆ F36  ┆ F37     ┆ F38 ┆ F39     │
│ ---            ┆ ---   ┆ ---     ┆ ---     ┆   ┆ ---  ┆ ---     ┆ --- ┆ ---     │
│ str            ┆ i64   ┆ str     ┆ str     ┆   ┆ f64  ┆ str     ┆ f64 ┆ str     │
╞════════════════╪═══════╪═════════╪═════════╪═══╪══════╪═════════╪═════╪═════════╡
│ TRAIN_00000000 ┆ 1     ┆ NSLHFNS ┆ AVKQTCL ┆ … ┆ null ┆ TFJMLCZ ┆ 0.0 ┆ AURZYDY │
│ TRAIN_00000001 ┆ 0     ┆ VGIVWZQ ┆ LSUSMVO ┆ … ┆ 19.0 ┆ AUGTURV ┆ 0.0 ┆ LUZRMLU │
│ TRAIN_00000002 ┆ 0     ┆ JCDXFYU ┆ PILDDJU ┆ … ┆ 8.0  ┆ ZVSTLNM ┆ 0.0 ┆ MHBRSQK │
│ TRAIN_00000003 ┆ 1     ┆ PSMFWTP ┆ ZYAVJHP ┆ … ┆ 14.0 ┆ ZBSRLCQ ┆ 0.0 ┆ GAZBSSZ │
│ TRAIN_00000004 ┆ 0     ┆ SLCRICD ┆ QPQWGXA ┆ … ┆ 13.0 ┆ QHYLSBX ┆ 0.0 ┆ QTATWAY │
│ …              ┆ …     ┆ …       ┆ …       ┆ … ┆ …    ┆ …       ┆ …   ┆ …       │
│ TRAIN_28605386 ┆ 0     ┆ NLVEHEJ ┆ TQLUNHQ ┆ … ┆ 6

In [19]:
def filter_cols(df, missing_threshold=35, max_unique=4538541):
    """Drop sparse columns and uninformative string columns from a polars frame.

    Two passes are made. Each pass prints the name of every column it drops,
    and a separator line is printed between the passes.

    1. Every column except "ID" whose null percentage is strictly greater than
       ``missing_threshold`` is dropped.
    2. Every remaining ``pl.String`` column except "ID" is dropped when it has
       exactly one distinct value or more than ``max_unique`` distinct values.

    Args:
        df: polars DataFrame to filter. It is not modified.
        missing_threshold: Null percentage (0-100) above which a column is
            dropped. Defaults to 35.
        max_unique: Largest number of distinct values a string column may have
            and still be kept. The default is the value that used to be
            hard-coded here. Because it is an absolute count, frames of
            different sizes can end up with different columns.

    Returns:
        A new DataFrame without the dropped columns.
    """
    n_rows = len(df)

    # Pass 1: columns with too many nulls.
    sparse_cols = []
    for col in df.columns:
        if col == "ID":
            continue
        # A zero-row frame has no nulls to measure; treat it as 0% missing.
        missing_percentage = df[col].null_count() / n_rows * 100 if n_rows else 0.0
        if missing_percentage > missing_threshold:
            print(col)
            sparse_cols.append(col)
    if sparse_cols:
        df = df.drop(sparse_cols)

    print("____________________________________________")

    # Pass 2: string columns that are constant or have too many distinct values.
    degenerate_cols = []
    for col in df.columns:
        if col != "ID" and df[col].dtype == pl.String:
            freq = df[col].n_unique()
            if freq == 1 or freq > max_unique:
                print(col)
                degenerate_cols.append(col)
    if degenerate_cols:
        df = df.drop(degenerate_cols)

    return df

In [20]:
def fill_missing_values(df):
    """Fill the nulls of every column of a polars DataFrame.

    String columns receive the literal text "nan"; every other column
    receives 0.

    Args:
        df: polars DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same columns and the nulls filled.

    Note:
        Every non-string column is filled with 0, whatever its dtype; this
        function does not check that 0 is a valid value for that dtype.
    """
    # Compare against the pl.String dtype object, as filter_cols does, rather
    # than against a Python string. Use a string literal as the fill value so
    # the column keeps its string dtype.
    fill_exprs = [
        pl.col(column).fill_null(pl.lit("nan") if dtype == pl.String else 0)
        for column, dtype in zip(df.columns, df.dtypes)
    ]
    return df.with_columns(fill_exprs)

In [21]:
# Filter the training frame; filter_cols prints the name of each column it drops.
train = filter_cols(train)

F03
F15
F20
F26
F27
F29
____________________________________________
F01
F05


In [22]:
# Apply the same filter to the test frame; the printed names are the dropped columns.
test = filter_cols(test)

F03
F15
F20
F26
F27
F29
____________________________________________


In [23]:
# filter_cols dropped F01 and F05 from train but not from test (see the printed
# output of the two calls), so drop them here to keep both frames on the same features.
test = test.drop(['F01','F05'])

In [7]:
#train = fill_missing_values(train)
#test = fill_missing_values(test)

모델링을 위해 판다스로 변환

In [24]:
#category 변환
def to_pandas(df_data, categorical_feat=None):
    """Convert a frame to pandas and determine its categorical columns.

    Args:
        df_data: Object exposing ``to_pandas()`` (a polars DataFrame in this
            notebook).
        categorical_feat: Optional list of categorical column names. When
            given, it is returned unchanged.

    Returns:
        Tuple ``(pandas_frame, categorical_feat)``. When ``categorical_feat``
        is None, it is the list of object-dtype columns excluding 'ID'.
    """
    df_data = df_data.to_pandas()
    if categorical_feat is None:
        object_cols = df_data.select_dtypes("object").columns
        # 'ID' is excluded from the features. Filtering it out (instead of
        # list.remove) also works when the frame has no object-typed 'ID' column.
        categorical_feat = [col for col in object_cols if col != 'ID']
    return df_data, categorical_feat

In [25]:
# Convert both frames to pandas and collect their object-typed (categorical) columns.
train, categorical_feat1 = to_pandas(train)
test, categorical_feat2 = to_pandas(test) 

In [26]:
# Categorical columns detected in the training frame.
categorical_feat1

['F02',
 'F07',
 'F08',
 'F09',
 'F10',
 'F12',
 'F13',
 'F16',
 'F17',
 'F21',
 'F22',
 'F23',
 'F25',
 'F28',
 'F30',
 'F31',
 'F34',
 'F35',
 'F37',
 'F39']

In [27]:
# Categorical columns detected in the test frame.
categorical_feat2

['F02',
 'F07',
 'F08',
 'F09',
 'F10',
 'F12',
 'F13',
 'F16',
 'F17',
 'F21',
 'F22',
 'F23',
 'F25',
 'F28',
 'F30',
 'F31',
 'F34',
 'F35',
 'F37',
 'F39']

In [28]:
# Fetch system memory statistics.
mem = psutil.virtual_memory()

# Report the RAM that is still available.
available_memory = mem.available / (1024 ** 3)  # bytes -> GiB (divide by 1024 ** 3)
print("Available Memory:", available_memory, "GB")

Available Memory: 38.12531280517578 GB


## 메모리 최적화

In [29]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of a pandas DataFrame to smaller dtypes.

    The frame is modified in place and also returned. Memory usage before and
    after, and the percentage saved, are printed.

    Only columns with a plain NumPy signed-integer, unsigned-integer or float
    dtype are touched. Each is cast to the smallest dtype of the same family
    whose range contains the column's min and max (bounds inclusive). Columns
    of any other dtype (object, bool, datetime, category, pandas extension
    dtypes, ...) are left unchanged, and no column is ever widened.

    Args:
        df: pandas DataFrame to shrink.
        use_float16: When True (default), float columns whose values fit the
            float16 range are stored as float16. float16 keeps only about
            three significant decimal digits, so pass False to stop at float32
            when precision matters.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes per family, smallest first.
    signed_ints = (np.int8, np.int16, np.int32)
    unsigned_ints = (np.uint8, np.uint16, np.uint32)
    floats = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Dispatch on the NumPy dtype kind instead of the dtype's string name,
        # so unsigned ints and bools are never mistaken for floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = floats, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_ints, np.iinfo
        else:
            candidates, limits = unsigned_ints, np.iinfo

        for candidate in candidates:
            # Stop as soon as a candidate would not be smaller than the current dtype.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # which leaves the column as it is.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio for a frame that reports zero bytes.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [30]:
# Downcast the numeric columns of the training frame; the memory savings are printed.
train = reduce_mem_usage(train)

Memory usage of dataframe is 7201.98 MB
Memory usage after optimization is: 5428.77 MB
Decreased by 24.6%


In [31]:
# Downcast the numeric columns of the test frame; the memory savings are printed.
test = reduce_mem_usage(test)

Memory usage of dataframe is 1108.04 MB
Memory usage after optimization is: 848.34 MB
Decreased by 23.4%


In [34]:
# Fetch system memory statistics.
mem = psutil.virtual_memory()

# Report the RAM that is still available.
available_memory = mem.available / (1024 ** 3)  # bytes -> GiB (divide by 1024 ** 3)
print("Available Memory:", available_memory, "GB")

Available Memory: 40.082908630371094 GB


In [35]:
# Name of the label column.
target = "Click"

# Training features (label and row identifier removed) and training labels.
X_train = train.drop(columns = [target,'ID'])
y_train = train[target]

# Test features (row identifier removed); the identifiers are kept separately.
X_test = test.drop(columns = 'ID')
test_id = test['ID']

In [37]:
import psutil

# Fetch system memory statistics.
mem = psutil.virtual_memory()

# Report the RAM that is still available.
available_memory = mem.available / (1024 ** 3)  # bytes -> GiB (divide by 1024 ** 3)
print("Available Memory:", available_memory, "GB")

Available Memory: 34.22330093383789 GB


In [39]:
from sklearn.model_selection import train_test_split

# The classes are imbalanced, so split 80/20 with stratification on y_train.
# X_test1 / y_test1 are the held-out 20%, used later as the validation set.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

In [40]:
import psutil

# Fetch system memory statistics.
mem = psutil.virtual_memory()

# Report the RAM that is still available.
available_memory = mem.available / (1024 ** 3)  # bytes -> GiB (divide by 1024 ** 3)
print("Available Memory:", available_memory, "GB")

Available Memory: 28.673316955566406 GB


# Models

## Catboost

In [41]:
# Column dtypes and memory footprint of the training split.
X_train1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22884312 entries, 13803063 to 6118378
Data columns (total 31 columns):
 #   Column  Dtype  
---  ------  -----  
 0   F02     object 
 1   F04     float32
 2   F06     int32  
 3   F07     object 
 4   F08     object 
 5   F09     object 
 6   F10     object 
 7   F11     float16
 8   F12     object 
 9   F13     object 
 10  F14     int16  
 11  F16     object 
 12  F17     object 
 13  F18     float16
 14  F19     float16
 15  F21     object 
 16  F22     object 
 17  F23     object 
 18  F24     float32
 19  F25     object 
 20  F28     object 
 21  F30     object 
 22  F31     object 
 23  F32     float32
 24  F33     float16
 25  F34     object 
 26  F35     object 
 27  F36     float16
 28  F37     object 
 29  F38     float16
 30  F39     object 
dtypes: float16(6), float32(3), int16(1), int32(1), object(20)
memory usage: 4.2+ GB


In [42]:
# Missing values per column before the categorical columns are cast to str.
X_train1.isna().sum()

F02     987796
F04    4595069
F06          0
F07          0
F08          0
F09          0
F10     987796
F11    2365001
F12     987796
F13          0
F14          0
F16          0
F17          0
F18    5860562
F19    2071800
F21          0
F22          0
F23          0
F24    7195434
F25          0
F28          0
F30          0
F31          0
F32     201167
F33    2071800
F34     987796
F35          0
F36    5860562
F37          0
F38     640631
F39          0
dtype: int64

In [43]:
# CatBoost does not accept NaN in categorical features, so cast those columns to str first.
# After the cast a missing value is just one more category (its string representation).

# Training and validation splits.
X_train1[categorical_feat1] = X_train1[categorical_feat1].astype(str)
X_test1[categorical_feat1] = X_test1[categorical_feat1].astype(str)

# Real test set; it reuses the train column list.
X_test[categorical_feat1] = X_test[categorical_feat1].astype(str)

In [44]:
# Missing values per column after the cast: only the numeric columns still report NaN.
X_train1.isna().sum()

F02          0
F04    4595069
F06          0
F07          0
F08          0
F09          0
F10          0
F11    2365001
F12          0
F13          0
F14          0
F16          0
F17          0
F18    5860562
F19    2071800
F21          0
F22          0
F23          0
F24    7195434
F25          0
F28          0
F30          0
F31          0
F32     201167
F33    2071800
F34          0
F35          0
F36    5860562
F37          0
F38     640631
F39          0
dtype: int64

In [46]:
import psutil

# Fetch system memory statistics.
mem = psutil.virtual_memory()

# Report the RAM that is still available.
available_memory = mem.available / (1024 ** 3)  # bytes -> GiB (divide by 1024 ** 3)
print("Available Memory:", available_memory, "GB")

Available Memory: 28.302654266357422 GB


In [47]:
from catboost import CatBoostClassifier, Pool

# Wrap the training and validation splits in Pools and declare the categorical columns.
train_pool = Pool(X_train1, y_train1 ,cat_features = categorical_feat1)
val_pool = Pool(X_test1, y_test1, cat_features = categorical_feat1)


catb_params = {
    'iterations' : 200,
    'learning_rate': 0.05 ,
    "max_depth": 10,
    'eval_metric' :'AUC', # on GPU the training log shows AUC only every 5th iteration ("not implemented for GPU")
    'task_type' : 'GPU', # no device list is passed; presumably CatBoost selects the GPU itself - TODO confirm
    'verbose': 1,
    'random_seed' : 721,
    'gpu_ram_part': 0.9 # use 90% of GPU memory (original note: the library default is 0.95)
    #'used_ram_limit': '20gb' # disabled: would cap CPU RAM usage at 20 GB
    }

catb = CatBoostClassifier(**catb_params)
# verbose is also passed here, in addition to the 'verbose' entry in catb_params.
catb.fit(train_pool, eval_set=val_pool, verbose=True)

# Disabled: predicted probabilities on the validation split
#y_pred = catb.predict_proba(X_test1)[:,1]

# Disabled: ROC AUC on the validation split
#score = roc_auc_score(y_test1, y_pred)
#print(score)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7243258	best: 0.7243258 (0)	total: 2.91s	remaining: 9m 40s
1:	total: 6.15s	remaining: 10m 9s
2:	total: 9.14s	remaining: 10m
3:	total: 12.7s	remaining: 10m 24s
4:	total: 15.9s	remaining: 10m 21s
5:	test: 0.7363940	best: 0.7363940 (5)	total: 19.2s	remaining: 10m 21s
6:	total: 22.2s	remaining: 10m 12s
7:	total: 25.9s	remaining: 10m 22s
8:	total: 29.2s	remaining: 10m 20s
9:	total: 32.9s	remaining: 10m 24s
10:	test: 0.7396747	best: 0.7396747 (10)	total: 36.4s	remaining: 10m 25s
11:	total: 39.8s	remaining: 10m 23s
12:	total: 43.1s	remaining: 10m 19s
13:	total: 46.2s	remaining: 10m 14s
14:	total: 48.9s	remaining: 10m 3s
15:	test: 0.7426370	best: 0.7426370 (15)	total: 52.1s	remaining: 9m 58s
16:	total: 55.4s	remaining: 9m 56s
17:	total: 58.7s	remaining: 9m 53s
18:	total: 1m 1s	remaining: 9m 47s
19:	total: 1m 4s	remaining: 9m 41s
20:	test: 0.7459271	best: 0.7459271 (20)	total: 1m 8s	remaining: 9m 41s
21:	total: 1m 11s	remaining: 9m 35s
22:	total: 1m 14s	remaining: 9m 32s
23:	total: 1

<catboost.core.CatBoostClassifier at 0x7f3714dbf340>

In [24]:
import joblib

# Directory where trained models are stored.
model_path = "/home/sgh/yes/envs/DACON/models/"

# Create the directory if it does not exist yet.
os.makedirs(model_path, exist_ok=True)

# Save the fitted model.
# NOTE(review): no format argument is passed to save_model, so the ".pth" suffix
# is only a file name - confirm the intended format. joblib is imported above but
# is not used in this cell.
catb.save_model(os.path.join(model_path, "click_ratio_catb(vanilla).pth"))

In [48]:
# Print both column lists so the test features can be compared with the training features.
print(X_test.columns)
print(X_train1.columns)

Index(['F02', 'F04', 'F06', 'F07', 'F08', 'F09', 'F10', 'F11', 'F12', 'F13',
       'F14', 'F16', 'F17', 'F18', 'F19', 'F21', 'F22', 'F23', 'F24', 'F25',
       'F28', 'F30', 'F31', 'F32', 'F33', 'F34', 'F35', 'F36', 'F37', 'F38',
       'F39'],
      dtype='object')
Index(['F02', 'F04', 'F06', 'F07', 'F08', 'F09', 'F10', 'F11', 'F12', 'F13',
       'F14', 'F16', 'F17', 'F18', 'F19', 'F21', 'F22', 'F23', 'F24', 'F25',
       'F28', 'F30', 'F31', 'F32', 'F33', 'F34', 'F35', 'F36', 'F37', 'F38',
       'F39'],
      dtype='object')


In [43]:
# sample_submission is read with polars, which has no item assignment, .copy()
# or .to_csv(). Convert it to pandas so this cell also runs on a fresh kernel;
# the isinstance guard keeps the cell safe to re-run.
if isinstance(sample_submission, pl.DataFrame):
    sample_submission = sample_submission.to_pandas()

# Second column of predict_proba: the predicted probability of class 1 (a click).
y_submission_pred2 = catb.predict_proba(X_test)[:,1]

sample_submission['Click'] = y_submission_pred2
submission = sample_submission.copy()

# Write the submission file without the pandas index column.
submission.to_csv('/home/sgh/yes/envs/DACON/click_ratio(catb2).csv',index=False)