In [1]:
import numpy as np
import pandas as pd
from tableone import TableOne


# 소수를 e로 표현하지 않도록 하기
pd.options.display.float_format = "{:.5f}".format

In [2]:
lib_2007 = pd.read_csv("final_data/lib_2007.csv")
lib_2008 = pd.read_csv("final_data/lib_2008.csv")
lib_2009 = pd.read_csv("final_data/lib_2009.csv")
lib_2010 = pd.read_csv("final_data/lib_2010.csv")
lib_2011 = pd.read_csv("final_data/lib_2011.csv")
lib_2012 = pd.read_csv("final_data/lib_2012.csv")
lib_2013 = pd.read_csv("final_data/lib_2013.csv")
lib_2014 = pd.read_csv("final_data/lib_2014.csv")
lib_2015 = pd.read_csv("final_data/lib_2015.csv")
lib_2016 = pd.read_csv("final_data/lib_2016.csv")
lib_2017 = pd.read_csv("final_data/lib_2017.csv")
lib_2018 = pd.read_csv("final_data/lib_2018.csv")
lib_2019 = pd.read_csv("final_data/lib_2019.csv")
lib_2020 = pd.read_csv("final_data/lib_2020.csv")
lib_2021 = pd.read_csv("final_data/lib_2021.csv")

In [3]:
train_lib = pd.concat([lib_2007,lib_2008,lib_2009,lib_2010,lib_2011,lib_2012,lib_2013,lib_2014,lib_2015, lib_2016, lib_2017])
valid_lib = pd.concat([lib_2018, lib_2019])
test_lib = pd.concat([lib_2020,lib_2021])

In [14]:
print(train_lib['self_srv_machines'].value_counts())

self_srv_machines
0     4126
2      749
1      703
3      659
5      369
4      339
6      157
7      110
8       28
9       18
11      14
10       6
13       5
12       4
15       2
Name: count, dtype: int64


In [4]:
# 사분위수를 활용하여 비율 충족도의 이상치 행 제거하기
def del_outlier(df, col):
    q1 = df[col].quantile(0.10)
    q3 = df[col].quantile(0.90)
    iqr = q3 - q1 
    boundary = 1.5 * iqr 

    upper_index = df[df[col] > q3 + boundary].index
    lower_index = df[df[col] < q1 - boundary].index 

    df.drop(upper_index, inplace = True)
    df.drop(lower_index, inplace = True)

    return df

del_outlier(train_lib, "Future_fullfillment")
del_outlier(valid_lib, "Future_fullfillment")
del_outlier(test_lib, "Future_fullfillment") 

# 이상치 제거 후 train, test set의 데이터 수 구하기
print(f"이상치 제거 후 train set의 데이터 수 : {len(train_lib)}")
print(f"이상치 제거 후 valid set의 데이터 수 : {len(valid_lib)}")
print(f"이상치 제거 후 test set의 데이터 수 : {len(test_lib)}")

이상치 제거 후 train set의 데이터 수 : 7289
이상치 제거 후 valid set의 데이터 수 : 2172
이상치 제거 후 test set의 데이터 수 : 2325


In [5]:

def transform_target(y, min_value=0, max_value=10):
    # 0~1 사이의 값을 min_value~max_value 사이로 스케일 조정
    y_scaled = min_value + (max_value - min_value) * y
    
    # 값을 가장 가까운 정수로 변환
    y_integer = np.round(y_scaled).astype(int)
    
    # min_value~max_value 사이의 값으로 보장
    y_integer = np.clip(y_integer, 0, 5)
    
    return y_integer

# 변환 함수 적용
train_lib["Future_fullfillment"] = transform_target(train_lib["Future_fullfillment"], min_value=0, max_value=10)
valid_lib["Future_fullfillment"] = transform_target(valid_lib["Future_fullfillment"], min_value=0, max_value=10)
test_lib["Future_fullfillment"] = transform_target(test_lib["Future_fullfillment"], min_value=0, max_value=10)

In [6]:
# 인덱스 리셋
train_lib = train_lib.reset_index(drop=True)
valid_lib = valid_lib.reset_index(drop=True)
test_lib = test_lib.reset_index(drop=True)

In [15]:
categorical_vars = []  # 범주형 변수가 없다고 명시

In [17]:
train_table = TableOne(train_lib, categorical=categorical_vars, groupby='Future_fullfillment', pval=True)
print(train_table)

                                  Grouped by Future_fullfillment                                                                                                                                                                
                                                         Missing               Overall                      0                     1                    2                    3                     4                    5 P-Value
n                                                                                 7289                    772                  4365                 1546                  395                   150                   61        
dom_books, mean (SD)                                           0     91813.9 (80208.4)      99268.1 (89546.0)    103233.6 (84281.7)    69872.2 (63671.4)    57514.8 (47236.0)     55073.5 (47472.7)    48853.6 (38814.4)  <0.001
for_books, mean (SD)                                           0       2671.5 (6109.9)        2590.2

In [18]:
valid_table = TableOne(valid_lib, groupby='Future_fullfillment', pval=True)
print(valid_table)

                                  Grouped by Future_fullfillment                                                                                                                                            
                                                         Missing                Overall                      0                      1                    2                    3                    4 P-Value
n                                                                                  2172                    202                   1463                  371                  108                   28        
dom_books, mean (SD)                                           0      97723.1 (83706.5)    116203.0 (102145.9)     104952.5 (86137.9)    72717.2 (60372.3)    58873.6 (49162.9)    67842.5 (59073.8)  <0.001
for_books, mean (SD)                                           0        4522.5 (8611.4)       5378.3 (15518.2)        4723.8 (7551.3)      4268.2 (8465.2)      1967.5 (3605.4)     

In [19]:
test_table = TableOne(test_lib, groupby='Future_fullfillment', pval=True)
print(test_table)

                                  Grouped by Future_fullfillment                                                                                                                                             
                                                         Missing                Overall                      0                      1                     2                    3                    4 P-Value
n                                                                                  2325                    257                   1652                   348                   58                   10        
dom_books, mean (SD)                                           0      96552.7 (84251.5)    119537.8 (106368.5)      99830.3 (82868.1)     74234.4 (69797.6)    44025.2 (30581.7)    45704.8 (25518.6)  <0.001
for_books, mean (SD)                                           0        4756.5 (7523.9)        4990.3 (8952.1)        4968.7 (7484.2)       4016.3 (6947.3)      2344.3 (4315.6)