In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 소수를 e로 표현하지 않도록 하기
pd.options.display.float_format = "{:.5f}".format

In [2]:
lib_2007 = pd.read_csv("final_data/lib_2007.csv", index_col=0)
lib_2008 = pd.read_csv("final_data/lib_2008.csv", index_col=0)
lib_2009 = pd.read_csv("final_data/lib_2009.csv", index_col=0)
lib_2010 = pd.read_csv("final_data/lib_2010.csv", index_col=0)
lib_2011 = pd.read_csv("final_data/lib_2011.csv", index_col=0)
lib_2012 = pd.read_csv("final_data/lib_2012.csv", index_col=0)
lib_2013 = pd.read_csv("final_data/lib_2013.csv", index_col=0)
lib_2014 = pd.read_csv("final_data/lib_2014.csv", index_col=0)
lib_2015 = pd.read_csv("final_data/lib_2015.csv", index_col=0)
lib_2016 = pd.read_csv("final_data/lib_2016.csv", index_col=0)
lib_2017 = pd.read_csv("final_data/lib_2017.csv", index_col=0)
lib_2018 = pd.read_csv("final_data/lib_2018.csv", index_col=0)
lib_2019 = pd.read_csv("final_data/lib_2019.csv", index_col=0)
lib_2020 = pd.read_csv("final_data/lib_2020.csv", index_col=0)
lib_2021 = pd.read_csv("final_data/lib_2021.csv", index_col=0)

In [3]:
train_lib = pd.concat([lib_2007,lib_2008,lib_2009,lib_2010,lib_2011,lib_2012,lib_2013,lib_2014,lib_2015, lib_2016, lib_2017])
valid_lib = pd.concat([lib_2018, lib_2019])
test_lib = pd.concat([lib_2020,lib_2021])

In [4]:
# 사분위수를 활용하여 비율 충족도의 이상치 행 제거하기
def del_outlier(df, col):
    q1 = df[col].quantile(0.10)
    q3 = df[col].quantile(0.90)
    iqr = q3 - q1 
    boundary = 1.5 * iqr 

    upper_index = df[df[col] > q3 + boundary].index
    lower_index = df[df[col] < q1 - boundary].index 

    df.drop(upper_index, inplace = True)
    df.drop(lower_index, inplace = True)

    return df

del_outlier(train_lib, "future_acq_budget_settlement")
del_outlier(valid_lib, "future_acq_budget_settlement")
del_outlier(test_lib, "future_acq_budget_settlement") 

# 이상치 제거 후 train, test set의 데이터 수 구하기
print(f"이상치 제거 후 train set의 데이터 수 : {len(train_lib)}")
print(f"이상치 제거 후 valid set의 데이터 수 : {len(valid_lib)}")
print(f"이상치 제거 후 test set의 데이터 수 : {len(test_lib)}")

이상치 제거 후 train set의 데이터 수 : 8184
이상치 제거 후 valid set의 데이터 수 : 2178
이상치 제거 후 test set의 데이터 수 : 2310


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max Scaler 생성
scaler = MinMaxScaler()

# 모든 변수에 대해 Min-Max Scaling 적용
# 스케일러를 학습 데이터에 대해 fit
scaler.fit(train_lib.drop(columns=["future_acq_budget_settlement"]))

# 학습 데이터에 대해 변환
train_lib_scaled = pd.DataFrame(scaler.transform(train_lib.drop(columns=["future_acq_budget_settlement"])), 
                                columns=train_lib.columns[:-1])

# 타겟 변수에 대해 스케일링 적용
scaler_y = MinMaxScaler()
train_y_scaled = scaler_y.fit_transform(train_lib[['future_acq_budget_settlement']])

# 학습 데이터에 스케일된 타겟 변수 추가
train_lib_scaled['future_acq_budget_settlement_scaled'] = train_y_scaled

# 검증 데이터에 대해 변환
valid_lib_scaled = pd.DataFrame(scaler.transform(valid_lib.drop(columns=["future_acq_budget_settlement"])), 
                                columns=train_lib.columns[:-1])
valid_y_scaled = scaler_y.transform(valid_lib[['future_acq_budget_settlement']])
valid_lib_scaled['future_acq_budget_settlement_scaled'] = valid_y_scaled

# 테스트 데이터에 대해 변환
test_lib_scaled = pd.DataFrame(scaler.transform(test_lib.drop(columns=["future_acq_budget_settlement"])), 
                               columns=train_lib.columns[:-1])
test_y_scaled = scaler_y.transform(test_lib[['future_acq_budget_settlement']])
test_lib_scaled['future_acq_budget_settlement_scaled'] = test_y_scaled

# X_train, Y_train 구성하기 (스케일된 타겟 사용)
X_train = train_lib_scaled.drop(columns=["future_acq_budget_settlement_scaled"])
Y_train = train_lib_scaled["future_acq_budget_settlement_scaled"]

# X_valid, Y_valid 구성하기 (스케일된 타겟 사용)
X_valid = valid_lib_scaled.drop(columns=["future_acq_budget_settlement_scaled"])
Y_valid = valid_lib_scaled["future_acq_budget_settlement_scaled"]

# X_test, Y_test 구성하기 (스케일된 타겟 사용)
X_test = test_lib_scaled.drop(columns=["future_acq_budget_settlement_scaled"])
Y_test = test_lib_scaled["future_acq_budget_settlement_scaled"]


In [6]:
import statsmodels.api as sm

In [10]:
X_df = sm.add_constant(X_train)

# 회귀 모델 피팅
model = sm.OLS(Y_train, X_df).fit()

# p-value 출력
print(model.summary())

# p-value가 0.05보다 작은 특성 추출
p_values = model.pvalues
significant_features = p_values[p_values < 0.001].index

print("유의미한 특성:")
print(significant_features)

                                     OLS Regression Results                                    
Dep. Variable:     future_acq_budget_settlement_scaled   R-squared:                       0.694
Model:                                             OLS   Adj. R-squared:                  0.693
Method:                                  Least Squares   F-statistic:                     513.4
Date:                                 Wed, 11 Sep 2024   Prob (F-statistic):               0.00
Time:                                         15:59:16   Log-Likelihood:                 8157.5
No. Observations:                                 8184   AIC:                        -1.624e+04
Df Residuals:                                     8147   BIC:                        -1.598e+04
Df Model:                                           36                                         
Covariance Type:                             nonrobust                                         
                            coef    std 

In [8]:
X_df_val = sm.add_constant(X_valid)

# 회귀 모델 피팅
model = sm.OLS(Y_valid, X_df_val).fit()

# p-value 출력
print(model.summary())

# p-value가 0.05보다 작은 특성 추출
p_values = model.pvalues
significant_features = p_values[p_values < 0.01].index

print("유의미한 특성:")
print(significant_features)

                                     OLS Regression Results                                    
Dep. Variable:     future_acq_budget_settlement_scaled   R-squared:                       0.574
Model:                                             OLS   Adj. R-squared:                  0.566
Method:                                  Least Squares   F-statistic:                     80.02
Date:                                 Wed, 11 Sep 2024   Prob (F-statistic):               0.00
Time:                                         15:25:33   Log-Likelihood:                 1749.7
No. Observations:                                 2178   AIC:                            -3425.
Df Residuals:                                     2141   BIC:                            -3215.
Df Model:                                           36                                         
Covariance Type:                             nonrobust                                         
                            coef    std 

In [9]:
X_df_test = sm.add_constant(X_test)

# 회귀 모델 피팅
model = sm.OLS(Y_test, X_df_test).fit()

# p-value 출력
print(model.summary())

# p-value가 0.05보다 작은 특성 추출
p_values = model.pvalues
significant_features = p_values[p_values < 0.001].index

print("유의미한 특성:")
print(significant_features)

                                     OLS Regression Results                                    
Dep. Variable:     future_acq_budget_settlement_scaled   R-squared:                       0.724
Model:                                             OLS   Adj. R-squared:                  0.720
Method:                                  Least Squares   F-statistic:                     165.8
Date:                                 Wed, 11 Sep 2024   Prob (F-statistic):               0.00
Time:                                         15:25:33   Log-Likelihood:                 2398.9
No. Observations:                                 2310   AIC:                            -4724.
Df Residuals:                                     2273   BIC:                            -4511.
Df Model:                                           36                                         
Covariance Type:                             nonrobust                                         
                            coef    std 