In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

def load_and_concatenate(files):
    """Read every Excel file in ``files`` and stack them into one DataFrame.

    Each row is tagged with the path it was read from in a ``'Source File'``
    column, and the combined frame is given a fresh 0..n-1 index.
    """
    frames = []
    for path in files:
        frame = pd.read_excel(path)
        frames.append(frame.assign(**{'Source File': path}))
    return pd.concat(frames, ignore_index=True)


def extract_dates_from_filenames(filenames, dataframe):
    """Return one date per row of ``dataframe``, parsed from its source filename.

    The first run of eight digits in each name in ``filenames`` is parsed as
    ``YYYYMMDD``. Every row is then looked up by its own ``'Source File'``
    value, so the result lines up with the rows whatever order they are in.

    Args:
        filenames: File names/paths that may contain an 8-digit date.
        dataframe: Frame with a ``'Source File'`` column holding those names.

    Returns:
        A list with ``len(dataframe)`` entries: a ``datetime`` for rows whose
        file name carries a date, ``pd.NaT`` otherwise (including rows whose
        source file is not listed in ``filenames``).

    Raises:
        ValueError: If the eight digits found do not form a valid date.
    """
    date_by_file = {}
    for filename in filenames:
        match = re.search(r'\d{8}', filename)
        # NaT rather than np.nan: a column holding only np.nan is float-typed
        # and the ``.dt`` accessor used on 'Upload Date' would fail on it.
        date_by_file[filename] = (
            datetime.strptime(match.group(), '%Y%m%d') if match else pd.NaT
        )
    return [date_by_file.get(source, pd.NaT) for source in dataframe['Source File']]

# Excel snapshots assigned to each split; the 8 digits in every name are parsed
# as a YYYYMMDD date further down.
train_files = [
    'dataset/updated_trending_videos_final_20240806.xlsx',
    'dataset/updated_trending_videos_final_20240811.xlsx',
    'dataset/updated_trending_videos_final_20240815.xlsx',
    'dataset/updated_trending_videos_final_20240819.xlsx',
]
valid_files = [
    'dataset/updated_trending_videos_final_20240807.xlsx',
    'dataset/updated_trending_videos_final_20240812.xlsx',
]
test_files = [
    'dataset/updated_trending_videos_final_20240810.xlsx',
    'dataset/updated_trending_videos_final_20240816.xlsx',
]

# Load each split into a single DataFrame
train_df, valid_df, test_df = (
    load_and_concatenate(paths) for paths in (train_files, valid_files, test_files)
)

# Attach the date parsed from each row's source filename
for paths, frame in (
    (train_files, train_df),
    (valid_files, valid_df),
    (test_files, test_df),
):
    frame['Upload Date'] = extract_dates_from_filenames(paths, frame)


# Duration Category: bucket the video length
def categorize_duration(duration):
    """Map a video length to 'Short', 'Medium' or 'Long'.

    Args:
        duration: Video length; the thresholds 300 and 1200 correspond to
            5 and 20 minutes.

    Returns:
        'Short' below 300, 'Medium' from 300 up to (not including) 1200,
        'Long' from 1200 upwards, and ``np.nan`` when the length is missing.
    """
    # A missing length fails both comparisons below and would otherwise be
    # labelled 'Long'; keep it missing instead.
    if pd.isna(duration):
        return np.nan
    if duration < 300:  # under 5 minutes
        return 'Short'
    if duration < 1200:  # 5 minutes up to 20 minutes
        return 'Medium'
    return 'Long'  # 20 minutes or more

# Derived columns, added to every split in the same order:
#   Duration Category       - length bucket from categorize_duration
#   Duration to Views Ratio - video length divided by view count
#   Likes to Views Ratio    - likes divided by view count
# NOTE(review): both ratios are computed from 'Views', which is later used as
# the prediction target - confirm that this leakage is intended.
for df in (train_df, valid_df, test_df):
    df['Duration Category'] = df['Duration'].apply(categorize_duration)

    # A view count of 0 would give +/-inf, which the scaler applied later
    # rejects. Treat it as missing so the row is removed by the dropna step.
    views = df['Views'].replace(0, np.nan)
    df['Duration to Views Ratio'] = df['Duration'] / views
    df['Likes to Views Ratio'] = df['Likes'] / views

# Weekday/Weekend: whether the date falls on a weekday or on a weekend
def get_weekday_or_weekend(upload_date):
    """Label a date as 'Weekend' (Saturday/Sunday) or 'Weekday'.

    Missing dates (anything ``pd.isna`` accepts) yield ``np.nan``.
    """
    if not pd.isna(upload_date):
        # weekday(): Monday is 0, so 5 and 6 are Saturday and Sunday
        is_weekend = upload_date.weekday() in (5, 6)
        return 'Weekend' if is_weekend else 'Weekday'
    return np.nan

for frame in (train_df, valid_df, test_df):
    frame['Weekday/Weekend'] = frame['Upload Date'].apply(get_weekday_or_weekend)

# Day of Week: name of the day the date falls on
for frame in (train_df, valid_df, test_df):
    frame['Day of Week'] = frame['Upload Date'].dt.day_name()

# The source path was only needed to derive the date
train_df, valid_df, test_df = (
    frame.drop(columns=['Source File']) for frame in (train_df, valid_df, test_df)
)


# Show the first rows of every split
for heading, frame in (
    ("Train Data:", train_df),
    ("\nValidation Data:", valid_df),
    ("\nTest Data:", test_df),
):
    print(heading)
    print(frame.head())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Histogram of the raw view counts in the training split (missing values skipped)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(train_df['Views'].dropna(), bins=20, edgecolor='k')
ax.set_title('Distribution of Views')
ax.set_xlabel('Views')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Remove outliers with the IQR rule
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value is not an IQR outlier.

    A value is kept when it lies within ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
    (both ends inclusive), where Q1/Q3 are the 25th/75th percentiles of
    ``df[column]``. The original index of the kept rows is preserved.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    margin = 1.5 * (q3 - q1)
    in_range = values.between(q1 - margin, q3 + margin)
    return df[in_range]

cleaned_train_df = remove_outliers(train_df, 'Views')

# Histogram of the training-split view counts after outlier removal
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(cleaned_train_df['Views'], bins=20, edgecolor='k')
ax.set_title('Distribution of Views for train data(Outliers Removed)')
ax.set_xlabel('Views')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
cleaned_valid_df = remove_outliers(valid_df, 'Views')

# Histogram of the validation-split view counts after outlier removal
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(cleaned_valid_df['Views'], bins=20, edgecolor='k')
ax.set_title('Distribution of Views for valid data(Outliers Removed)')
ax.set_xlabel('Views')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
cleaned_test_df = remove_outliers(test_df, 'Views')

# Histogram of the test-split view counts after outlier removal
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(cleaned_test_df['Views'], bins=20, edgecolor='k')
ax.set_title('Distribution of Views for test data(Outliers Removed)')
ax.set_xlabel('Views')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Sort ascending by views first so that, among rows sharing a Video ID,
# keep='first' retains the row with the fewest views. The de-duplication has
# to run on the sorted frame, not on cleaned_train_df itself.
train_df = (
    cleaned_train_df
    .sort_values('Views')
    .drop_duplicates(subset='Video ID', keep='first')
)

train_df.info()

In [None]:
# Sort ascending by views first so that, among rows sharing a Video ID,
# keep='first' retains the row with the fewest views. The de-duplication has
# to run on the sorted frame, not on cleaned_valid_df itself.
valid_df = (
    cleaned_valid_df
    .sort_values('Views')
    .drop_duplicates(subset='Video ID', keep='first')
)

valid_df.info()

In [None]:
# Sort ascending by views first so that, among rows sharing a Video ID,
# keep='first' retains the row with the fewest views. The de-duplication has
# to run on the sorted frame, not on cleaned_test_df itself.
test_df = (
    cleaned_test_df
    .sort_values('Views')
    .drop_duplicates(subset='Video ID', keep='first')
)

test_df.info()

In [31]:
# Video ID and Title are expected to have little influence, so they are
# dropped together with the Upload Date column.
drop_cols = ['Video ID', 'Title','Upload Date']

train_df_num, valid_df_num, test_df_num = (
    frame.drop(columns=drop_cols) for frame in (train_df, valid_df, test_df)
)

In [32]:
# Discard every row that has a missing value in any column
train_df_num = train_df_num.dropna(axis=0)
valid_df_num = valid_df_num.dropna(axis=0)
test_df_num = test_df_num.dropna(axis=0)

In [33]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Numeric columns that get standardised
numerical_cols = ['Likes', 'Duration', 'Duration to Views Ratio', 'Likes to Views Ratio']

# Categorical columns that get one-hot encoded
categorical_cols = ['Category Name', 'Duration Category', 'Weekday/Weekend', 'Day of Week']

train_df_encoded = pd.get_dummies(train_df_num, columns=categorical_cols)

# Align validation/test to the training columns: indicator columns missing
# from a split are added as 0, columns unknown to training are discarded.
valid_df_encoded = pd.get_dummies(valid_df_num, columns=categorical_cols).reindex(
    columns=train_df_encoded.columns, fill_value=0
)
test_df_encoded = pd.get_dummies(test_df_num, columns=categorical_cols).reindex(
    columns=train_df_encoded.columns, fill_value=0
)

# Fit the scaler on the training split only, then apply it to all three
# splits; only the numeric columns are replaced in each copy.
scaler = StandardScaler()
scaler.fit(train_df_encoded[numerical_cols])

train_df_scaled = train_df_encoded.copy()
train_df_scaled[numerical_cols] = scaler.transform(train_df_encoded[numerical_cols])

valid_df_scaled = valid_df_encoded.copy()
valid_df_scaled[numerical_cols] = scaler.transform(valid_df_encoded[numerical_cols])

test_df_scaled = test_df_encoded.copy()
test_df_scaled[numerical_cols] = scaler.transform(test_df_encoded[numerical_cols])



In [34]:
# Split every scaled frame into the feature matrix X and the target y ('Views')
X_train, y_train = train_df_scaled.drop(columns=['Views']), train_df_scaled['Views']
X_valid, y_valid = valid_df_scaled.drop(columns=['Views']), valid_df_scaled['Views']
X_test, y_test = test_df_scaled.drop(columns=['Views']), test_df_scaled['Views']

In [35]:
# Use only the feature columns shared by the training and validation matrices
common_cols = X_train.columns.intersection(X_valid.columns)

X_train_common, X_valid_common, X_test_common = (
    features[common_cols] for features in (X_train, X_valid, X_test)
)

In [None]:
# Show the feature columns selected for the models
print(common_cols)

In [None]:
# 평가지표 출력을 위한 패키지 불러오기
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score 

# Table that collects one row of metrics per (model, data split). Its columns
# must match the 7-value rows written by print_reg_result below.
result_df = pd.DataFrame(columns = ["Model", "Data Type", "MSE", "RMSE", "MAE", "SMAPE", "R2"])

def print_reg_result(model_name, data_type, valid_y, valid_pred):
    """Compute regression metrics and append them as a new row of ``result_df``.

    Despite its name this function prints nothing; it only writes to the
    module-level ``result_df``.

    Args:
        model_name: Label stored in the "Model" column.
        data_type: Label stored in the "Data Type" column (e.g. the split name).
        valid_y: True target values.
        valid_pred: Predicted values, same length as ``valid_y``.

    The SMAPE recorded here is ``mean(|y - pred| / (|y| + |pred|)) * 100``,
    i.e. the variant without the factor 2 in the denominator, so it ranges
    from 0 to 100. It is NaN for elements where both values are 0.
    """
    MSE = mean_squared_error(valid_y, valid_pred)
    RMSE = root_mean_squared_error(valid_y, valid_pred)
    MAE = mean_absolute_error(valid_y, valid_pred)
    SMAPE = np.mean((np.abs(valid_y - valid_pred))/(np.abs(valid_y) + np.abs(valid_pred)))*100
    R2 = r2_score(valid_y, valid_pred)
    # Append at the next integer position
    result_df.loc[len(result_df)] = [model_name, data_type, MSE, RMSE, MAE, SMAPE, R2]

In [None]:
# 트리 기반의 회귀 알고리즘 패키지 불러오기
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Declare the models with the new hyperparameters
RF_reg = RandomForestRegressor(
    max_depth=36,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=3,
    n_estimators=322,
)
ET_reg = ExtraTreesRegressor(
    max_depth=30,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=416,
)
CAT_reg = CatBoostRegressor(
    border_count=124,
    depth=4,
    iterations=136,
    l2_leaf_reg=8,
    learning_rate=0.1092250914011541,
)
LGBM_reg = LGBMRegressor(
    learning_rate=0.034478254120072105,
    max_depth=16,
    min_child_samples=4,
    n_estimators=222,
    num_leaves=39,
    subsample=0.8599855723111061,
)
XGB_reg = XGBRegressor(
    colsample_bytree=0.6464290562027665,
    learning_rate=0.023800792606525824,
    max_depth=3,
    min_child_weight=4,
    n_estimators=253,
    subsample=0.6068644407327001,
)

In [None]:
# (display name, estimator) pairs, evaluated in this order
model_list = [
    ('Random Forest', RF_reg),
    ('Extra Trees', ET_reg),
    ('Cat Boost', CAT_reg),
    ('Light GBM', LGBM_reg),
    ('XG Boost', XGB_reg),
]

# Start from an empty metrics table so re-running the cell does not pile up rows
result_df = pd.DataFrame(columns=["Model", "Data Type", "MSE", "RMSE", "MAE", "SMAPE", "R2"])

for model_name, model in model_list:
    # Fit on the whole training split
    model.fit(X_train_common, y_train)

    # Predict on the training, validation and test splits
    train_pred, valid_pred, test_pred = (
        model.predict(features)
        for features in (X_train_common, X_valid_common, X_test_common)
    )

    # Record the metrics of each split as one row of result_df
    for data_type, target, predicted in (
        ('Train', y_train, train_pred),
        ('Validation', y_valid, valid_pred),
        ('Test', y_test, test_pred),
    ):
        print_reg_result(model_name, data_type, target, predicted)

In [None]:
# Show the collected metrics for every model and data split
print(result_df)

In [None]:
# Compute the feature importances of the XGBoost model (XGB_reg)
train_pred = XGB_reg.predict(X_train_common)
valid_pred = XGB_reg.predict(X_valid_common)
test_pred = XGB_reg.predict(X_test_common)

# One row per feature: the feature name becomes a regular column after
# reset_index, and the two columns are then given readable names.
feature_imp_df = pd.DataFrame(XGB_reg.feature_importances_, index = X_train_common.columns)
feature_imp_df.reset_index(inplace = True)
feature_imp_df.columns = ["Youtube Factor", "Feature Importance"]

In [None]:
# Sort the YouTube factors by feature importance, highest first
feature_imp_df = feature_imp_df.sort_values("Feature Importance", ascending = False)

In [None]:
# Normalise the importances so they sum to 1, then add a column holding their
# running total as the third column (position 2).
feature_imp_df["Feature Importance"] = feature_imp_df["Feature Importance"]/feature_imp_df["Feature Importance"].sum()
cum_feature_imp = feature_imp_df["Feature Importance"].cumsum()
feature_imp_df.insert(2, "Cumsum of Feature Importance", cum_feature_imp)

In [None]:
# Recompute the running total of the feature importances, overwriting the
# "Cumsum of Feature Importance" column in the current row order.
feature_imp_df["Cumsum of Feature Importance"] = feature_imp_df["Feature Importance"].cumsum()

In [None]:
# Renumber the rows 0..n-1 and discard the previous index
feature_imp_df.reset_index(drop=True, inplace=True)

In [None]:
# Display the final feature-importance table
feature_imp_df

In [None]:
# Visualise the relative importance of each feature (Feature Importance Plot)
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the 20 most important features
top_20_features = feature_imp_df.nlargest(20, 'Feature Importance')

# Draw a horizontal bar chart: one bar per feature
fig = plt.figure(figsize=(10, 8))
sns.barplot(y=top_20_features["Youtube Factor"], x=top_20_features["Feature Importance"], palette="YlGnBu")
plt.title("Top 20 Feature Importance of XG Boost")
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.show()

## Summary Plot & Force Plot 시각화

### Random Forest

In [None]:
import shap

explainer = shap.TreeExplainer(RF_reg)
shap_values = explainer.shap_values(X_test_common)

In [None]:
shap.summary_plot(shap_values, X_test, max_display=10)
plt.show()

In [None]:
shap.force_plot(explainer.expected_value, shap_values.values[6], X_test.iloc[0], matplotlib=True, figsize=(45, 5))
plt.show()

### Extra Trees

In [None]:
explainer = shap.TreeExplainer(ET_reg)
shap_values = explainer.shap_values(X_test)

In [None]:
shap.summary_plot(shap_values, X_test, max_display=10)
plt.show()

In [None]:
shap.force_plot(explainer.expected_value, shap_values.values[6], X_test.iloc[0], matplotlib=True, figsize=(45, 5))
plt.show()

### CatBoost

In [None]:
explainer = shap.TreeExplainer(CAT_reg)
shap_values = explainer.shap_values(X_test)

In [None]:
shap.summary_plot(shap_values, X_test, max_display=10)
plt.show()

In [None]:
shap.force_plot(explainer.expected_value, shap_values.values[6], X_test.iloc[0], matplotlib=True, figsize=(45, 5))
plt.show()

### LightGBM

In [None]:
explainer = shap.TreeExplainer(LGBM_reg)
shap_values = explainer.shap_values(X_test)

In [None]:
shap.summary_plot(shap_values, X_test, max_display=10)
plt.show()

In [None]:
shap.force_plot(explainer.expected_value, shap_values.values[6], X_test.iloc[0], matplotlib=True, figsize=(45, 5))
plt.show()

### XGBoost

In [None]:
explainer = shap.TreeExplainer(XGB_reg)
shap_values = explainer.shap_values(X_test)

In [None]:
shap.summary_plot(shap_values, X_test, max_display=10)
plt.show()

In [None]:
shap.force_plot(explainer.expected_value, shap_values.values[6], X_test.iloc[0], matplotlib=True, figsize=(45, 3))
plt.show()