## Using Auto-Sklearn for Like Prediction with Numerical/Categorical Features

In [1]:
!pip install auto-sklearn



## Data Preprocessing

### Read File

In [2]:
import pandas as pd
import numpy as np
# Read the training split and the public test split from the raw_data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../raw_data/intern_homework_train_dataset.csv',
        '../../raw_data/intern_homework_public_test_dataset.csv',
    )
)

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,title,created_at,like_count_1h,like_count_2h,like_count_3h,like_count_4h,like_count_5h,like_count_6h,comment_count_1h,comment_count_2h,comment_count_3h,comment_count_4h,comment_count_5h,comment_count_6h,forum_id,author_id,forum_stats,like_count_24h
0,我的排骨湯,2022-10-05 14:20:21 UTC,12,15,15,15,16,18,10,10,10,10,10,10,598518,428921,0.7,26
1,#請益 婚禮穿搭,2022-10-05 14:28:13 UTC,0,0,3,4,4,4,2,5,8,9,9,9,399302,650840,63.9,11
2,無謂的啦啦隊,2022-10-06 07:18:22 UTC,3,7,8,11,12,14,1,1,2,3,3,3,650776,717288,19.2,19
3,文學理論 課本,2022-09-20 11:39:14 UTC,2,7,11,24,26,26,2,2,8,32,38,63,471023,173889,7.9,29
4,一般課程,2022-09-05 10:18:24 UTC,3,7,7,10,10,11,15,26,35,38,48,49,230184,594332,36.2,16


In [3]:
# NOTE(review): both statements rebind a name to itself, so this cell has no effect.
# Presumably a leftover from an earlier subsetting/sampling step — confirm, then delete the cell.
train = train
test = test

### Feature Selection

挑選 Label

In [4]:
# The regression target is the `like_count_24h` column; take it from both splits.
train_label, test_label = train['like_count_24h'], test['like_count_24h']

# Display the training target as a sanity check.
train_label

0        26
1        11
2        19
3        29
4        16
         ..
49995    11
49996     5
49997    13
49998    11
49999     5
Name: like_count_24h, Length: 50000, dtype: int64

挑選 Input

不考慮 文章標題 / 作者 ID / 看板 ID

In [5]:
# Columns removed from the feature matrix: the target itself plus the
# title, author ID and forum ID, which this notebook does not use as inputs.
drop_columns = ['author_id', 'like_count_24h', 'forum_id', 'title']

# `drop` returns new frames, so `train` and `test` keep all their columns.
train_input = train.drop(columns=drop_columns)
test_input = test.drop(columns=drop_columns)

### Data Transformations

處理 created_at Feature

In [6]:
# Split a post's publication timestamp into weekday and hour features.
def split_date(df, date_column):
    """Replace a timestamp column with ``weekday`` and ``hour`` features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_column``. It is left unmodified.
    date_column : str
        Name of the column holding timestamps parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``date_column`` and with two columns appended:
        ``weekday`` (Monday=0 ... Sunday=6) and ``hour`` (0-23), both
        computed after converting the timestamps to UTC.
    """
    # Parse into a local Series instead of writing back into `df`, so the
    # caller's frame keeps its original column and gains no extra ones.
    timestamps = pd.to_datetime(df[date_column], utc=True)

    # Drop the raw timestamp and append the derived features in one step;
    # `drop` and `assign` both return new frames.
    return df.drop(columns=date_column).assign(
        weekday=timestamps.dt.weekday,
        hour=timestamps.dt.hour,
    )

In [7]:
# Derive the weekday/hour features from `created_at` for both splits.
train_input = split_date(df=train_input, date_column='created_at')
test_input = split_date(df=test_input, date_column='created_at')

# Preview the transformed training features.
train_input.head()

Unnamed: 0,like_count_1h,like_count_2h,like_count_3h,like_count_4h,like_count_5h,like_count_6h,comment_count_1h,comment_count_2h,comment_count_3h,comment_count_4h,comment_count_5h,comment_count_6h,forum_stats,weekday,hour
0,12,15,15,15,16,18,10,10,10,10,10,10,0.7,2,14
1,0,0,3,4,4,4,2,5,8,9,9,9,63.9,2,14
2,3,7,8,11,12,14,1,1,2,3,3,3,19.2,3,7
3,2,7,11,24,26,26,2,2,8,32,38,63,7.9,1,11
4,3,7,7,10,10,11,15,26,35,38,48,49,36.2,0,10


## Auto Sklearn Training

In [8]:
import autosklearn.regression
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error

In [10]:
# Convert the training features and target to NumPy arrays.
X_train_np, y_train_np = train_input.to_numpy(), train_label.to_numpy()

In [13]:
# Display the training target array as a quick sanity check.
y_train_np

array([26, 11, 19, ..., 13, 11,  5])

In [23]:
# Configure the AutoML search with its two time budgets (both in seconds).
reg = autosklearn.regression.AutoSklearnRegressor(
    per_run_time_limit=360,        # cap for a single model evaluation
    time_left_for_this_task=3600,  # cap for the whole search
)

# Run the search on the NumPy training data; as the last expression,
# the fitted estimator is also shown as the cell output.
reg.fit(X=X_train_np, y=y_train_np)



AutoSklearnRegressor(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                     per_run_time_limit=360)

In [24]:
# `leaderboard()` returns a pandas DataFrame; leaving it as the cell's last
# expression renders it as a table instead of the plain-text `print` form.
reg.leaderboard()

          rank  ensemble_weight               type      cost    duration
model_id                                                                
12           1             0.28        extra_trees  0.230904  234.825906
10           2             0.30     ard_regression  0.234016    1.663405
107          3             0.08     ard_regression  0.238049    0.650381
46           4             0.20     ard_regression  0.241541    0.980115
15           5             0.14  gradient_boosting  0.292709    2.717881


In [27]:
# The model was fitted on NumPy arrays, so pass the test features in the
# same form rather than as a DataFrame; this keeps the input type seen by
# the estimator consistent between fit and predict.
X_test_np = test_input.to_numpy()
y_test_np = test_label.to_numpy()

pred = reg.predict(X_test_np)

# scikit-learn returns MAPE as a fraction, not a percentage
# (e.g. 0.72 means an average relative error of 72%).
MAPE = mean_absolute_percentage_error(y_test_np, pred)
print("MAPE:", MAPE)

MAPE: 0.7175264923431005


In [33]:
# Display the ground-truth test labels for comparison with the predictions.
test_label.to_numpy()

array([16,  8,  8, ...,  6,  5, 19])

In [34]:
# Display the values returned by reg.predict for the test set.
pred

array([32.26272988, 32.62079746, 19.24459457, ..., 12.25134993,
        6.7863678 , 27.35593581])