## Using PyCaret for Like Prediction with Numerical/Categorical Features

In [1]:
!pip install --pre pycaret



## Data Preprocessing

### Read File

In [2]:
import pandas as pd
import numpy as np
train = pd.read_csv('../../raw_data/intern_homework_train_dataset.csv')
test = pd.read_csv('../../raw_data/intern_homework_public_test_dataset.csv')
train.head()

Unnamed: 0,title,created_at,like_count_1h,like_count_2h,like_count_3h,like_count_4h,like_count_5h,like_count_6h,comment_count_1h,comment_count_2h,comment_count_3h,comment_count_4h,comment_count_5h,comment_count_6h,forum_id,author_id,forum_stats,like_count_24h
0,我的排骨湯,2022-10-05 14:20:21 UTC,12,15,15,15,16,18,10,10,10,10,10,10,598518,428921,0.7,26
1,#請益 婚禮穿搭,2022-10-05 14:28:13 UTC,0,0,3,4,4,4,2,5,8,9,9,9,399302,650840,63.9,11
2,無謂的啦啦隊,2022-10-06 07:18:22 UTC,3,7,8,11,12,14,1,1,2,3,3,3,650776,717288,19.2,19
3,文學理論 課本,2022-09-20 11:39:14 UTC,2,7,11,24,26,26,2,2,8,32,38,63,471023,173889,7.9,29
4,一般課程,2022-09-05 10:18:24 UTC,3,7,7,10,10,11,15,26,35,38,48,49,230184,594332,36.2,16


In [3]:
train = train
test = test

### Feature Selection

挑選 Label

In [4]:
train_label = train['like_count_24h']
test_label = test['like_count_24h']
train_label

0        26
1        11
2        19
3        29
4        16
         ..
49995    11
49996     5
49997    13
49998    11
49999     5
Name: like_count_24h, Length: 50000, dtype: int64

挑選 Input

不考慮 文章標題 / 作者 ID / 看板 ID

In [5]:
# 指定要刪除的 column names，並使用 drop 函數將這些 column 刪除
drop_columns = ['author_id', 'forum_id', 'title']

train = train.drop(drop_columns, axis=1)
test = test.drop(drop_columns, axis=1)

### Data Transfromations

處理 created_by Feature

In [6]:
# 將文章發佈時間 拆成 星期幾 與 小時 的函數
def split_date(df, date_column):

    # 將 created_by 欄位轉換成日期格式
    df[date_column] = pd.to_datetime(df[date_column], utc=True)
    
    # 新增 星期幾 和 小時 欄位
    df['weekday'] = df[date_column].dt.weekday
    df['hour'] = df[date_column].dt.hour

    # 移除 created_by 欄位
    df = df.drop(date_column, axis=1)

    # 回傳處理過的資料集
    return df

In [7]:
train = split_date(train, 'created_at')
test = split_date(test, 'created_at')

# 顯示處理過的資料集
train.head()

Unnamed: 0,like_count_1h,like_count_2h,like_count_3h,like_count_4h,like_count_5h,like_count_6h,comment_count_1h,comment_count_2h,comment_count_3h,comment_count_4h,comment_count_5h,comment_count_6h,forum_stats,like_count_24h,weekday,hour
0,12,15,15,15,16,18,10,10,10,10,10,10,0.7,26,2,14
1,0,0,3,4,4,4,2,5,8,9,9,9,63.9,11,2,14
2,3,7,8,11,12,14,1,1,2,3,3,3,19.2,19,3,7
3,2,7,11,24,26,26,2,2,8,32,38,63,7.9,29,1,11
4,3,7,7,10,10,11,15,26,35,38,48,49,36.2,16,0,10


## Auto Sklearn Training

In [8]:
from pycaret.regression import *

s = setup(train, target = 'like_count_24h')

In [9]:
best = compare_models()

In [10]:
print(best)

ExtraTreesRegressor(n_jobs=-1, random_state=6523)


In [11]:
finalize_model(best)

In [12]:
predict_model(best, test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,21.7812,12566.024,112.0983,0.5518,0.6307,0.6752


Unnamed: 0,like_count_1h,like_count_2h,like_count_3h,like_count_4h,like_count_5h,like_count_6h,comment_count_1h,comment_count_2h,comment_count_3h,comment_count_4h,comment_count_5h,comment_count_6h,forum_stats,weekday,hour,like_count_24h,prediction_label
0,2,7,7,12,13,14,0,1,1,1,2,2,48.599998,5,12,16,26.32
1,1,1,1,1,3,4,18,18,23,26,27,28,389.100006,3,0,8,33.92
2,2,2,3,3,3,4,4,4,8,9,11,11,389.100006,2,7,8,17.00
3,0,0,1,2,3,3,0,5,8,14,16,16,32.700001,4,5,6,11.49
4,12,21,39,54,77,107,6,16,21,21,23,28,44.599998,6,4,211,286.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,5,8,8,8,8,8,1,1,1,1,1,1,88.199997,2,15,20,15.43
9996,3,6,12,13,14,15,5,7,8,11,11,11,50.599998,0,12,18,30.01
9997,2,2,2,3,3,3,2,2,3,3,3,3,72.900002,3,14,6,14.53
9998,0,0,1,1,2,2,0,0,0,0,0,0,1.500000,0,10,5,7.71


In [13]:
predictions = predict_model(best, test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,21.7812,12566.024,112.0983,0.5518,0.6307,0.6752
