In [4]:
#Goal of this notebook: Given early engagement numbers (likes, shares, comments, diggs, upload hour, etc.) 
# → predict how many views the video will eventually reach.

import pandas as pd
import numpy as np

In [5]:
#reading the training data

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,url,digg_count,play_count,share_count,repost_count,collect_count,comment_count,video_id,author_id,duration,description,create_time,author_unique_id,location_created
0,https://www.tiktok.com/@zachking/video/1001169...,857800.0,1700000.0,476.0,0.0,335.0,1508.0,100116967235219456,6.86165e+16,0.0,When it's trash night at my house #dailylife,1464212460,zachking,
1,https://www.tiktok.com/@zachking/video/1164457...,1100000.0,2000000.0,637.0,0.0,467.0,1988.0,116445712837398528,6.86165e+16,15.0,#MamaSaid to always be a gentleman,1468105536,zachking,
2,https://www.tiktok.com/@zachking/video/1165721...,1000000.0,2100000.0,790.0,0.0,455.0,2683.0,116572195421646848,6.86165e+16,11.0,I've got #NoMoney ...oh wait,1468135692,zachking,
3,https://www.tiktok.com/@zachking/video/1185885...,1200000.0,2600000.0,3157.0,0.0,1616.0,7378.0,118588521136766976,6.86165e+16,13.0,#PikachuBeatDrop I found a pikachu,1468616422,zachking,
4,https://www.tiktok.com/@zachking/video/1214467...,806700.0,2300000.0,831.0,0.0,513.0,2287.0,121446730818019328,6.86165e+16,15.0,My heart will go on forever #badflutesong,1469297872,zachking,


In [6]:
df_clean = df[[
    "digg_count",
    "share_count",
    "repost_count",
    "collect_count",
    "comment_count",
    "duration",
    "create_time",
    "play_count"
]].copy()

In [7]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2060 entries, 0 to 2059
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   digg_count     2057 non-null   float64
 1   share_count    2057 non-null   float64
 2   repost_count   2057 non-null   float64
 3   collect_count  2057 non-null   float64
 4   comment_count  2057 non-null   float64
 5   duration       2057 non-null   float64
 6   create_time    2060 non-null   int64  
 7   play_count     2057 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 128.9 KB


In [8]:
#there are only few rows which have missing/null data, we will just remove them because it won't create a significant 
#difference with our result

In [9]:
df_clean = df_clean.dropna()

In [10]:
# Data cleaned - ready for feature engineering
#FEATURE ENGINEERING: create_time has the date time basically, but we want the hour it was posted (cause that's more crucial)

In [11]:
from datetime import datetime

df_clean["upload_hour"] = df_clean["create_time"].apply(
    lambda x: datetime.fromtimestamp(x).hour
)

df_clean = df_clean.drop("create_time", axis=1)


In [12]:
df_clean.info()
df_clean['upload_hour']

<class 'pandas.core.frame.DataFrame'>
Index: 2057 entries, 0 to 2059
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   digg_count     2057 non-null   float64
 1   share_count    2057 non-null   float64
 2   repost_count   2057 non-null   float64
 3   collect_count  2057 non-null   float64
 4   comment_count  2057 non-null   float64
 5   duration       2057 non-null   float64
 6   play_count     2057 non-null   float64
 7   upload_hour    2057 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 144.6 KB


0        6
1        8
2       16
3        6
4        3
        ..
2055     0
2056     8
2057    11
2058    20
2059    22
Name: upload_hour, Length: 2057, dtype: int64

In [13]:
#We are going to do ratio features (powerful feature engineering)
#Because they show quality of engagement, not just quantity.

#Examples:
# - A video with 100 comments on 1000 likes is strong.
# - A video with 10 comments on 10,000 likes is weak.

In [14]:
df_clean["like_per_comment"] = df_clean["digg_count"] / (df_clean["comment_count"] + 1)
df_clean["share_per_like"] = df_clean["share_count"] / (df_clean["digg_count"] + 1)

# High value → people like but don’t comment → low interaction
# Low value → comments are high relative to likes → high interaction

# High → people share the video more than they like it → viral content
# Low → people don't find it shareable

In [15]:
df_clean = df_clean.drop("repost_count", axis=1)

In [16]:
from sklearn.model_selection import train_test_split

X = df_clean.drop("play_count", axis=1)

# Log transform target to make distribution more normal
y_log = np.log1p(df_clean["play_count"])
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

In [17]:
# Scale features for better model performance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


In [18]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [19]:
# XG BOOOOOOOOOOST - best model for tabular data 
# similar to Random forest tree, however every small decision tree learns from its mistake made by the previous

In [20]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [27]:
# 1. Create the model
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

In [29]:

# 2. Train
xgb.fit(X_train_scaled, y_train_log)

# 3. Predict in log space
preds_log = xgb.predict(X_test_scaled)

# 4. Convert back to original space
preds = np.expm1(preds_log)
actual = np.expm1(y_test_log)

# 5. Evaluate (log space is the real metric)
print("R2 (log space):", r2_score(y_test_log, preds_log))
print("MSE (log space):", mean_squared_error(y_test_log, preds_log))

# Optional: original scale
print("R2 (original scale):", r2_score(actual, preds))

R2 (log space): 0.895263277435608
MSE (log space): 0.129052467343924
R2 (original scale): 0.44849467004454213


In [None]:
# will try to fine tune the XGBoost by finding the optimal parameters

In [34]:
from sklearn.model_selection import RandomizedSearchCV
param_grid = {
    "n_estimators": [300, 500, 800],
    "max_depth": [3, 4, 5, 6, 7],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.7, 1.0],
    "reg_alpha": [0, 0.1, 0.5],
    "reg_lambda": [1, 1.5, 2]
}

In [31]:
xgb_base = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    tree_method="auto"
)

In [35]:
xgb_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid,
    n_iter=20,                # number of random combinations
    scoring="r2",
    cv=3,                    # 3-fold cross validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [36]:
xgb_search.fit(X_train_scaled, y_train_log)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,param_distributions,"{'colsample_bytree': [0.5, 0.7, ...], 'learning_rate': [0.01, 0.03, ...], 'max_depth': [3, 4, ...], 'n_estimators': [300, 500, ...], ...}"
,n_iter,20
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,
,enable_categorical,False


In [37]:
print("Best Parameters:", xgb_search.best_params_)
print("Best CV Score (log-R2):", xgb_search.best_score_)

Best Parameters: {'subsample': 0.6, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
Best CV Score (log-R2): 0.8936172993669782


In [38]:
best_xgb = xgb_search.best_estimator_

preds_log = best_xgb.predict(X_test_scaled)

# convert back to original scale
preds = np.expm1(preds_log)
actual = np.expm1(y_test_log)

print("R2 (log space):", r2_score(y_test_log, preds_log))
print("MSE (log space):", mean_squared_error(y_test_log, preds_log))
print("R2 (original space):", r2_score(actual, preds))

R2 (log space): 0.9010838061973746
MSE (log space): 0.1218806408864897
R2 (original space): 0.5033297195846135
