In [48]:
#Goal of this notebook: Given early engagement numbers (likes, shares, comments, diggs, upload hour, etc.) 
# → predict how many views the video will eventually reach.

import pandas as pd
import numpy as np

In [6]:
#reading the training data

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,url,digg_count,play_count,share_count,repost_count,collect_count,comment_count,video_id,author_id,duration,description,create_time,author_unique_id,location_created
0,https://www.tiktok.com/@zachking/video/1001169...,857800.0,1700000.0,476.0,0.0,335.0,1508.0,100116967235219456,6.86165e+16,0.0,When it's trash night at my house #dailylife,1464212460,zachking,
1,https://www.tiktok.com/@zachking/video/1164457...,1100000.0,2000000.0,637.0,0.0,467.0,1988.0,116445712837398528,6.86165e+16,15.0,#MamaSaid to always be a gentleman,1468105536,zachking,
2,https://www.tiktok.com/@zachking/video/1165721...,1000000.0,2100000.0,790.0,0.0,455.0,2683.0,116572195421646848,6.86165e+16,11.0,I've got #NoMoney ...oh wait,1468135692,zachking,
3,https://www.tiktok.com/@zachking/video/1185885...,1200000.0,2600000.0,3157.0,0.0,1616.0,7378.0,118588521136766976,6.86165e+16,13.0,#PikachuBeatDrop I found a pikachu,1468616422,zachking,
4,https://www.tiktok.com/@zachking/video/1214467...,806700.0,2300000.0,831.0,0.0,513.0,2287.0,121446730818019328,6.86165e+16,15.0,My heart will go on forever #badflutesong,1469297872,zachking,


In [7]:
#train_data information

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2060 entries, 0 to 2059
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   url               2060 non-null   object 
 1   digg_count        2057 non-null   float64
 2   play_count        2057 non-null   float64
 3   share_count       2057 non-null   float64
 4   repost_count      2057 non-null   float64
 5   collect_count     2057 non-null   float64
 6   comment_count     2057 non-null   float64
 7   video_id          2060 non-null   int64  
 8   author_id         2057 non-null   float64
 9   duration          2057 non-null   float64
 10  description       1970 non-null   object 
 11  create_time       2060 non-null   int64  
 12  author_unique_id  2057 non-null   object 
 13  location_created  1999 non-null   object 
dtypes: float64(8), int64(2), object(4)
memory usage: 225.4+ KB


In [9]:
#from here we can see that 'url, video_id, author_id, author_unique_id, location_created' is not needed for calculation
#also 'description' field is very important, however for the scope of this feature we won't be doing NLP, so we ignore it

In [10]:
df_clean = df[[
    "digg_count",
    "share_count",
    "repost_count",
    "collect_count",
    "comment_count",
    "duration",
    "create_time",
    "play_count"
]].copy()

In [13]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2060 entries, 0 to 2059
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   digg_count     2057 non-null   float64
 1   share_count    2057 non-null   float64
 2   repost_count   2057 non-null   float64
 3   collect_count  2057 non-null   float64
 4   comment_count  2057 non-null   float64
 5   duration       2057 non-null   float64
 6   create_time    2060 non-null   int64  
 7   play_count     2057 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 128.9 KB


In [14]:
#there are only few rows which have missing/null data, we will just remove them because it won't create a significant 
#difference with our result

In [16]:
df_clean = df_clean.dropna()

In [17]:
df_clean

<class 'pandas.core.frame.DataFrame'>
Index: 2057 entries, 0 to 2059
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   digg_count     2057 non-null   float64
 1   share_count    2057 non-null   float64
 2   repost_count   2057 non-null   float64
 3   collect_count  2057 non-null   float64
 4   comment_count  2057 non-null   float64
 5   duration       2057 non-null   float64
 6   create_time    2057 non-null   int64  
 7   play_count     2057 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 144.6 KB


In [24]:
#FEATURE ENGINEERING: create_time has the date time basically, but we want the hour it was posted (cause that's more crucial)

In [26]:
from datetime import datetime

df_clean["upload_hour"] = df_clean["create_time"].apply(
    lambda x: datetime.fromtimestamp(x).hour
)

df_clean = df_clean.drop("create_time", axis=1)


In [31]:
df_clean.info()
df_clean['upload_hour']

<class 'pandas.core.frame.DataFrame'>
Index: 2057 entries, 0 to 2059
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   digg_count     2057 non-null   float64
 1   share_count    2057 non-null   float64
 2   repost_count   2057 non-null   float64
 3   collect_count  2057 non-null   float64
 4   comment_count  2057 non-null   float64
 5   duration       2057 non-null   float64
 6   play_count     2057 non-null   float64
 7   upload_hour    2057 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 144.6 KB


0        6
1        8
2       16
3        6
4        3
        ..
2055     0
2056     8
2057    11
2058    20
2059    22
Name: upload_hour, Length: 2057, dtype: int64

In [32]:
#We are going to do ratio features (powerful feature engineering)
#Because they show quality of engagement, not just quantity.

#Examples:
# - A video with 100 comments on 1000 likes is strong.
# - A video with 10 comments on 10,000 likes is weak.

In [37]:
df_clean["like_per_comment"] = df_clean["digg_count"] / (df_clean["comment_count"] + 1)
df_clean["share_per_like"] = df_clean["share_count"] / (df_clean["digg_count"] + 1)

# High value → people like but don’t comment → low interaction
# Low value → comments are high relative to likes → high interaction

# High → people share the video more than they like it → viral content
# Low → people don't find it shareable

In [38]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2057 entries, 0 to 2059
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   digg_count        2057 non-null   float64
 1   share_count       2057 non-null   float64
 2   repost_count      2057 non-null   float64
 3   collect_count     2057 non-null   float64
 4   comment_count     2057 non-null   float64
 5   duration          2057 non-null   float64
 6   play_count        2057 non-null   float64
 7   upload_hour       2057 non-null   int64  
 8   like_per_comment  2057 non-null   float64
 9   share_per_like    2057 non-null   float64
dtypes: float64(9), int64(1)
memory usage: 176.8 KB


In [39]:
from sklearn.model_selection import train_test_split

X = df_clean.drop("play_count", axis=1)
y = df_clean["play_count"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#80% = used to learn weights
#20% = used to test how good it really is

In [45]:
#We choose linear Regression as the FIRST MODEL
#Linear regression learns a formula like: y = w1x1 + w2x2 + w3x3... + wnxn
#so we if we use sklearn linear regression library, we can sort of find the best weights of the parameters to minimize the prediction error

In [41]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [42]:
#model tries to guess the play_count of unseen videos
preds = model.predict(X_test)

In [43]:
from sklearn.metrics import r2_score, mean_squared_error

print("R2:", r2_score(y_test, preds))
print("MSE:", mean_squared_error(y_test, preds))

R2: 0.45514502488875885
MSE: 799270478980231.0


In [44]:
coef_df = pd.DataFrame({
    "feature": X.columns,
    "weight": model.coef_
}).sort_values("weight", ascending=False)

coef_df

Unnamed: 0,feature,weight
6,upload_hour,190590.8
5,duration,55039.64
7,like_per_comment,3905.132
1,share_count,53.35454
0,digg_count,12.46486
2,repost_count,-3.941582e-05
4,comment_count,-1.931972
3,collect_count,-28.48711
8,share_per_like,-80102480.0


In [46]:
# our model got 45.5% of the variation in TikTok views, which is not so great but it's a good starting point
# ofcourse there are a lot of hidden parameters exist, but using this current data we can improve it more. 
# which we will see in the next part of the notebook

In [47]:
#here's what our next plan is:

In [None]:
# 1) Improve Linear Regression
# (Log transform, normalization, Ridge regression)

# 2) Use powerful models
# (Random Forest, XGBoost)

# 3) Add text analysis (NLP on description)
# TF-IDF → Expect accuracy jump.

# 4) Visualize predictions and errors