In [24]:
import ast

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

### Step 1: Load the Data

In [25]:
# Read the feature table into the working DataFrame `df`; ';' is passed as the field separator.
csv_path = 'data/videos_featured.csv'
df = pd.read_csv(csv_path, sep=';')

### Step 2: Basic Checks

In [26]:
df.head() # Preview the first 5 rows to sanity-check that the file parsed into the expected columns

Unnamed: 0,video_id,title,published_at,views,likes,comments,tags,duration_seconds,published_us_est,published_year,published_month,published_quarter,published_weekday,published_hour,like_ratio,comment_ratio,published_time_of_day
0,FGC0cCAgGu0,Twitter making me tear up over here 🥹,2024-02-23 13:34:27,7502,305,12,[],16,2024-02-23 08:34:27,2024,2,Q1,Friday,8,0.0407,0.0016,Morning
1,UOBTLzWY1vs,#DataAnalyst #AnalystBuilder #SQL,2024-03-01 13:43:29,21921,1515,20,[],38,2024-03-01 08:43:29,2024,3,Q1,Friday,8,0.0691,0.0009,Morning
2,k8nBWL6K884,What is Healthcare Analytics?,2025-02-18 13:01:11,26399,1303,79,"['Data Analyst', 'Data Analyst job', 'Data Ana...",656,2025-02-18 08:01:11,2025,2,Q1,Tuesday,8,0.0494,0.003,Morning
3,r5512UY3MTc,1 Million Subscriber Livestream!! Giveaways + ...,2025-03-05 17:07:10,10268,675,68,"['Data Analyst', 'Data Analyst job', 'Data Ana...",6014,2025-03-05 12:07:10,2025,3,Q1,Wednesday,12,0.0657,0.0066,Afternoon
4,XuOcmjIbFGg,"Taking a look at Real Healthcare Data | ICD11,...",2025-03-04 13:00:44,8534,402,27,"['Data Analyst', 'Data Analyst job', 'Data Ana...",916,2025-03-04 08:00:44,2025,3,Q1,Tuesday,8,0.0471,0.0032,Morning


In [27]:
df.shape # (rows, columns) of the loaded table

(366, 17)

In [28]:
df.columns # Column names available for the analysis

Index(['video_id', 'title', 'published_at', 'views', 'likes', 'comments',
       'tags', 'duration_seconds', 'published_us_est', 'published_year',
       'published_month', 'published_quarter', 'published_weekday',
       'published_hour', 'like_ratio', 'comment_ratio',
       'published_time_of_day'],
      dtype='object')

In [29]:
df.dtypes # Data type of each column (date columns load as plain `object` strings at this point)

video_id                  object
title                     object
published_at              object
views                      int64
likes                      int64
comments                   int64
tags                      object
duration_seconds           int64
published_us_est          object
published_year             int64
published_month            int64
published_quarter         object
published_weekday         object
published_hour             int64
like_ratio               float64
comment_ratio            float64
published_time_of_day     object
dtype: object

In [30]:
df.info() # Per-column non-null counts and dtypes, to spot missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   video_id               366 non-null    object 
 1   title                  366 non-null    object 
 2   published_at           366 non-null    object 
 3   views                  366 non-null    int64  
 4   likes                  366 non-null    int64  
 5   comments               366 non-null    int64  
 6   tags                   366 non-null    object 
 7   duration_seconds       366 non-null    int64  
 8   published_us_est       366 non-null    object 
 9   published_year         366 non-null    int64  
 10  published_month        366 non-null    int64  
 11  published_quarter      366 non-null    object 
 12  published_weekday      366 non-null    object 
 13  published_hour         366 non-null    int64  
 14  like_ratio             366 non-null    float64
 15  commen

### Step 3: Metrics Normalization

In [None]:
# Parse `published_at` into timezone-aware UTC timestamps
df['published_at'] = pd.to_datetime(df['published_at'], utc=True)
extract_day = pd.Timestamp('2025-06-19', tz='UTC') # The day that videos are extracted is 2025-06-19

# Normalize metrics by video age so that older and newer videos are comparable.
# The age is floored at 1 day so the divisions below never divide by zero.
df['days_since_publish'] = (extract_day - df['published_at']).dt.days.clip(lower=1) # type: ignore
df['views_per_day'] = df['views'] / df['days_since_publish']
df['likes_per_day'] = df['likes'] / df['days_since_publish']


# Label duration buckets.
# Bins are left-closed -- [0, 600), [600, 1800), [1800, 3600), [3600, inf) -- which matches
# the bucket definition used later when the duration categories are plotted.
# The last edge is open-ended so that no video, however long, falls outside the bins
# and becomes NaN.
df['duration_bucket'] = pd.cut(
    df['duration_seconds'],
    bins=[0, 600, 1800, 3600, float('inf')],
    labels=['Short', 'Medium', 'Long', 'Very Long'],
    right=False,
)

In [32]:
# Confirm the derived columns (days_since_publish, views_per_day, likes_per_day, duration_bucket) were added
df.head()

Unnamed: 0,video_id,title,published_at,views,likes,comments,tags,duration_seconds,published_us_est,published_year,...,published_quarter,published_weekday,published_hour,like_ratio,comment_ratio,published_time_of_day,days_since_publish,views_per_day,likes_per_day,duration_bucket
0,FGC0cCAgGu0,Twitter making me tear up over here 🥹,2024-02-23 13:34:27+00:00,7502,305,12,[],16,2024-02-23 08:34:27,2024,...,Q1,Friday,8,0.0407,0.0016,Morning,481,15.596674,0.634096,Short
1,UOBTLzWY1vs,#DataAnalyst #AnalystBuilder #SQL,2024-03-01 13:43:29+00:00,21921,1515,20,[],38,2024-03-01 08:43:29,2024,...,Q1,Friday,8,0.0691,0.0009,Morning,474,46.246835,3.196203,Short
2,k8nBWL6K884,What is Healthcare Analytics?,2025-02-18 13:01:11+00:00,26399,1303,79,"['Data Analyst', 'Data Analyst job', 'Data Ana...",656,2025-02-18 08:01:11,2025,...,Q1,Tuesday,8,0.0494,0.003,Morning,120,219.991667,10.858333,Medium
3,r5512UY3MTc,1 Million Subscriber Livestream!! Giveaways + ...,2025-03-05 17:07:10+00:00,10268,675,68,"['Data Analyst', 'Data Analyst job', 'Data Ana...",6014,2025-03-05 12:07:10,2025,...,Q1,Wednesday,12,0.0657,0.0066,Afternoon,105,97.790476,6.428571,Very Long
4,XuOcmjIbFGg,"Taking a look at Real Healthcare Data | ICD11,...",2025-03-04 13:00:44+00:00,8534,402,27,"['Data Analyst', 'Data Analyst job', 'Data Ana...",916,2025-03-04 08:00:44,2025,...,Q1,Tuesday,8,0.0471,0.0032,Morning,106,80.509434,3.792453,Medium


### Step 4: Descriptive Statistics
#### Statistics for numerical data

In [33]:
df.describe() # Count, mean, std, min, quartiles and max for each numeric column

Unnamed: 0,views,likes,comments,duration_seconds,published_year,published_month,published_hour,like_ratio,comment_ratio,days_since_publish,views_per_day,likes_per_day
count,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0
mean,139042.5,3328.486339,180.199454,1700.691257,2022.297814,6.327869,8.34153,0.033302,0.003144,978.745902,167.875263,4.558059
std,259787.7,5760.090393,385.038146,4786.842221,1.570741,3.537144,2.555847,0.016346,0.005271,567.570341,328.838787,10.125244
min,259.0,34.0,3.0,9.0,2020.0,1.0,0.0,0.012,0.0002,1.0,0.971779,0.057013
25%,12921.5,435.0,30.0,386.25,2021.0,3.0,7.0,0.021825,0.0008,526.25,19.984098,0.71345
50%,35681.0,1332.5,77.0,721.0,2022.0,6.0,8.0,0.02955,0.0016,907.5,60.352281,1.978091
75%,139755.5,3597.75,161.75,1581.5,2024.0,9.75,9.0,0.0393,0.003175,1526.5,175.858997,4.407832
max,2174630.0,48329.0,4370.0,84768.0,2025.0,12.0,23.0,0.1313,0.0638,1979.0,2937.263158,125.521053


#### Interpretation:
- Huge difference between **mean** and **median** -> strong right skew in `views` and `likes`
- Outliers are pulling the average up
- Maximum duration (84,768 s, about 23.5 hrs!) suggests at least one extremely long video
- The middle 50% of videos run between about 6.5 and 26 minutes (386–1,582 s)
- Most videos are published in the morning (median publish hour is 8; the middle 50% fall between 7 and 9)

#### Suggestions:
- Use **log scale** when visualizing `views` and `likes`
- Bin durations into categories: Short, Medium, Long, Very Long
- Use `views_per_day` and `likes_per_day` for grouped trend analyses


#### Statistics for non-numerical data

In [34]:
# Calendar order used to arrange the summary rows
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Number of videos published on each weekday, and each weekday's share of all videos (in %)
weekday_counts = df['published_weekday'].value_counts()
weekday_share = weekday_counts.div(weekday_counts.sum()).mul(100).round(1)

weekday_summary = (
    weekday_counts.to_frame('Video Count')
    .assign(**{'% of Total': weekday_share})
    .reindex(weekday_order)
)

print(weekday_summary)

                   Video Count  % of Total
published_weekday                         
Monday                      19         5.2
Tuesday                    238        65.0
Wednesday                   28         7.7
Thursday                    55        15.0
Friday                      17         4.6
Saturday                     6         1.6
Sunday                       3         0.8


In [35]:
hour_counts = df['published_hour'].value_counts() # Published total videos of each hour
hour_summary = pd.DataFrame(
    {'Video_count':hour_counts,
     '% of Total': (hour_counts/hour_counts.sum()*100).round(1)
    }
)

# Order the rows chronologically. The order is derived from the hours actually present
# in the data (instead of a hand-maintained list) so that no publish hour can be
# silently dropped from the summary when the dataset changes.
hour_order = sorted(hour_counts.index.tolist())
hour_summary = hour_summary.reindex(hour_order)

print(hour_summary)

                Video_count  % of Total
published_hour                         
0                         2         0.5
6                        50        13.7
7                        88        24.0
8                       128        35.0
9                        23         6.3
10                       24         6.6
11                       19         5.2
12                       19         5.2
13                        4         1.1
14                        2         0.5
19                        1         0.3
21                        3         0.8
22                        2         0.5
23                        1         0.3


In [37]:
# Display order for the time-of-day categories:
# Early Morning(0-5), Morning(6-11), Afternoon(12-17), Evening(18-23)
daytime_order = ['Early Morning', 'Morning', 'Afternoon', 'Evening']

# Number of videos published in each time-of-day category, and each category's share of all videos (in %)
daytime_counts = df['published_time_of_day'].value_counts()
daytime_share = daytime_counts.div(daytime_counts.sum()).mul(100).round(1)

daytime_summary = (
    daytime_counts.to_frame('Video_counts')
    .assign(**{'% of Total': daytime_share})
    .reindex(daytime_order)
)

print(daytime_summary)


                       Video_counts  % of Total
published_time_of_day                          
Early Morning                     2         0.5
Morning                         332        90.7
Afternoon                        25         6.8
Evening                           7         1.9


#### Analyzing Content Topics

In [38]:
# Step 1: Parse the stringified list stored in `tags` (e.g. "['a', 'b']" or "[]") into a real Python list.
# `ast` is imported in the notebook's import cell at the top.
df['tags_clean'] = df['tags'].apply(lambda x: list(ast.literal_eval(x)))

# Step 2: Explode into one row per (video, tag); a video with an empty tag list
# keeps a single row whose tag is missing
df_tags = df.explode('tags_clean').rename(columns={'tags_clean': 'tag'})


In [39]:
# Inspect the exploded frame: each row now carries a single value in the `tag` column
df_tags.head()

Unnamed: 0,video_id,title,published_at,views,likes,comments,tags,duration_seconds,published_us_est,published_year,...,published_weekday,published_hour,like_ratio,comment_ratio,published_time_of_day,days_since_publish,views_per_day,likes_per_day,duration_bucket,tag
0,FGC0cCAgGu0,Twitter making me tear up over here 🥹,2024-02-23 13:34:27+00:00,7502,305,12,[],16,2024-02-23 08:34:27,2024,...,Friday,8,0.0407,0.0016,Morning,481,15.596674,0.634096,Short,
1,UOBTLzWY1vs,#DataAnalyst #AnalystBuilder #SQL,2024-03-01 13:43:29+00:00,21921,1515,20,[],38,2024-03-01 08:43:29,2024,...,Friday,8,0.0691,0.0009,Morning,474,46.246835,3.196203,Short,
2,k8nBWL6K884,What is Healthcare Analytics?,2025-02-18 13:01:11+00:00,26399,1303,79,"['Data Analyst', 'Data Analyst job', 'Data Ana...",656,2025-02-18 08:01:11,2025,...,Tuesday,8,0.0494,0.003,Morning,120,219.991667,10.858333,Medium,Data Analyst
2,k8nBWL6K884,What is Healthcare Analytics?,2025-02-18 13:01:11+00:00,26399,1303,79,"['Data Analyst', 'Data Analyst job', 'Data Ana...",656,2025-02-18 08:01:11,2025,...,Tuesday,8,0.0494,0.003,Morning,120,219.991667,10.858333,Medium,Data Analyst job
2,k8nBWL6K884,What is Healthcare Analytics?,2025-02-18 13:01:11+00:00,26399,1303,79,"['Data Analyst', 'Data Analyst job', 'Data Ana...",656,2025-02-18 08:01:11,2025,...,Tuesday,8,0.0494,0.003,Morning,120,219.991667,10.858333,Medium,Data Analyst Career


In [40]:
# (rows, columns) after exploding the tag lists
df_tags.shape

(3642, 22)

In [41]:
# Per-tag summary: mean views/likes per day over the videos carrying the tag,
# plus how many videos use the tag.
tag_stats = df_tags.groupby('tag', as_index=False).agg(
    views_per_day=('views_per_day', 'mean'),
    likes_per_day=('likes_per_day', 'mean'),
    video_count=('video_id', 'count'),  # tag usage count
)


In [42]:
# Preview the per-tag averages and usage counts
tag_stats.head()

Unnamed: 0,tag,views_per_day,likes_per_day,video_count
0,100k livestream,3.380071,0.135943,1
1,100k subs alex the analyst,1.935833,0.171593,1
2,100k subscriber,1.935833,0.171593,1
3,100k subscriber play button unboxing,1.935833,0.171593,1
4,100k with no experience,33.590978,1.13482,1


In [43]:
# Drop rarely used tags: retain only those attached to 5 or more videos
tag_stats_filtered = tag_stats.query('video_count >= 5')


In [44]:
# Preview the tags that passed the minimum-usage filter
tag_stats_filtered.head()

Unnamed: 0,tag,views_per_day,likes_per_day,video_count
17,Alex The Analyst,189.853892,5.187059,288
85,Data Analyst,170.955838,4.679277,340
88,Data Analyst Career,171.250382,4.70418,337
125,Data Analyst Salary,157.95582,4.459419,6
152,Data Analyst job,173.741768,4.742422,336


### Step 5: Visualize Distributions and Patterns

In [None]:
# Histogram of views per day across videos (50 bins); the count axis is drawn on a log scale
ax = sns.histplot(df['views_per_day'], bins=50) # type: ignore
ax.set_title("Distribution of Views")
ax.set_xlabel('Views per Day')
ax.set_yscale('log')

#### Interpretation:
- **Majority of Low Views**: A substantial portion of the data points fall into the lowest view categories. For example, over 175 items have very few views per day, and roughly 70 have slightly more (perhaps up to 100-150 views/day).

- **Decreasing Frequency**: As the number of views per day increases, the frequency of items with that many views rapidly decreases.

- **Rare High Performers**: Only a very small number of items achieve high view counts. For instance, there are only a handful of items with views between 500 and 1000 per day, and even fewer above 1000.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
ax_box, ax_count = axes

weekday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Shared gray/black styling for the box plot elements
gray_box = dict(
    boxprops=dict(facecolor='gray', edgecolor='black'),
    medianprops=dict(color='black'),
    whiskerprops=dict(color='black'),
    capprops=dict(color='black'),
)

# Left panel: spread of views per day for each publishing weekday (log-scaled y-axis)
sns.boxplot(data=df, x='published_weekday', y='views_per_day', order=weekday,
            showfliers=True, width=0.5, ax=ax_box, **gray_box)
ax_box.set(ylabel='Views per Day', yscale='log', xlabel='Published weekday',
           title="Views along Published Weekday")
ax_box.tick_params(axis='x', rotation=45)

# Right panel: number of videos published on each weekday, with the count written above each bar
sns.countplot(data=df, x='published_weekday', order=weekday, ax=ax_count)
for bar in ax_count.patches:
    bar_top = bar.get_height()
    ax_count.text(bar.get_x() + bar.get_width() / 2,
                  bar_top + 1,
                  str(int(bar_top)),
                  ha='center', va='bottom', fontsize=9, fontweight='bold')

ax_count.set(ylabel='Total Videos', xlabel='Published Weekday')
ax_count.tick_params(axis='x', rotation=45)
ax_count.set_title("Total Videos along Published Weekday")

**Insights**: 
- Tuesday videos tend to perform more consistently well. 
- Monday, Wednesday, Thursday and Friday all have a high median.
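- Saturday is consistently underperforming.
- Sunday performs better than Saturday, but still has a low median view count.
- Caveat: Saturday (6 videos) and Sunday (3 videos) have very few samples, so the weekend conclusions are tentative.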

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
ax_box, ax_count = axes

day_of_time = ['Early Morning', 'Morning', 'Afternoon', 'Evening']

# Shared gray/black styling for the box plot elements
gray_box = dict(
    boxprops=dict(facecolor='gray', edgecolor='black'),
    medianprops=dict(color='black'),
    whiskerprops=dict(color='black'),
    capprops=dict(color='black'),
)

# Left panel: spread of views per day for each publishing time of day (log-scaled y-axis)
sns.boxplot(data=df, x='published_time_of_day', y='views_per_day', order=day_of_time,
            width=0.4, ax=ax_box, **gray_box)
ax_box.set(ylabel='Views per Day', yscale='log', xlabel='Published Day of Time',
           title="Views per Day along Published Day of Time")
ax_box.tick_params(axis='x', rotation=45)

# Right panel: number of videos published in each time-of-day category,
# with the count written above each bar
sns.countplot(data=df, x='published_time_of_day', order=day_of_time, ax=ax_count)
for bar in ax_count.patches:
    bar_top = bar.get_height()
    ax_count.text(bar.get_x() + bar.get_width() / 2,
                  bar_top + 1,
                  str(int(bar_top)),
                  ha='center', va='bottom', fontsize=9, fontweight='bold')

ax_count.set(ylabel='Total videos', xlabel='Published Day of Time')
ax_count.tick_params(axis='x', rotation=45)
ax_count.set_title("Total Videos along Published Day of Time")


In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
ax_weekday, ax_daytime = axes

weekday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_of_time = ['Early Morning', 'Morning', 'Afternoon', 'Evening']

# Shared gray/black styling for the box plot elements
gray_box = dict(
    boxprops=dict(facecolor='gray', edgecolor='black'),
    medianprops=dict(color='black'),
    whiskerprops=dict(color='black'),
    capprops=dict(color='black'),
)

# Left panel: spread of likes per day for each publishing weekday (log-scaled y-axis)
sns.boxplot(data=df, x='published_weekday', y='likes_per_day', order=weekday,
            showfliers=True, width=0.5, ax=ax_weekday, **gray_box)
ax_weekday.set(ylabel='Likes per Day', yscale='log', xlabel='Published weekday',
               title="Likes per Day along Published Weekday")
ax_weekday.tick_params(axis='x', rotation=45)

# Right panel: spread of likes per day for each publishing time of day (log-scaled y-axis)
sns.boxplot(data=df, x='published_time_of_day', y='likes_per_day', order=day_of_time,
            width=0.4, ax=ax_daytime, **gray_box)
ax_daytime.set(ylabel='Likes per Day', yscale='log', xlabel='Published Day of Time')
ax_daytime.tick_params(axis='x', rotation=45)
ax_daytime.set_title("Likes per Day along Published Day of Time")

In [None]:
# Histogram of raw video durations in seconds (50 bins)
ax = sns.histplot(df['duration_seconds'], bins=50) # type: ignore
ax.set_title("Distribution of Duration")
ax.set_xlabel('Duration in Seconds')

In [None]:
fig, axes=plt.subplots()

# Re-bin durations with descriptive labels (this overwrites the `duration_bucket`
# column created in Step 3; the plots in the following cells use these labels).
# Bins are left-closed: [0, 600), [600, 1800), [1800, 3600), [3600, inf).
# The last edge is open-ended so that no video, however long, falls outside the
# bins and gets silently dropped from the plots.
df['duration_bucket'] = pd.cut(
    df['duration_seconds'],
    bins=[0, 600, 1800, 3600, float('inf')],
    labels=['Short (0-10 min)', 'Medium (10-30 min)', 'Long (30-60 min)', 'Very Long (60+ min)'],
    right=False
)

# Plot duration distribution
sns.countplot(x='duration_bucket', data=df, ax=axes)  # Use countplot for category frequencies

# Write the video count above each bar
for p in axes.patches:
    count = int(p.get_height())
    axes.text(p.get_x() + p.get_width() / 2,
                 p.get_height() + 1,
                 str(count),
                 ha='center', va='bottom', fontsize=9, fontweight='bold')


plt.title("Distribution of Video Duration Categories")
plt.xlabel("Duration Category")
plt.ylabel("Number of Videos")
plt.xticks(rotation=15)
plt.tight_layout()

In [None]:
# Bar height = mean views per day of the videos in each duration bucket
ax = sns.barplot(data=df, x='duration_bucket', y='views_per_day', estimator='mean')
ax.set_title("Average Views per Day by Video Duration")
ax.set_xlabel("Duration Category")
plt.xticks(rotation=15)


In [None]:
# Spread of views per day within each duration bucket (log-scaled y-axis)
gray_box = dict(
    boxprops=dict(facecolor='gray', edgecolor='black'),
    medianprops=dict(color='black'),
    whiskerprops=dict(color='black'),
    capprops=dict(color='black'),
)
ax = sns.boxplot(data=df, x='duration_bucket', y='views_per_day', width=0.4, **gray_box)
ax.set_yscale('log')
ax.set_xlabel('Duration Categories')
ax.set_ylabel('Views per Day')
plt.xticks(rotation=15)
plt.title("Video Duration vs Views per Day")

In [None]:
# Bar height = mean likes per day of the videos in each duration bucket
ax = sns.barplot(data=df, x='duration_bucket', y='likes_per_day', estimator='mean')
ax.set_title("Average Likes per Day by Video Duration")
ax.set_xlabel("Duration Category")
plt.xticks(rotation=15)

In [None]:
# Spread of likes per day within each duration bucket (log-scaled y-axis)
gray_box = dict(
    boxprops=dict(facecolor='gray', edgecolor='black'),
    medianprops=dict(color='black'),
    whiskerprops=dict(color='black'),
    capprops=dict(color='black'),
)
ax = sns.boxplot(data=df, x='duration_bucket', y='likes_per_day', width=0.4, **gray_box)
ax.set_yscale('log')
ax.set_xlabel('Duration Categories')
ax.set_ylabel('Likes per Day')
plt.xticks(rotation=15)
plt.title("Video Duration vs Likes per Day")

In [None]:
fig, axes = plt.subplots(2,1,figsize=(10, 12))

# Top panel: the 10 tags in tag_stats_filtered with the highest mean views per day
top_tags_views = tag_stats_filtered.sort_values('views_per_day', ascending=False).head(10)

sns.barplot(data=top_tags_views, x='views_per_day', y='tag', palette='viridis', ax = axes[0])
axes[0].set_title("Top Tags by Average Views per Day")
axes[0].set_xlabel("Views per Day")
axes[0].set_ylabel("Tag")



# Bottom panel: the 10 tags in tag_stats_filtered with the highest mean likes per day.
# The bars must show `likes_per_day` (not `views_per_day`) so that they match the
# ranking, the title and the x-axis label of this panel.
top_tags_likes = tag_stats_filtered.sort_values('likes_per_day', ascending=False).head(10)
sns.barplot(data=top_tags_likes, x='likes_per_day', y='tag', palette='viridis',  ax = axes[1])
axes[1].set_title("Top Tags by Average Likes per Day")
axes[1].set_xlabel("Likes per Day")
axes[1].set_ylabel("Tag")
