In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Load the YouTube video metrics CSV (relative path) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='./Dataset/youtube_data_large.csv')

In [3]:
# Show the loaded frame as the cell result; the recorded output elides the middle rows.
df

Unnamed: 0,Video_ID,Title,Upload_Date,Duration,Views,Likes,Dislikes,Comments,Tags,Category,Subscribers_Gained
0,VID001,Daughter cut deal address.,2025-03-21 20:00:21,2996,30122,1521,71,1439,"tutorial,study,funny",Entertainment,52
1,VID002,Pattern fall garden leg.,2025-02-18 08:16:55,290,22336,1840,39,1003,"javascript,vlog,coding,tutorial,motivation",Gaming,98
2,VID003,Offer same.,2025-02-12 04:51:13,2070,23973,770,58,1188,"tutorial,react,vlog",Vlogs,74
3,VID004,Create first service.,2025-02-14 09:04:08,1586,3128,260,6,54,"tutorial,javascript,review,react",Vlogs,77
4,VID005,Against agency around.,2025-02-16 01:19:07,3511,43460,2369,150,636,"coding,tutorial,motivation,review",Lifestyle,87
...,...,...,...,...,...,...,...,...,...,...,...
145,VID146,International measure.,2025-02-14 06:35:10,1214,35097,2149,134,535,"python,react,tutorial",Lifestyle,71
146,VID147,Trouble production.,2025-02-01 19:14:29,2508,49814,1611,66,2001,"tutorial,python,review",Education,2
147,VID148,Memory stock assume speech body.,2025-03-08 10:02:37,1290,675,24,1,22,"productivity,react,coding,vlog,travel",Entertainment,17
148,VID149,Case pattern you bill.,2025-03-23 08:27:13,1785,24850,1735,88,1046,"javascript,funny",Lifestyle,18


In [4]:
# Parse the upload timestamps into datetime values so date-based operations work later on.
df['Upload_Date'] = df['Upload_Date'].pipe(pd.to_datetime)


2. Distribution Plots

In [5]:
# a. Views distribution: 40-bin histogram of per-video view counts
fig = px.histogram(
    data_frame=df,
    x='Views',
    nbins=40,
    title='Distribution of Video Views',
    color_discrete_sequence=['#00CC96'],
)
fig.show()


In [6]:
# b. Likes distribution: 40-bin histogram of per-video like counts
fig = px.histogram(
    data_frame=df,
    x='Likes',
    nbins=40,
    title='Distribution of Video Likes',
    color_discrete_sequence=['#636EFA'],
)
fig.show()


In [7]:
# c. Video duration distribution: 40-bin histogram of the `Duration` column
fig = px.histogram(
    data_frame=df,
    x='Duration',
    nbins=40,
    title='Distribution of Video Duration (in seconds)',
    color_discrete_sequence=['#EF553B'],
)
fig.show()


In [8]:
# 3. Scatter plot: duration vs views.
# Marker size follows `Likes`, marker colour follows `Category`; the hover box also
# lists the title, category and upload date of each video.
fig = px.scatter(
    data_frame=df,
    x='Duration',
    y='Views',
    color='Category',
    size='Likes',
    hover_data=['Title', 'Category', 'Upload_Date'],
    title='Video Duration vs Views',
)
fig.show()


 4. Engagement by Category

In [9]:
# a. Average views by category, highest average first
avg_views = (
    df.groupby('Category')['Views']
    .mean()
    .reset_index()
    .sort_values(by='Views', ascending=False)
)
fig = px.bar(
    data_frame=avg_views,
    x='Category',
    y='Views',
    color='Views',
    title='Average Views by Video Category',
)
fig.show()


In [10]:
# b. Average likes by category, highest average first
avg_likes = (
    df.groupby('Category')['Likes']
    .mean()
    .reset_index()
    .sort_values(by='Likes', ascending=False)
)
fig = px.bar(
    data_frame=avg_likes,
    x='Category',
    y='Likes',
    color='Likes',
    title='Average Likes by Video Category',
)
fig.show()



In [11]:
# 5. Subscribers gained vs views, one point per video, coloured by category
fig = px.scatter(
    data_frame=df,
    x='Views',
    y='Subscribers_Gained',
    color='Category',
    hover_data=['Title', 'Category'],
    title='Views vs Subscribers Gained',
)
fig.show()


In [12]:
# 6. Tag analysis: count how often each tag occurs across all videos

from collections import Counter

# Each `Tags` cell is a comma-separated string. `explode` flattens the per-video lists in
# one pass. The earlier `.sum()` concatenated the lists pairwise (quadratic in the number
# of rows) and returned the integer 0 for an empty series, which `Counter` cannot iterate.
all_tags = df['Tags'].dropna().str.split(',').explode().tolist()
tag_counts = Counter(all_tags)

# Convert to a DataFrame, most frequent tag first
tag_df = pd.DataFrame(tag_counts.items(), columns=['Tag', 'Count']).sort_values(by='Count', ascending=False)

# Plot the 20 most common tags
fig = px.bar(tag_df.head(20), x='Tag', y='Count', title='Top 20 Most Common Tags')
fig.show()


In [13]:
# 7. Views over time: total views grouped by the calendar month of upload

# Re-parsing keeps this cell runnable on its own; it is a no-op once the column is datetime.
df['Upload_Date'] = df['Upload_Date'].pipe(pd.to_datetime)
# Month label as text (year-month), used as the grouping key and as the x axis.
df['Upload_Month'] = df['Upload_Date'].dt.to_period('M').astype(str)

monthly_views = (
    df.groupby('Upload_Month')['Views']
    .sum()
    .reset_index()
)

fig = px.line(
    data_frame=monthly_views,
    x='Upload_Month',
    y='Views',
    title='Total Views by Upload Month',
)
fig.show()


In [14]:
| Feature              | Description                                 |
| -------------------- | ------------------------------------------- |
| `Duration`           | Video length (seconds)                      |
| `Views`              | Total number of video views                 |
| `Likes`              | Number of likes received                    |
| `Dislikes`           | Number of dislikes                          |
| `Comments`           | Number of comments                          |
| `Tags`               | Comma-separated list of topics/keywords     |
| `Category`           | YouTube-defined video category              |
| `Subscribers_Gained` | Number of subscribers gained from the video |


SyntaxError: invalid syntax (2523100236.py, line 1)

Export the interactive plots (with hover tooltips) to a single HTML file

In [15]:
# all plots in single html file

import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
from PIL import Image
import io
import plotly.io as pio
from plotly.subplots import make_subplots
from plotly.graph_objs import Figure
import os


In [16]:

# --- Plots 1-3: distributions of views, likes and duration ---
# Each spec is (column, chart title, bar colour); every histogram uses 40 bins.
fig1, fig2, fig3 = (
    px.histogram(df, x=column, nbins=40, title=chart_title,
                 color_discrete_sequence=[bar_colour])
    for column, chart_title, bar_colour in (
        ('Views', 'Distribution of Video Views', '#00CC96'),
        ('Likes', 'Distribution of Video Likes', '#636EFA'),
        ('Duration', 'Distribution of Video Duration (in seconds)', '#EF553B'),
    )
)

# --- Plot 4: duration vs views (size = likes, colour = category) ---
fig4 = px.scatter(
    data_frame=df,
    x='Duration',
    y='Views',
    color='Category',
    size='Likes',
    hover_data=['Title', 'Category', 'Upload_Date'],
    title='Video Duration vs Views',
)

# --- Plot 5: average views per category, highest average first ---
avg_views = (
    df.groupby('Category')['Views']
    .mean()
    .reset_index()
    .sort_values(by='Views', ascending=False)
)
fig5 = px.bar(
    data_frame=avg_views,
    x='Category',
    y='Views',
    color='Views',
    title='Average Views by Video Category',
)

# --- Plot 6: average likes per category, highest average first ---
avg_likes = (
    df.groupby('Category')['Likes']
    .mean()
    .reset_index()
    .sort_values(by='Likes', ascending=False)
)
fig6 = px.bar(
    data_frame=avg_likes,
    x='Category',
    y='Likes',
    color='Likes',
    title='Average Likes by Video Category',
)

# --- Plot 7: subscribers gained vs views, coloured by category ---
fig7 = px.scatter(
    data_frame=df,
    x='Views',
    y='Subscribers_Gained',
    color='Category',
    hover_data=['Title', 'Category'],
    title='Views vs Subscribers Gained',
)

# --- Plot 8: tag analysis (frequency of each tag across all videos) ---

from collections import Counter

# Each `Tags` cell is a comma-separated string. `explode` flattens the per-video lists in
# one pass. The earlier `.sum()` concatenated the lists pairwise (quadratic in the number
# of rows) and returned the integer 0 for an empty series, which `Counter` cannot iterate.
all_tags = df['Tags'].dropna().str.split(',').explode().tolist()
tag_counts = Counter(all_tags)

# Convert to a DataFrame, most frequent tag first
tag_df = pd.DataFrame(tag_counts.items(), columns=['Tag', 'Count']).sort_values(by='Count', ascending=False)

# Bar chart of the 20 most common tags
fig8= px.bar(tag_df.head(20), x='Tag', y='Count', title='Top 20 Most Common Tags')


# --- Plot 9: views over time (total views per upload month) ---

# Re-parsing keeps this section runnable on its own; it is a no-op once the column is datetime.
df['Upload_Date'] = df['Upload_Date'].pipe(pd.to_datetime)
# Month label as text (year-month), used as the grouping key and as the x axis.
df['Upload_Month'] = df['Upload_Date'].dt.to_period('M').astype(str)

monthly_views = (
    df.groupby('Upload_Month')['Views']
    .sum()
    .reset_index()
)

fig9 = px.line(
    data_frame=monthly_views,
    x='Upload_Month',
    y='Views',
    title='Total Views by Upload Month',
)

# Save all plots in a single HTML file
# NOTE(review): the commented-out draft below is not executed. It passes a list of figures
# to `pio.write_html`, which is documented to take a single figure, and its file name
# mentions "netflix" although this notebook analyses YouTube data. The export that actually
# runs is the later cell that joins `pio.to_html` fragments into `full_html`.
#html_path = "Dataset/netflix_analysis_all_plots.html"

#pio.write_html([fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8, fig9], file=html_path, auto_open=False)

#html_path


In [18]:
import plotly.io as pio

# Every figure built above, in display order. fig9 (total views by upload month) used to be
# missing from this list, so the exported page silently lacked the time-series chart.
figures = [fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8, fig9]

# Render each figure as an HTML fragment (no <html>/<body> wrapper). Only the first fragment
# carries the plotly.js <script> tag (loaded from the CDN); later fragments reuse it instead
# of repeating the same tag once per figure.
html_parts = [
    pio.to_html(fig, full_html=False, include_plotlyjs='cdn' if position == 0 else False)
    for position, fig in enumerate(figures)
]

# Combine the fragments into one full HTML document. The doctype keeps browsers out of
# quirks mode, and the charset declaration matches the UTF-8 encoding used when writing.
full_html = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Youtube Analysis</title>
</head>
<body>
    {''.join(html_parts)}
</body>
</html>
"""

# Save to file (relative to the current working directory)
output_path = "Youtube_analysis_all_plots.html"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(full_html)

print(f"Saved to {output_path}")


Saved to Youtube_analysis_all_plots.html
