In [3]:
from datetime import datetime
import pandas as pd
import plotly.express as px

from get_video_data import *

In [4]:
# load the preprocessed dataset
# remember to rename the first column to 'idx'
agustSeptemberYoutubeData = pd.read_csv('agustSeptemberYoutubeData.csv', sep=';', index_col='idx')

In [5]:
daily_spent_time = agustSeptemberYoutubeData.groupby('Date')['Duration'].sum().reset_index()
# transform duration to hours
daily_spent_time['Duration'] = daily_spent_time['Duration'].apply(lambda x: x/60)
date_format = '%d/%m/%Y'
# convert the Date column to datetime 
daily_spent_time['Date'] = pd.to_datetime(daily_spent_time['Date'], format=date_format) 
# Sort the dataframe by date ascending order
daily_spent_time = daily_spent_time.sort_values(by='Date')
daily_spent_time.head()

Unnamed: 0,Date,Duration
0,2023-08-01,18.590833
2,2023-08-02,1.656667
4,2023-08-03,6.524167
6,2023-08-04,20.669444
8,2023-08-05,7.16


In [6]:
# Create an interactive line chart with date on the x-axis and total time on the y-axis
fig = px.line(daily_spent_time, x='Date', y='Duration', title='Total Time Spent Watching YouTube (Aug-Sep 2023)')
fig.update_traces(mode='markers+lines', hovertemplate='%{x|%Y-%m-%d}<br>Total Time: %{y:%H:%M:%S}<extra></extra>')

# Customize the chart layout for improved readability
fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Total Time Spent In Hours",
    hovermode="x unified",  # Display hover information at the nearest data point
    xaxis=dict(showline=True, showgrid=False, tickmode='auto', nticks=10),  # Adjust tick frequency
    yaxis=dict(showline=True, showgrid=False),
    margin=dict(l=20, r=20, t=60, b=20),  # Add margin for better label visibility
)

# Show the chart
fig.show()

In [7]:
# Create a bar plot
fig = px.bar(daily_spent_time, x='Date', y='Duration', title='Total Time Spent Watching YouTube (Aug-Sep 2023)')
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Total Time Spent In Hours')

fig.show()

In [8]:
# calculate the average number of hours spent watching youtube videos
average_hours = daily_spent_time['Duration'].mean()
average_hours

5.841910430839001

In [9]:
# Calculate the number of videos watched by day
daily_number_watched_videos = agustSeptemberYoutubeData['Date'].value_counts().to_frame().reset_index()
daily_number_watched_videos.columns = ['Date', 'nb_videos']
daily_number_watched_videos = daily_number_watched_videos.sort_values(by='Date')
daily_number_watched_videos.head()

Unnamed: 0,Date,nb_videos
22,01/08/2023,21
40,01/09/2023,8
29,02/08/2023,15
41,02/09/2023,8
20,03/08/2023,22


In [10]:
fig = px.bar(daily_number_watched_videos, x='Date', y='nb_videos', title='Number of Videos Watched by Day')
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Number of Videos Watched')
fig.show()

In [11]:
# Calculate the average of the number of videos watched by day
average_nb_videos = daily_number_watched_videos['nb_videos'].mean()
average_nb_videos

23.571428571428573

In [12]:
# Add Day column (name of the day) from the date
daily_spent_time['Day'] = daily_spent_time['Date'].apply(lambda date: date.strftime('%A'))

In [13]:
# during what days i was watching the most content on youtube ?
most_watching_days = daily_spent_time.groupby('Day')['Duration'].sum().reset_index()
most_watching_days = most_watching_days.sort_values(by='Duration', ascending=False)
most_watching_days

Unnamed: 0,Day,Duration
0,Friday,62.563056
3,Sunday,47.883889
5,Tuesday,42.248056
1,Monday,40.411111
4,Thursday,38.51
6,Wednesday,29.534444
2,Saturday,25.103056


In [14]:
fig = px.bar(most_watching_days, x='Day', y='Duration', title='Most watching YouTube videos days')
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Number of watching hours')
fig.show()

In [127]:
def take_first_list_element(string_list):
    string_list = str(string_list)
    pattern = r"'(.*?)'"
    match = re.search(pattern, string_list)
    if match:
        topic =  match.group(1)
        if topic == 'Knowledge' or topic == 'Technology':
            return "Knowledge/Technology"
        if 'music' in topic.lower():
            return 'Music'
        else:
            return topic
    else:
        return string_list

count_topic_watched_videos = agustSeptemberYoutubeData['Topics'].to_frame()
count_topic_watched_videos['Topics'] = count_topic_watched_videos['Topics'].apply(lambda x: take_first_list_element(x))
count_topic_watched_videos = count_topic_watched_videos.value_counts().to_frame().reset_index()
count_topic_watched_videos.columns = ['Topics', 'nb_videos']

# count_topic_watched_videos

In [128]:
fig = px.bar(count_topic_watched_videos, x='Topics', y='nb_videos', title='Most watching topics on YouTube')
fig.update_xaxes(title_text='Topics')
fig.update_yaxes(title_text='Number of watched videos')
fig.show()

In [130]:
# to get the topic of the videos that lack a predefined category using the youtube API, 
# i'll use the pretrained model BERTopic. this model will be feed with both the title and the tags of the video
from bertopic import BERTopic

In [138]:
empty_topic_data = agustSeptemberYoutubeData[agustSeptemberYoutubeData['Topics'] == '[]'][['URL', 'Title', 'Tags']]
empty_topic_data['Tokens'] = empty_topic_data['Title'] + empty_topic_data['Tags']
empty_topic_data = empty_topic_data.loc[:, ['URL', 'Tokens']]

In [141]:
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [144]:
docs = empty_topic_data['Tokens'].to_list()
topics, probs = topic_model.fit_transform(docs)

In [145]:
# visualize the topics generated by the model in the form of a nice bar chart.
topic_model.visualize_barchart()

- Topic 0 ==> AD
- Topic 1 ==> AD
- Topic 2 ==> Knowledge / Technology
- Topic 3 ==> Entertainment
- Topic 4 ==> AD
- Topic 5 ==> Entertainment
- Topic 6 ==> AD
- Topic 7 ==> AD

In [146]:
def predict_topic(doc): 
    
    doc = str(doc)
    # preprocessed_new_string = [doc.lower()]  # we can apply more preprocessing steps here

    predicted_topic, _ = topic_model.transform(doc)

    return predicted_topic[0]

In [147]:
empty_topic_data['Topic'] = empty_topic_data['Tokens'].apply(lambda x: predict_topic(x))

In [148]:
empty_topic_data.to_csv('empty_topic_data.csv', sep=';')

In [174]:
# rename the topics
AD_topics = [-1, 0, 1, 4, 6, 7]
Entertainment_topics = [3, 5]
empty_topic_data['Topic'] = empty_topic_data['Topic'].apply(lambda x: 'AD' if x in AD_topics 
                                                            else ('Entertainment' if x in Entertainment_topics
                                                                   else ('Knowledge/Technology' if x==2 else x)))

In [185]:
# Check if 'Topics' column in agustSeptemberYoutubeData df is empty and replace it with values from empty_topic_data['Topic']
agustSeptemberYoutubeData['Topics'] = agustSeptemberYoutubeData.apply(lambda row: empty_topic_data.loc[empty_topic_data['URL'] == row['URL'], 'Topic'].values[0] 
                           if row['Topics'] == '[]' else take_first_list_element(row['Topics']), axis=1)


In [187]:
count_topic_watched_videos = agustSeptemberYoutubeData['Topics'].to_frame()
count_topic_watched_videos['Topics'] = count_topic_watched_videos['Topics'].apply(lambda x: take_first_list_element(x))
count_topic_watched_videos = count_topic_watched_videos.value_counts().to_frame().reset_index()
count_topic_watched_videos.columns = ['Topics', 'nb_videos']

In [189]:
fig = px.bar(count_topic_watched_videos, x='Topics', y='nb_videos', title='Most watching topics on YouTube')
fig.update_xaxes(title_text='Topics')
fig.update_yaxes(title_text='Number of watched videos')
fig.show()

In [192]:
agustSeptemberYoutubeData.to_csv('finalAugustSeptData.csv', sep=';')