In [None]:
# Dependencies
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import scipy.stats as sts

In [None]:
# Load the trending-videos CSV into the working DataFrame "df"
csv_path = "Resources/videos.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# DATA CLEAN UP

# "publish_time" is split on the first "T" into a date part (0) and a time part (1).
# The pieces are held in local variables so that no scratch columns are left behind in df.
publish_parts = df["publish_time"].str.split(pat="T", n=1, expand=True)

# publish time: everything in the time part before the first "."
df["publish time"] = publish_parts[1].str.split(pat=".", n=1, expand=True)[0]

# change publish date into standard "mm/dd/yyyy" format
df["publish_date"] = pd.to_datetime(publish_parts[0]).dt.strftime("%m/%d/%Y")

# change trending date (parsed as "yy.dd.mm") into standard "mm/dd/yyyy" format
trending_dt = pd.to_datetime(df["trending_date"].astype(str), format="%y.%d.%m")
df["trending_date"] = trending_dt.dt.strftime("%m/%d/%Y")

In [None]:
# along with the CSV, we were given a JSON to translate the category number into the category title
category_json_path = "Resources/US_category_id.json"

# JSON is read as UTF-8 explicitly so the result does not depend on the platform's default encoding
with open(category_json_path, encoding="utf-8") as json_data:
    data = json.load(json_data)

items = data['items']

# build the lookup table used for the merge below: one id and one title per JSON item, in file order
# (iterating the items directly replaces the manual position counter)
category_dict = {
    'category_id': [item['id'] for item in items],
    'Category': [item['snippet']['title'] for item in items],
}

In [None]:
# Turn the category lookup into a DataFrame with integer ids, then attach the
# category title to every video row of the main DataFrame (inner join on "category_id")
category_df = pd.DataFrame(category_dict).astype({"category_id": int})
df = pd.merge(df, category_df, how='inner', on='category_id')

In [None]:
# final clean up step!
# keep only the columns the analysis works with, arranged in the order that reads best
final_columns = [
    "video_id",
    "title",
    "channel_title",
    "Category",
    "publish time",
    "publish_date",
    "trending_date",
    "views",
    "likes",
    "dislikes",
    "tags",
    "comments_disabled",
]
df = df.loc[:, final_columns]
df.head()

In [None]:
# data frame with rows for last trending date
# we want this separate data frame where we want the total view count at the last date a video goes trending

# "trending_date" holds "mm/dd/YYYY" strings, and comparing those as text is lexicographic
# ("12/31/2017" would rank above "06/14/2018"), so the latest date is found on real datetimes
trending_dt = pd.to_datetime(df["trending_date"], format="%m/%d/%Y")
last_dates = (
    trending_dt.groupby(df["video_id"]).max()
    .dt.strftime("%m/%d/%Y")  # back to the string form stored in df so the merge keys match
    .reset_index()
)
last_df = last_dates.merge(df, how="left", on=["video_id", "trending_date"])

In [None]:
# data frame with rows for first trending date
# we want this separate data frame for calculating the time between publish date and first trending date

# "trending_date" holds "mm/dd/YYYY" strings, and comparing those as text is lexicographic
# ("01/02/2018" would rank below "12/30/2017"), so the earliest date is found on real datetimes
trending_dt = pd.to_datetime(df["trending_date"], format="%m/%d/%Y")
first_dates = (
    trending_dt.groupby(df["video_id"]).min()
    .dt.strftime("%m/%d/%Y")  # back to the string form stored in df so the merge keys match
    .reset_index()
)
first_df = first_dates.merge(df, how="left", on=["video_id", "trending_date"])

In [None]:
# Mike's Code

In [None]:
# popular tags on YouTube videos

# Each video's "tags" value is a single string with the individual tags separated by "|".
# Build one (video_id, tag) pair per tag so that tag usage can be counted and charted.
tags_ = []
ids = []

# zip over the two columns rather than DataFrame.iterrows(): same pairs, no per-row Series objects
for video_id, raw_tags in zip(first_df["video_id"], first_df["tags"]):
    for tag in raw_tags.split("|"):
        # strip('"') removes surrounding quotation marks only where they exist,
        # so a tag that is not quoted keeps its first and last characters
        tags_.append(tag.strip('"'))
        ids.append(video_id)

# long-format table: one row per (video_id, tag)
video_tags = pd.DataFrame({"video_id": ids, "tags": tags_})

# count of unique tags for YouTube videos in this timeframe
# (printed, because a bare expression in the middle of a cell is not displayed)
print(f"Unique tags: {video_tags['tags'].nunique()}")

# plotting the bar chart
# count how many times each tag is used
tags_count = video_tags.groupby("tags")["tags"].count()

# sort the tags by count and keep the 20 most used, as a DataFrame for plotting
top_20_tags = pd.DataFrame(tags_count.sort_values().tail(20))

# DataFrame.plot returns the Axes it drew on; customize it for easy viewing
ax = top_20_tags.plot(kind="bar", legend=False, figsize=(15,8), fontsize=14)
ax.set_ylabel("Count", fontsize=20)
ax.set_xlabel("Video Tags", fontsize=20)
ax.set_title("Top 20 Popular Tags", fontsize=20)

In [None]:
## Mike's Code

In [None]:
# Riley's Code

In [None]:
# Data Visualization #2: Videos trending by publish time

# Work on a copy so the helper column added here does not leak into last_df
publish_df = last_df.copy()

# The hour is the part of "publish time" before the first ":", converted to integer
publish_df["publish_hour"] = (
    publish_df["publish time"].str.split(":", n=1, expand=True)[0].astype(int)
)

# Number of rows (one per video_id entry in last_df) for each publish hour
publish_group_df = publish_df.groupby("publish_hour")["video_id"].count()

# Plot number of trending videos by publish time
ax = publish_group_df.plot(kind='bar', rot=0)
ax.set_xlabel('Time of Publish (h)')
ax.set_ylabel('# of Trending Videos')
ax.set_title('Youtube Trending Videos vs Publish Time')
plt.show()

In [None]:
# Data visualization #7: How long does it take videos to go "trending"?

# Work on a copy so the type conversions and the extra column below do not alter first_df
trend_df = first_df.copy()

# Convert date strings ("mm/dd/YYYY") to datetime type
trend_df['trending_date'] = pd.to_datetime(trend_df['trending_date'], format='%m/%d/%Y')
trend_df['publish_date'] = pd.to_datetime(trend_df['publish_date'], format='%m/%d/%Y')

# Time between publish date and first trending date, as a timedelta per video
days_to_trend = trend_df['trending_date'] - trend_df['publish_date']

# Summary statistics as whole days, read from Timedelta.days instead of parsing str(Timedelta);
# mean and standard deviation are rounded to the nearest day first
max_days = days_to_trend.max().days
min_days = days_to_trend.min().days
mean_days = days_to_trend.mean().round('D').days
med_days = days_to_trend.median().days
std_days = days_to_trend.std().round('D').days

# Title of the video with the longest wait; idxmax() returns an index label,
# so it is looked up with .loc and the column name rather than by position
max_trend_video = trend_df.loc[days_to_trend.idxmax(), 'title']

# Put summary stats into DataFrame
trending_summary_df = pd.DataFrame({'':['Maximum', 'Minimum', 'Mean', 'Median', 'Standard Deviation'],
                                    'Days': [max_days, min_days, mean_days, med_days, std_days]
                                   })

# Store 'Days to Trend' as a whole number of days
trend_df['Days to Trend'] = days_to_trend.dt.days

# Create boxplot showing days to trend
plt.boxplot(trend_df['Days to Trend'], showfliers=False)
plt.ylabel('Number of Days')
plt.title('Number of Days for Youtube Videos to go "Trending"')
plt.show()

# Print string about longest video
print(f'The longest a video has been on Youtube before it hit the "Trending" page, was \033[1m {max_trend_video} \033[0m at {max_days} days!')

# Whole years are derived from the data instead of being hard-coded
max_years = max_days // 365
if max_years >= 1:
    print(f'That is over \033[1m{max_years} years!\033[0m')

# Show summary table
trending_summary_df

In [None]:
## Riley's Code

In [None]:
## Jackie's Code

In [None]:
## VISUALIZATION 3: How many videos have gone trending per Category? (Bar Chart)

# Count number of trending videos in each category (largest first)
category_count = last_df['Category'].value_counts()

# Horizontal bars, drawn from the ascending-sorted counts
category_count.sort_values().plot.barh(figsize=(10,7))
plt.title("Number of trending videos by category", size=15)
plt.show()

category_count

In [None]:
## VISUALIZATION 1: How do views and dis/likes correlate? (Linear Regression)
# Fit likes against views, draw the scatter with the fitted line, and report the r value
from scipy.stats import linregress
import scipy.stats as st
import seaborn as sns

# Number of likes (y) against number of views (x)
x_values, y_values = last_df["views"], last_df["likes"]

slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Text form of the fitted line, coefficients rounded to 2 decimals
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, xy=(1.5, 2), color='red', fontsize=14)
plt.xlabel("Views")
plt.ylabel("Likes")
plt.show()
print(f"Correlation between these two variables is {round(rvalue,2)}")
--------------------------------------------------------------------------------------------------------------------------------
## VISUALIZATION 1: How do views and dis/likes correlate? (Linear Regression)
# Number of dislikes (y) against number of views (x)
x_values, y_values = last_df["views"], last_df["dislikes"]

slope, intercept, rvalue, pvalue, stderr = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Text form of the fitted line, coefficients rounded to 2 decimals
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, xy=(0, 0), color='red', fontsize=14)
plt.xlabel("Views")
plt.ylabel("Dislikes")
plt.show()
print(f"Correlation between these two variables is {round(rvalue,2)}")
--------------------------------------------------------------------------------------------------------------------------------
## VISUALIZATION 1: How do views and dislikes correlate? (correlation matrix)
heatmap_columns = ['views', 'likes', 'dislikes']
h_labels = ['Views', 'Likes', 'Dislikes']

# Pairwise correlation of the three count columns
corr_matrix = last_df[heatmap_columns].corr()

# Annotated heatmap drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(14,8))
sns.heatmap(corr_matrix, annot=True, xticklabels=h_labels, yticklabels=h_labels, ax=ax)

fig.tight_layout()

In [None]:
## Jackie's Code

In [None]:
# Brian's Code

In [None]:
# Generate a bar plot showing the top 20 youtube channels with most trending videos.

# Start from the two columns needed for the per-channel count: the video id and its channel
channel_columns = ["video_id", "channel_title"]
vids_per_title_df = last_df.loc[:, channel_columns]

In [None]:
# Number of video_id entries per channel.
# Computed from last_df rather than from vids_per_title_df itself, so re-running this cell
# always produces the same table instead of re-counting an already aggregated frame.
vids_per_title_df = last_df.groupby("channel_title")[["video_id"]].count()
vids_per_title_df.head()

In [None]:
# Channels ordered by number of trending videos, largest first
vids_per_title_df = vids_per_title_df.sort_values(by = 'video_id', ascending = False)
top_channels = vids_per_title_df.head(20)

# One colour per plotted bar; indices wrap around the palette size so bars beyond
# the palette's last colour do not all collapse onto that final colour
palette = plt.cm.Paired
bar_colors = palette(np.arange(len(top_channels)) % palette.N)

# plot a bar chart
fig, ax = plt.subplots(figsize=(12,8))
ax.bar(top_channels.index, top_channels["video_id"], color=bar_colors)

# title and labels
ax.set_title("Top Twenty YouTube Channels With Most Trending Videos")
ax.set_xlabel("YouTube Channels")
ax.set_ylabel("Number of Trending Videos")
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment="right")

# tight_layout has to run before show(); afterwards it would act on a new, empty figure
fig.tight_layout()
plt.show()

In [None]:
# Two boxplots of video views: one for videos with comments disabled, one with comments enabled.

view_columns = ["views", "comments_disabled"]

# rows where comments are disabled
disabled_mask = last_df["comments_disabled"].eq(True)
comments_off_df = last_df.loc[disabled_mask, view_columns]

# rows where comments are enabled
enabled_mask = last_df["comments_disabled"].eq(False)
comments_on_df = last_df.loc[enabled_mask, view_columns]

# outliers are hidden (showfliers=False)
fig1, ax1 = plt.subplots()
ax1.boxplot([comments_off_df['views'], comments_on_df['views']], showfliers = False)

# title and labels
ax1.set_title('AVG VIEWS WITH COMMENTS DISABLED VS. ENABLED')
ax1.set_ylabel('AVG VIEWS')
ax1.set_xticks([1, 2])
ax1.set_xticklabels(["Comments Disabled","Comments Enabled"])

plt.show()

In [None]:
## Brian's code