# Import libraries and dataset

In [1]:
from pyspark.sql import * 
from pyspark.sql.functions import *

In [2]:
# Create the Spark session used by every later cell, or reuse one that is
# already running (getOrCreate).
spark = (
    SparkSession.builder
    .appName("Analysis")
    .getOrCreate()
)

In [3]:
# Load the CSV with a header row and inferred column types. Both the quote
# and the escape character are '"', and multiLine=True allows a quoted field
# to span several physical lines.
df = (
    spark.read
    .options(header=True, inferSchema=True, quote='"', escape='"', multiLine=True)
    .csv('./data/processed_data.csv')
)

# Sắp xếp video theo thứ tự view từ thấp đến cao

In [4]:
# Preview the raw rows (show()'s defaults written out: 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+---+-------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+-------+--------+-------------+--------------------+
|_c0|trending_date|               title|       channel_title|    category_id|       publish_time|                tags|   views|  likes|dislikes|comment_count|         description|
+---+-------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+-------+--------+-------------+--------------------+
|  0|   2017-11-14|John Lewis Christ...|          John Lewis|  Howto & Style|2017-11-10 07:38:29|['christmas', 'jo...| 7224515|  55681|   10247|         9479|Click here to con...|
|  1|   2017-11-14|Taylor Swift: ‚Ä¶Re...| Saturday Night Live|  Entertainment|2017-11-12 06:24:44|['SNL', 'Saturday...| 1053632|  25561|    2294|         2757|Musical guest Tay...|
|  2|   2017-11-14|Eminem - Walk On ...|          EminemVEVO|          Music|2017-11-10 17:00:03|[

In [5]:
# Sort by view count, lowest first.
# orderBy() only recognises the `ascending` keyword. The previous `asc=False`
# was silently ignored, so the rows were already coming back in ascending
# order; the direction is now stated explicitly with the supported keyword.
# Use ascending=False instead if the most-viewed rows should come first.
df = df.orderBy('views', ascending=True)

In [6]:
# Inspect the frame after sorting on `views` (default 20 rows, truncated cells).
df.show(n=20, truncate=True)

+----+-------------+--------------------+--------------------+---------------+-------------------+--------------------+-----+-----+--------+-------------+--------------------+
| _c0|trending_date|               title|       channel_title|    category_id|       publish_time|                tags|views|likes|dislikes|comment_count|         description|
+----+-------------+--------------------+--------------------+---------------+-------------------+--------------------+-----+-----+--------+-------------+--------------------+
|3477|   2017-12-01|Mountain Bikers W...|    That's Surrey TV|News & Politics|2017-11-27 18:08:49|['Surrey', "That'...|  851|   12|       0|            3|A group of mounta...|
|3701|   2017-12-02|Mountain Bikers W...|    That's Surrey TV|News & Politics|2017-11-27 18:08:49|['Surrey', "That'...|  890|   13|       0|            3|A group of mounta...|
|3908|   2017-12-03|Mountain Bikers W...|    That's Surrey TV|News & Politics|2017-11-27 18:08:49|['Surrey', "That'...| 

In [7]:
# One window per video title, rows ordered from most to least viewed.
# Ordering on `views` alone leaves row_number() free to pick any row when two
# rows of the same title have equal views, so the row kept later could change
# between runs. `trending_date` (latest first) and `_c0` break such ties.
# NOTE(review): `_c0` looks like a unique row index in the displayed data - confirm.
windowSpec = Window.partitionBy("title").orderBy(
    col("views").desc(),
    col("trending_date").desc(),
    col("_c0").asc(),
)
windowSpec

<pyspark.sql.window.WindowSpec at 0x25b84c55e80>

In [8]:
# Append `view_rank`: the position of each row inside its title's window
# (1 = first row according to windowSpec's ordering).
view_df = df.select("*", row_number().over(windowSpec).alias("view_rank"))
view_df.show(n=20, truncate=True)

+-----+-------------+--------------------+-----------------+--------------+-------------------+--------------------+------+-----+--------+-------------+--------------------+---------+
|  _c0|trending_date|               title|    channel_title|   category_id|       publish_time|                tags| views|likes|dislikes|comment_count|         description|view_rank|
+-----+-------------+--------------------+-----------------+--------------+-------------------+--------------------+------+-----+--------+-------------+--------------------+---------+
|  567|   2017-11-16|#21 How to go FAS...|       Ben Cathro|        Sports|2017-11-05 20:10:16|['Ben Cathro', 's...| 16074|  689|       8|          142|How flipping rad ...|        1|
|  358|   2017-11-15|#21 How to go FAS...|       Ben Cathro|        Sports|2017-11-05 20:10:16|['Ben Cathro', 's...| 15818|  686|       8|          141|How flipping rad ...|        2|
|  146|   2017-11-14|#21 How to go FAS...|       Ben Cathro|        Sports|2017-

In [9]:
# Keep only the rank-1 row of every title, then discard the helper column.
# Note: this overwrites view_df, so the cell cannot be re-run on its own
# result (view_rank no longer exists afterwards).
view_df = view_df.where(col('view_rank') == 1).drop('view_rank')

In [10]:
# Check the de-duplicated frame: one row per title, without view_rank.
view_df.show(n=20, truncate=True)

+-----+-------------+------------------------+--------------------+----------------+-------------------+--------------------+--------+------+--------+-------------+--------------------+
|  _c0|trending_date|                   title|       channel_title|     category_id|       publish_time|                tags|   views| likes|dislikes|comment_count|         description|
+-----+-------------+------------------------+--------------------+----------------+-------------------+--------------------+--------+------+--------+-------------+--------------------+
|  567|   2017-11-16|    #21 How to go FAS...|          Ben Cathro|          Sports|2017-11-05 20:10:16|['Ben Cathro', 's...|   16074|   689|       8|          142|How flipping rad ...|
| 1791|   2017-11-22|    #VeteransDay: Tha...|   YouTube Spotlight|   Entertainment|2017-11-10 15:07:13|['vets', 'veteran...|  916104| 27405|    2153|         5292|This #VeteransDay...|
|38334|   2018-06-12|    13 Reasons Why: S...|             Netflix|   

In [11]:
# Total views over the de-duplicated rows. `sum` here is the Spark aggregate
# brought in by the wildcard import of pyspark.sql.functions, not the builtin.
view_df.agg(sum('views')).show()

+-----------+
| sum(views)|
+-----------+
|15915944060|
+-----------+



# Video được xem nhiều nhất ở mỗi hạng mục

In [16]:
# Largest view count seen in each category, exposed as `max_view`.
highest_views_per_category = (
    view_df
    .groupBy("category_id")
    .agg(max("views").alias("max_view"))
)


In [17]:
# Join each video back to its category's maximum and keep only the rows whose
# view count equals that maximum. Only view_df's columns are returned.
same_category = view_df["category_id"] == highest_views_per_category["category_id"]
is_category_max = view_df["views"] == highest_views_per_category["max_view"]

result_df = (
    view_df
    .join(highest_views_per_category, same_category & is_category_max)
    .select(view_df["*"])
)

In [18]:
# Display the most-viewed video(s) of each category.
result_df.show(n=20, truncate=True)

+-----+-------------+--------------------+--------------------+--------------------+-------------------+--------------------+---------+-------+--------+-------------+--------------------+
|  _c0|trending_date|               title|       channel_title|         category_id|       publish_time|                tags|    views|  likes|dislikes|comment_count|         description|
+-----+-------------+--------------------+--------------------+--------------------+-------------------+--------------------+---------+-------+--------+-------------+--------------------+
|36796|   2018-06-02|Dogs Tested to Se...|      Inside Edition|     News & Politics|2018-04-26 21:23:37|['burglar', 'dog ...| 18994966| 259800|   11070|        43269|If a masked intru...|
|25495|   2018-03-23|Incredibles 2 - O...|        Disney‚Ä¢Pixar|    Film & Animation|2018-02-15 03:34:44|['Pixar', 'Disney...| 42560014| 461135|   22464|        50255|The Incredibles a...|
|32250|   2018-05-05|Tyler, The Creato...|  Tyler, The Cre

# Most interacted videos per category

## Most interacted videos

In [27]:
# total_reactions = likes + dislikes + comment_count for every row.
reaction_total = col('likes') + col('dislikes') + col('comment_count')
view_df = view_df.withColumn('total_reactions', reaction_total)

# Highest total per title, largest first.
highest_interaction = (
    view_df
    .groupBy('title')
    .agg(max('total_reactions').alias('total_reactions'))
    .orderBy('total_reactions', ascending=False)
)

In [28]:
# Top of the per-title reaction ranking (20 rows, truncated titles).
highest_interaction.show(n=20, truncate=True)

+-------------------------+---------------+
|                    title|total_reactions|
+-------------------------+---------------+
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'FAKE...|        7049374|
|     Childish Gambino ...|        6377774|
|     YouTube Rewind: T...|        5911375|
|       Drake - God‚Äôs Plan|        5156827|
|                So Sorry.|        5143583|
|     Bad Bunny - Amorf...|        4264625|
|     Ariana Grande - N...|        3804136|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'MIC ...|        3574348|
|     j-hope 'Daydream ...|        3178754|
|     Marvel Studios' A...|        3126405|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) LOVE ...|        3102006|
|     Nicky Jam x J. Ba...|        3067426|
|     Luis Fonsi, Demi ...|        2968324|
|     Maroon 5 - Girls ...|        2876711|
|     Te Bote Remix - C...|        2862074|
|     Ed Sheeran - Perf...|        2747840|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'Euph...|        2505493|
|     Bruno Mars - Fine...|        2492881|
|     Taylor Swift - De...|        2470323|
|     Tayl

### Plot total reactions graph

In [38]:
import pyspark.pandas as ps



In [None]:
# pandas-on-Spark view of the per-title reaction totals (full frame, unchanged).
interaction_df = highest_interaction.pandas_api()

# Plot only the 20 titles with the most reactions and label each bar with its
# title. The previous call drew a bar for every title against the default
# index, which is unreadable and does not say which video a bar belongs to.
interaction_df.nlargest(20, columns='total_reactions').plot.bar(x='title', y='total_reactions')

## Most liked videos

In [30]:
# Highest like count per title, most-liked first.
highest_like = (
    view_df
    .groupBy('title')
    .agg(max('likes').alias('likes'))
    .orderBy('likes', ascending=False)
)

In [31]:
# Top of the like ranking (20 rows, truncated titles).
highest_like.show(n=20, truncate=True)

+-------------------------+-------+
|                    title|  likes|
+-------------------------+-------+
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'FAKE...|5613827|
|     Childish Gambino ...|5444541|
|       Drake - God‚Äôs Plan|4737873|
|     Bad Bunny - Amorf...|3823879|
|     Ariana Grande - N...|3394437|
|     YouTube Rewind: T...|3312868|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'MIC ...|2960250|
|     Nicky Jam x J. Ba...|2818771|
|     Marvel Studios' A...|2701353|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) LOVE ...|2700801|
|     Luis Fonsi, Demi ...|2686169|
|     Maroon 5 - Girls ...|2677256|
|     j-hope 'Daydream ...|2672433|
|     Ed Sheeran - Perf...|2584773|
|     Te Bote Remix - C...|2581961|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'Euph...|2250087|
|     Bruno Mars - Fine...|2248693|
|     Taylor Swift - De...|2161260|
|     Taylor Swift - En...|2153475|
|     j-hope 'Airplane' MV|1841936|
+-------------------------+-------+
only showing top 20 rows



### Plot

In [46]:
# pandas-on-Spark view of the per-title like counts (full frame, unchanged).
pandas_total_like_df = highest_like.pandas_api()

# Plot only the 20 most-liked titles and use the title as the bar label,
# rather than drawing one anonymous bar per title in the whole data set.
pandas_total_like_df.nlargest(20, columns='likes').plot.bar(x='title', y='likes')

## Most disliked videos

In [34]:
# Highest dislike count per title, most-disliked first.
dislikes_by_title = view_df.groupBy('title').agg(max('dislikes').alias('dislikes'))
highest_dislike = dislikes_by_title.sort(col("dislikes").desc())

In [35]:
# Top of the dislike ranking (20 rows, truncated titles).
highest_dislike.show(n=20, truncate=True)

+-------------------------+--------+
|                    title|dislikes|
+-------------------------+--------+
|                So Sorry.| 1944971|
|     YouTube Rewind: T...| 1753274|
|     Lucas Lucco e Pab...|  421473|
|     Childish Gambino ...|  379862|
|      LOGAN PAUL IS BACK!|  280675|
|     PSA from Chairman...|  258504|
|     Bad Bunny - Amorf...|  215530|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'FAKE...|  206892|
|     Fergie Performs T...|  193053|
|     Katy Perry - Hey ...|  171027|
|     Logan Paul - SANT...|  167640|
|     Te Bote Remix - C...|  166549|
|     Shakira - Trap (O...|  155954|
|     Ozuna x Romeo San...|  151147|
|     Ariana Grande - N...|  150086|
|     Nicky Jam x J. Ba...|  149275|
|     Jason Derulo, Mal...|  149024|
|     Becky G, Natti Na...|  142569|
|     Luis Fonsi, Demi ...|  137938|
|     Taylor Swift - De...|  133428|
+-------------------------+--------+
only showing top 20 rows



### Plot

In [48]:
# pandas-on-Spark view of the per-title dislike counts (full frame, unchanged).
pandas_total_dl_df = highest_dislike.pandas_api()

# Plot only the 20 most-disliked titles and use the title as the bar label,
# rather than drawing one anonymous bar per title in the whole data set.
pandas_total_dl_df.nlargest(20, columns='dislikes').plot.bar(x='title', y='dislikes')

## Most commented videos

In [36]:
# Highest comment count per title, most-commented first.
highest_cmt = (
    view_df
    .groupBy('title')
    .agg(max('comment_count').alias('comment_count'))
    .orderBy('comment_count', ascending=False)
)

In [37]:
# Top of the comment ranking (20 rows, truncated titles).
highest_cmt.show(n=20, truncate=True)

+-------------------------+-------------+
|                    title|comment_count|
+-------------------------+-------------+
|                So Sorry.|      1626501|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'FAKE...|      1228655|
|     YouTube Rewind: T...|       845233|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'MIC ...|       556154|
|     Childish Gambino ...|       553371|
|     j-hope 'Daydream ...|       477233|
|      LOGAN PAUL IS BACK!|       432534|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) LOVE ...|       371864|
|     Marvel Studios' A...|       368739|
|       Drake - God‚Äôs Plan|       301756|
|     Lucas Lucco e Pab...|       275795|
|     TWICE What is Lov...|       274087|
|     Ariana Grande - N...|       259613|
|BTS (Î∞©ÌÉÑÏÜåÎÖÑÎã®) 'Euph...|       234455|
|            GOT7 Look M/V|       234054|
|              Face Reveal|       225618|
|     Bad Bunny - Amorf...|       225216|
|     Logan Paul - SANT...|       212787|
|     Regarding the rum...|       205692|
|     Marvel Studios' A...|       199294|
+-----------

### Plot

In [49]:
# pandas-on-Spark view of the per-title comment counts (full frame, unchanged).
pandas_total_cmt_df = highest_cmt.pandas_api()

# Plot only the 20 most-commented titles and use the title as the bar label,
# rather than drawing one anonymous bar per title in the whole data set.
pandas_total_cmt_df.nlargest(20, columns='comment_count').plot.bar(x='title', y='comment_count')