## Import thư viện

In [1]:
import pyspark
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, count, when, to_timestamp, split, regexp_replace, row_number, sum, count_distinct
from functools import reduce
import pyspark.pandas as ps



## Khởi tạo Spark Session

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name "Analysis".
spark = (
    SparkSession.builder
    .appName("Analysis")
    .getOrCreate()
)

## Đọc file data

In [8]:
# Load the preprocessed CSV with a header row and inferred column types.
# The double quote is used as both the quote and the escape character, and
# multi-line records are enabled, so that the values of the 'tags' column
# do not shift into neighbouring columns.
category_df = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        quote='"',
        escape='"',
        multiLine=True,
    )
    .csv('./data/processed_data.csv')
)

In [9]:
# Preview the first 20 rows, truncating long cell values (the defaults of show()).
category_df.show(n=20, truncate=True)

+---+-------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+-------+--------+-------------+--------------------+
|_c0|trending_date|               title|       channel_title|    category_id|       publish_time|                tags|   views|  likes|dislikes|comment_count|         description|
+---+-------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+-------+--------+-------------+--------------------+
|  0|   2017-11-14|John Lewis Christ...|          John Lewis|  Howto & Style|2017-11-10 07:38:29|['christmas', 'jo...| 7224515|  55681|   10247|         9479|Click here to con...|
|  1|   2017-11-14|Taylor Swift: …Re...| Saturday Night Live|  Entertainment|2017-11-12 06:24:44|['SNL', 'Saturday...| 1053632|  25561|    2294|         2757|Musical guest Tay...|
|  2|   2017-11-14|Eminem - Walk On ...|          EminemVEVO|          Music|2017-11-10 17:00:03|['E

## Phân tích

### Số category

In [10]:
# List every distinct value of the category_id column.
(
    category_df
    .select('category_id')
    .dropDuplicates()
    .show()
)

+--------------------+
|         category_id|
+--------------------+
|           Education|
|              Gaming|
|       Entertainment|
|     Travel & Events|
|Science & Technology|
|              Sports|
|       Howto & Style|
|    Film & Animation|
|      People & Blogs|
|     News & Politics|
|      Pets & Animals|
|    Autos & Vehicles|
|               Music|
|              Comedy|
+--------------------+



In [11]:
# Number of distinct categories present in the data.
category_df.select('category_id').dropDuplicates().count()

14

### Category nằm top trending nhiều nhất (1vid/ngày = 1 lần)

In [12]:
# Count the rows per category and list the categories from most to least frequent.
(
    category_df
    .groupBy('category_id')
    .count()
    .sort(col('count').desc())
    .show()
)

+--------------------+-----+
|         category_id|count|
+--------------------+-----+
|               Music|13754|
|       Entertainment| 9124|
|      People & Blogs| 2926|
|    Film & Animation| 2577|
|       Howto & Style| 1928|
|              Sports| 1907|
|              Comedy| 1828|
|              Gaming| 1788|
|     News & Politics| 1225|
|      Pets & Animals|  534|
|Science & Technology|  518|
|           Education|  457|
|    Autos & Vehicles|  144|
|     Travel & Events|   96|
+--------------------+-----+



#### Chuyển thành pandas-on-Spark DataFrame và plot

In [13]:
# Convert to a pandas-on-Spark DataFrame so the pandas-style plotting API is available.
pandas_category_df = category_df.pandas_api()
# value_counts() counts the rows per category; each row is treated as one
# trending appearance. The title lets the figure stand on its own when the
# notebook is skimmed.
pandas_category_df['category_id'].value_counts().plot.bar(
    title='Trending appearances per category'
)



### Category có tổng số lượng views cao nhất (Tổng số view (cao nhất) của các vid thuộc category cụ thể)

#### Tổng số vid

In [14]:
# Show a sample of the distinct video titles (a title is used as the video identity here).
(
    category_df
    .select('title')
    .dropDuplicates()
    .show()
)

+-----------------------+
|                  title|
+-----------------------+
|   Google Pixelbook:...|
|   Fall Out Boy - HO...|
|   The Poop In My Pa...|
|   New Bon Iver song...|
|   Amara La Negra - ...|
|   Kim Kardashian an...|
|   SmackDown LIVE GM...|
|   The Last Jedi Nov...|
|   Sean 'Love' Combs...|
|   WHAT I WORE & DID...|
|   Joyner Lucas & Ch...|
|   The Most Famous A...|
|   Honest Trailers -...|
|   Reviewing Zombie ...|
|   Serious Questions...|
|Wanna One (워너원) -...|
|   I Picked My Girlf...|
|   New album Open He...|
|   WWE fan interrupt...|
|   ASOS UNBOXING HAU...|
+-----------------------+
only showing top 20 rows



In [15]:
# Number of distinct video titles in the data.
category_df.select('title').dropDuplicates().count()

3363

#### Lọc lấy số view cao nhất của từng vid

In [16]:
# Window over all rows that share a title, ordered from the highest to the
# lowest view count, so that rank 1 is the row with the video's peak views.
# row_number() is only deterministic when the ordering is total: ties on
# `views` are therefore broken by the most recent trending_date and then by
# `_c0` (presumably the unique row index written by the preprocessing step --
# verify against the data).
windowSpec = Window.partitionBy("title").orderBy(
    col("views").desc(),
    col("trending_date").desc(),
    col("_c0").asc(),
)

<pyspark.sql.window.WindowSpec at 0x260b5675be0>

In [17]:
# Keep every original column and append `view_rank`: the position of the row
# inside its window partition (1 = first row according to windowSpec).
view_df = category_df.select(
    '*',
    row_number().over(windowSpec).alias("view_rank"),
)
view_df.show()

+-----+-------------+--------------------+-----------------+-------------+-------------------+--------------------+------+-----+--------+-------------+--------------------+---------+
|  _c0|trending_date|               title|    channel_title|  category_id|       publish_time|                tags| views|likes|dislikes|comment_count|         description|view_rank|
+-----+-------------+--------------------+-----------------+-------------+-------------------+--------------------+------+-----+--------+-------------+--------------------+---------+
|  567|   2017-11-16|#21 How to go FAS...|       Ben Cathro|       Sports|2017-11-05 20:10:16|['Ben Cathro', 's...| 16074|  689|       8|          142|How flipping rad ...|        1|
|  358|   2017-11-15|#21 How to go FAS...|       Ben Cathro|       Sports|2017-11-05 20:10:16|['Ben Cathro', 's...| 15818|  686|       8|          141|How flipping rad ...|        2|
|  146|   2017-11-14|#21 How to go FAS...|       Ben Cathro|       Sports|2017-11-05 

In [18]:
# Keep only the top-ranked row of each title, i.e. one row per video.
view_df = view_df.where(view_df.view_rank == 1)
view_df.show()

+-----+-------------+--------------------+-----------------+----------------+-------------------+--------------------+--------+------+--------+-------------+--------------------+---------+
|  _c0|trending_date|               title|    channel_title|     category_id|       publish_time|                tags|   views| likes|dislikes|comment_count|         description|view_rank|
+-----+-------------+--------------------+-----------------+----------------+-------------------+--------------------+--------+------+--------+-------------+--------------------+---------+
|  567|   2017-11-16|#21 How to go FAS...|       Ben Cathro|          Sports|2017-11-05 20:10:16|['Ben Cathro', 's...|   16074|   689|       8|          142|How flipping rad ...|        1|
|16330|   2018-02-05|#ConanHaiti Previ...|        Team Coco|          Comedy|2018-01-23 03:52:25|                ['']|  528163| 11467|    1619|         1731|President Trump r...|        1|
|36876|   2018-06-03|#Laurel? #Yanny? ...|  The White H

#### Tính tổng số view theo category

In [19]:
# Sum the per-video view counts inside each category, list the categories from
# the largest total to the smallest, and store the total as a long integer.
# Note: `sum` here is pyspark.sql.functions.sum, not the Python builtin.
total_view_df = (
    view_df
    .groupBy('category_id')
    .agg(sum('views').alias('total_views'))
    .sort(col('total_views').desc())
    .withColumn('total_views', col('total_views').cast('long'))
)
total_view_df.show()

+--------------------+-----------+
|         category_id|total_views|
+--------------------+-----------+
|               Music|11638088588|
|       Entertainment| 2261526855|
|    Film & Animation|  489044774|
|              Comedy|  416205730|
|      People & Blogs|  362990502|
|              Sports|  230310607|
|              Gaming|  143244559|
|     News & Politics|   99064864|
|       Howto & Style|   91930783|
|Science & Technology|   85714412|
|      Pets & Animals|   37028580|
|           Education|   29669336|
|    Autos & Vehicles|   20406847|
|     Travel & Events|   10717623|
+--------------------+-----------+



#### Chuyển thành pandas-on-Spark DataFrame và plot

In [20]:
# Convert the aggregated result to a pandas-on-Spark DataFrame for plotting.
pandas_total_view_df = total_view_df.pandas_api()
# Bar chart of total views per category; the title lets the figure stand on
# its own when the notebook is skimmed.
pandas_total_view_df.plot.bar(
    x='category_id',
    y='total_views',
    title='Total views per category',
)

### Category có nhiều kênh youtube làm nhất

In [21]:
# Count the distinct channel titles per category and list the categories from
# the most channels to the fewest.
channel_df = (
    category_df
    .groupBy('category_id')
    .agg(count_distinct('channel_title').alias('total_channels'))
    .sort(col('total_channels').desc())
)
channel_df.show()

+--------------------+--------------+
|         category_id|total_channels|
+--------------------+--------------+
|               Music|           505|
|       Entertainment|           339|
|      People & Blogs|           188|
|              Gaming|           124|
|    Film & Animation|           103|
|              Sports|            92|
|       Howto & Style|            82|
|              Comedy|            75|
|     News & Politics|            61|
|Science & Technology|            34|
|           Education|            31|
|      Pets & Animals|            25|
|    Autos & Vehicles|            14|
|     Travel & Events|             9|
+--------------------+--------------+



#### Chuyển thành pandas-on-Spark DataFrame và plot

In [22]:
# Convert the aggregated result to a pandas-on-Spark DataFrame for plotting.
pandas_channel_df = channel_df.pandas_api()
# Bar chart of distinct channels per category; the title lets the figure stand
# on its own when the notebook is skimmed.
pandas_channel_df.plot.bar(
    x='category_id',
    y='total_channels',
    title='Distinct channels per category',
)