## Configure PySpark Setup

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.1-bin-hadoop2.7"


import findspark
findspark.init()


import pyspark 
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("App").getOrCreate()
spark

In [2]:
# Load Libraries


In [3]:
# check number of cores PySpark is using
cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")

You are working with 1 core(s)


## Load Datasets


In [None]:
!cp /content/drive/MyDrive/Datasets.zip .
!unzip Datasets.zip

# Load Libraries

In [24]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import pyspark.sql.functions as f



## Reading Data into PySpark

In [7]:
path='Datasets/'
data = spark.read.csv(path+'youtubevideos.csv',inferSchema=True,header=True)

In [8]:
data.limit(3).toPandas()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"""last week tonight trump presidency""|""last wee...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"""racist superman""|""rudy""|""mancuso""|""king""|""bac...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...


In [9]:
print(data.printSchema())

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)

None


# isNan vs isNull

null values represents "no value" or "nothing", it's not even an empty string or zero. It can be used to represent that nothing useful exists.

NaN stands for "Not a Number", it's usually the result of a mathematical operation that doesn't make sense, e.g. 0.0/0.0.

In [11]:
# count missing data for each column
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).show()

+--------+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+--------------+-----------------+----------------+----------------------+-----------+
|video_id|trending_date|title|channel_title|category_id|publish_time|tags|views|likes|dislikes|comment_count|thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|description|
+--------+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+--------------+-----------------+----------------+----------------------+-----------+
|       0|         5752| 5902|         6025|       6292|        6463|7021| 7076| 7094|    7102|         7102|          7102|             7115|            7177|                  7177|       7747|
+--------+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+--------------+-----------------+----------------+----------------------+-----------+



In [13]:
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).toPandas().T

Unnamed: 0,0
video_id,0
trending_date,5752
title,5902
channel_title,6025
category_id,6292
publish_time,6463
tags,7021
views,7076
likes,7094
dislikes,7102


In [None]:
help(to_date)

In [17]:
# changing data type 

df = data.withColumn("views", data["views"].cast(IntegerType())) \
        .withColumn("likes", data["likes"].cast(IntegerType())) \
        .withColumn("dislikes", data["dislikes"].cast(IntegerType())) \
        .withColumn("trending_date", to_date(data['trending_date'], 'dd.mm.yy'))

In [18]:
print(df.printSchema())

root
 |-- video_id: string (nullable = true)
 |-- trending_date: date (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)

None


In [19]:
# rename columns
renamed = df.withColumnRenamed('channel_title','channel_title_new')
renamed.limit(4).toPandas()

Unnamed: 0,video_id,trending_date,title,channel_title_new,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,2011-01-17,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,2011-01-17,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"""last week tonight trump presidency""|""last wee...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,2011-01-17,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"""racist superman""|""rudy""|""mancuso""|""king""|""bac...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,2011-01-17,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"""rhett and link""|""gmm""|""good mythical morning""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...


In [20]:
renamed.explain()

== Physical Plan ==
*(1) Project [video_id#10, cast(cast(unix_timestamp(trending_date#11, dd.mm.yy, Some(Etc/UTC)) as timestamp) as date) AS trending_date#398, title#12, channel_title#13 AS channel_title_new#415, category_id#14, publish_time#15, tags#16, cast(views#17 as int) AS views#347, cast(likes#18 as int) AS likes#364, cast(dislikes#19 as int) AS dislikes#381, comment_count#20, thumbnail_link#21, comments_disabled#22, ratings_disabled#23, video_error_or_removed#24, description#25]
+- *(1) FileScan csv [video_id#10,trending_date#11,title#12,channel_title#13,category_id#14,publish_time#15,tags#16,views#17,likes#18,dislikes#19,comment_count#20,thumbnail_link#21,comments_disabled#22,ratings_disabled#23,video_error_or_removed#24,description#25] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/content/Datasets/youtubevideos.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<video_id:string,trending_date:string,title:string,channel_title:string,category_id:s

In [21]:
# replace characters in data frame
df = df.withColumn('publish_time_2',regexp_replace(df['publish_time'], 'T', ' '))
df = df.withColumn('publish_time_2',regexp_replace(df['publish_time_2'], 'Z', ''))
df = df.withColumn("publish_time_3", to_timestamp(df['publish_time_2'], 'yyyy-MM-dd HH:mm:ss.SSS'))
print(df.printSchema())
df.select("publish_time", "publish_time_2","publish_time_3").show(5,False)

root
 |-- video_id: string (nullable = true)
 |-- trending_date: date (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)
 |-- publish_time_2: string (nullable = true)
 |-- publish_time_3: timestamp (nullable = true)

None
+------------------------+-----------------------+-------------------+
|publish_time            |publish_time_2         |publish_time_3     |
+------------------------+-----------------------+------------

In [23]:
# traslate function in PySpark
help(translate)

Help on function translate in module pyspark.sql.functions:

translate(srcCol, matching, replace)
    A function translate any character in the `srcCol` by a character in `matching`.
    The characters in `replace` is corresponding to the characters in `matching`.
    The translate will happen when any character in the string matching with the character
    in the `matching`.
    
    >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \
    ...     .alias('r')).collect()
    [Row(r='1a2s3ae')]
    
    .. versionadded:: 1.5



In [25]:
df.select("publish_time",translate(f.col("publish_time"), "TZ", " ").alias("translate_func")).show(5,False)

+------------------------+-----------------------+
|publish_time            |translate_func         |
+------------------------+-----------------------+
|2017-11-13T17:13:01.000Z|2017-11-13 17:13:01.000|
|2017-11-13T07:30:00.000Z|2017-11-13 07:30:00.000|
|2017-11-12T19:05:24.000Z|2017-11-12 19:05:24.000|
|2017-11-13T11:00:04.000Z|2017-11-13 11:00:04.000|
|2017-11-12T18:01:41.000Z|2017-11-12 18:01:41.000|
+------------------------+-----------------------+
only showing top 5 rows



In [27]:
# trim strings
df = df.withColumn('title',trim(df['title'])) # or rtrim/ltrim
df.select("title").show(5,False)

+--------------------------------------------------------------+
|title                                                         |
+--------------------------------------------------------------+
|WE WANT TO TALK ABOUT OUR MARRIAGE                            |
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)|
|Racist Superman | Rudy Mancuso, King Bach & Lele Pons         |
|Nickelback Lyrics: Real or Fake?                              |
|I Dare You: GOING BALD!?                                      |
+--------------------------------------------------------------+
only showing top 5 rows



In [28]:
# lower case
df = df.withColumn('title',lower(df['title'])) 
df.select("title").show(5,False)

+--------------------------------------------------------------+
|title                                                         |
+--------------------------------------------------------------+
|we want to talk about our marriage                            |
|the trump presidency: last week tonight with john oliver (hbo)|
|racist superman | rudy mancuso, king bach & lele pons         |
|nickelback lyrics: real or fake?                              |
|i dare you: going bald!?                                      |
+--------------------------------------------------------------+
only showing top 5 rows



In [29]:
# create new columns based on condition using "when" in sql functions
df.select("likes","dislikes",(when(df['likes'] > df['dislikes'], 'Good').when(df['likes'] < df['dislikes'], 'Bad').otherwise('Undetermined')).alias("Favorability")).show(3)


+------+--------+------------+
| likes|dislikes|Favorability|
+------+--------+------------+
| 57527|    2966|        Good|
| 97185|    6146|        Good|
|146033|    5339|        Good|
+------+--------+------------+
only showing top 3 rows



In [30]:
# create new columns based on condition using "expr " in sql functions
df.select("likes","dislikes",expr("CASE WHEN likes > dislikes THEN  'Good' WHEN likes < dislikes THEN 'Bad' ELSE 'Undetermined' END AS Favorability")).show(3)


+------+--------+------------+
| likes|dislikes|Favorability|
+------+--------+------------+
| 57527|    2966|        Good|
| 97185|    6146|        Good|
|146033|    5339|        Good|
+------+--------+------------+
only showing top 3 rows



In [31]:
df.selectExpr("likes","dislikes","CASE WHEN likes > dislikes THEN  'Good' WHEN likes < dislikes THEN 'Bad' ELSE 'Undetermined' END AS Favorability").show(3)

+------+--------+------------+
| likes|dislikes|Favorability|
+------+--------+------------+
| 57527|    2966|        Good|
| 97185|    6146|        Good|
|146033|    5339|        Good|
+------+--------+------------+
only showing top 3 rows



In [33]:
# concatenate two string columns
df.select(concat_ws(' ', df['title'], df['channel_title'],df['tags']).alias('text')).show(4,False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [35]:
# extract month and year from date/timestamp
# Other options: dayofmonth, dayofweek, dayofyear, weekofyear
df.select("trending_date",year("trending_date"),month("trending_date")).show(5)

+-------------+-------------------+--------------------+
|trending_date|year(trending_date)|month(trending_date)|
+-------------+-------------------+--------------------+
|   2011-01-17|               2011|                   1|
|   2011-01-17|               2011|                   1|
|   2011-01-17|               2011|                   1|
|   2011-01-17|               2011|                   1|
|   2011-01-17|               2011|                   1|
+-------------+-------------------+--------------------+
only showing top 5 rows



In [39]:
df.select("trending_date",weekofyear("trending_date")).show(5)

+-------------+-------------------------+
|trending_date|weekofyear(trending_date)|
+-------------+-------------------------+
|   2011-01-17|                        3|
|   2011-01-17|                        3|
|   2011-01-17|                        3|
|   2011-01-17|                        3|
|   2011-01-17|                        3|
+-------------+-------------------------+
only showing top 5 rows



In [40]:
df.select("trending_date",weekofyear(df["trending_date"])).show(5)

+-------------+-------------------------+
|trending_date|weekofyear(trending_date)|
+-------------+-------------------------+
|   2011-01-17|                        3|
|   2011-01-17|                        3|
|   2011-01-17|                        3|
|   2011-01-17|                        3|
|   2011-01-17|                        3|
+-------------+-------------------------+
only showing top 5 rows



In [41]:
df.select("trending_date","publish_time_3",(datediff(df['trending_date'],df['publish_time_3'])/365).alias('diff')).show(5)

+-------------+-------------------+-------------------+
|trending_date|     publish_time_3|               diff|
+-------------+-------------------+-------------------+
|   2011-01-17|2017-11-13 17:13:01|-6.8273972602739725|
|   2011-01-17|2017-11-13 07:30:00|-6.8273972602739725|
|   2011-01-17|2017-11-12 19:05:24| -6.824657534246575|
|   2011-01-17|2017-11-13 11:00:04|-6.8273972602739725|
|   2011-01-17|2017-11-12 18:01:41| -6.824657534246575|
+-------------+-------------------+-------------------+
only showing top 5 rows



In [42]:
df.select("trending_date","publish_time_3",(datediff('trending_date','publish_time_3')/365).alias('diff')).show(5)

+-------------+-------------------+-------------------+
|trending_date|     publish_time_3|               diff|
+-------------+-------------------+-------------------+
|   2011-01-17|2017-11-13 17:13:01|-6.8273972602739725|
|   2011-01-17|2017-11-13 07:30:00|-6.8273972602739725|
|   2011-01-17|2017-11-12 19:05:24| -6.824657534246575|
|   2011-01-17|2017-11-13 11:00:04|-6.8273972602739725|
|   2011-01-17|2017-11-12 18:01:41| -6.824657534246575|
+-------------+-------------------+-------------------+
only showing top 5 rows



In [43]:
# split string
df.select(split(df['title'], ' ').alias('new')).show(4,False)

+-------------------------------------------------------------------------+
|new                                                                      |
+-------------------------------------------------------------------------+
|[we, want, to, talk, about, our, marriage]                               |
|[the, trump, presidency:, last, week, tonight, with, john, oliver, (hbo)]|
|[racist, superman, |, rudy, mancuso,, king, bach, &, lele, pons]         |
|[nickelback, lyrics:, real, or, fake?]                                   |
+-------------------------------------------------------------------------+
only showing top 4 rows



In [45]:
# working with arrays inside data frame
array_df = df.select("title",split('title', ' ').alias('title_array'))

array_df.select("title",array_contains(array_df['title_array'], "marriage")).show(5, False)

+--------------------------------------------------------------+-------------------------------------+
|title                                                         |array_contains(title_array, marriage)|
+--------------------------------------------------------------+-------------------------------------+
|we want to talk about our marriage                            |true                                 |
|the trump presidency: last week tonight with john oliver (hbo)|false                                |
|racist superman | rudy mancuso, king bach & lele pons         |false                                |
|nickelback lyrics: real or fake?                              |false                                |
|i dare you: going bald!?                                      |false                                |
+--------------------------------------------------------------+-------------------------------------+
only showing top 5 rows



In [46]:
# get rid of repeat values 
# get only unique values
array_df.select(array_distinct('title_array')).show(5, False)

+-------------------------------------------------------------------------+
|array_distinct(title_array)                                              |
+-------------------------------------------------------------------------+
|[we, want, to, talk, about, our, marriage]                               |
|[the, trump, presidency:, last, week, tonight, with, john, oliver, (hbo)]|
|[racist, superman, |, rudy, mancuso,, king, bach, &, lele, pons]         |
|[nickelback, lyrics:, real, or, fake?]                                   |
|[i, dare, you:, going, bald!?]                                           |
+-------------------------------------------------------------------------+
only showing top 5 rows



In [47]:
# Remove certian values
array_df.select(array_remove(array_df.title_array, "we")).show(5, False)

+-------------------------------------------------------------------------+
|array_remove(title_array, we)                                            |
+-------------------------------------------------------------------------+
|[want, to, talk, about, our, marriage]                                   |
|[the, trump, presidency:, last, week, tonight, with, john, oliver, (hbo)]|
|[racist, superman, |, rudy, mancuso,, king, bach, &, lele, pons]         |
|[nickelback, lyrics:, real, or, fake?]                                   |
|[i, dare, you:, going, bald!?]                                           |
+-------------------------------------------------------------------------+
only showing top 5 rows



In [52]:
# user define functions PySpark


def square(x):
    return int(x**2)

# convert python function to PySpark functions
square_udf = udf(lambda z: square(z), IntegerType())

df.select('dislikes',square_udf('dislikes').alias('likes_sq')).where(col('dislikes').isNotNull()).show()

+--------+--------+
|dislikes|likes_sq|
+--------+--------+
|    2966| 8797156|
|    6146|37773316|
|    5339|28504921|
|     666|  443556|
|    1989| 3956121|
|     511|  261121|
|    2445| 5978025|
|     778|  605284|
|     119|   14161|
|    1363| 1857769|
|      25|     625|
|     303|   91809|
|    1333| 1776889|
|    1171| 1371241|
|     246|   60516|
|      52|    2704|
|     638|  407044|
|      53|    2809|
|      36|    1296|
|     191|   36481|
+--------+--------+
only showing top 20 rows

