In [32]:
from __future__ import print_function
import sys
from operator import add
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import to_date

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
df = spark.read.csv('drive/My Drive/Colab Notebooks/news.csv', inferSchema=True, header=True)

In [3]:
df.show()

+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|             authors|               title|        publish_date|         description|                text|                 url|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|        ['Cbc News']|Coronavirus a 'wa...| 2020-03-27 08:00:00|Canadian pharmaci...|"Canadian pharmac...|"" he said.  Tadr...|
|  1|        ['Cbc News']|Yukon gov't names...| 2020-03-27 01:45:00|The Yukon governm...|"The Yukon govern...|  from March 9 to 13|
|  2|['The Associated ...|U.S. Senate passe...| 2020-03-26 05:13:00|The Senate has pa...|"The Senate late ...|"" said Democrati...|
|  3|        ['Cbc News']|Coronavirus: The ...| 2020-03-27 00:36:00|Scientists around...|"Scientists aroun...| "" said Zarychanski|
|  4|        ['Cbc News']|The latest on the...| 2020-03-26 20:57:00|The late

In [17]:
df.select("publish_date").show()

+--------------------+
|        publish_date|
+--------------------+
| 2020-03-27 08:00:00|
| 2020-03-27 01:45:00|
| 2020-03-26 05:13:00|
| 2020-03-27 00:36:00|
| 2020-03-26 20:57:00|
| 2020-03-27 08:00:00|
| 2020-03-27 08:00:00|
| 2020-03-26 11:02:00|
| 2020-03-26 14:55:00|
| 2020-03-27 08:00:00|
| 2020-03-26 14:18:00|
|            'Follow'|
| 2020-03-27 08:00:00|
| 'James Dunne On ...|
| 2020-03-26 12:00:00|
| 'He Spent Eight ...|
| 2020-03-27 08:30:00|
| 2020-03-11 00:15:00|
|         'Twitter']"|
| 2020-03-27 08:00:00|
+--------------------+
only showing top 20 rows



In [61]:
#convert timestamp to date
df2 = df.withColumn("date",
              to_date(df.publish_date, "yyyy-MM-dd"))

#pull hour from timestamp
from pyspark.sql.functions import col, hour, minute, second

df2 = df2.withColumn("hour", hour(col("publish_date"))).withColumn("minute", minute(col("publish_date"))).withColumn("second", second(col("publish_date")))

df2.show()

+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----+------+------+
|_c0|             authors|               title|        publish_date|         description|                text|                 url|      date|hour|minute|second|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----+------+------+
|  0|        ['Cbc News']|Coronavirus a 'wa...| 2020-03-27 08:00:00|Canadian pharmaci...|"Canadian pharmac...|"" he said.  Tadr...|2020-03-27|   8|     0|     0|
|  1|        ['Cbc News']|Yukon gov't names...| 2020-03-27 01:45:00|The Yukon governm...|"The Yukon govern...|  from March 9 to 13|2020-03-27|   1|    45|     0|
|  2|['The Associated ...|U.S. Senate passe...| 2020-03-26 05:13:00|The Senate has pa...|"The Senate late ...|"" said Democrati...|2020-03-26|   5|    13|     0|
|  3|        ['Cbc News']|Co

In [68]:
df3 = df2.filter(df2.hour.isNotNull())
df3 = df3.filter(df3.authors!="[]")
df3.show()

+---+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+----------+----+------+------+
|_c0|             authors|               title|       publish_date|         description|                text|                 url|      date|hour|minute|second|
+---+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+----------+----+------+------+
|  0|        ['Cbc News']|Coronavirus a 'wa...|2020-03-27 08:00:00|Canadian pharmaci...|"Canadian pharmac...|"" he said.  Tadr...|2020-03-27|   8|     0|     0|
|  1|        ['Cbc News']|Yukon gov't names...|2020-03-27 01:45:00|The Yukon governm...|"The Yukon govern...|  from March 9 to 13|2020-03-27|   1|    45|     0|
|  2|['The Associated ...|U.S. Senate passe...|2020-03-26 05:13:00|The Senate has pa...|"The Senate late ...|"" said Democrati...|2020-03-26|   5|    13|     0|
|  3|        ['Cbc News']|Coronavi

In [94]:
df3.select("hour").show()

+----+
|hour|
+----+
|   8|
|   1|
|   5|
|   0|
|  20|
|   8|
|   8|
|  11|
|  14|
|   8|
|  14|
|   8|
|  12|
|   8|
|  21|
|  11|
|   0|
|   8|
|  17|
|  18|
+----+
only showing top 20 rows



In [107]:
df3.select("minute").show()

+------+
|minute|
+------+
|     0|
|    45|
|    13|
|    36|
|    57|
|     0|
|     0|
|     2|
|    55|
|     0|
|    18|
|     0|
|     0|
|     0|
|     3|
|    17|
|    38|
|     0|
|    54|
|    22|
+------+
only showing top 20 rows



In [108]:
df3.select("second").show()

+------+
|second|
+------+
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
|     0|
+------+
only showing top 20 rows

