In [1]:
# Environment setup for Colab.
# Downloads the John Snow Labs Colab setup script and pipes it straight into bash.
# NOTE(review): this executes a remotely fetched script without pinning or verifying it.
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash
# Additional Python packages installed into the runtime (versions are not pinned).
!pip install transformers
!pip install spacy
!pip install yake

--2021-11-27 18:17:07--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-11-27 18:17:07--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-11-27 18:17:08--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [2]:
# Mount Google Drive at /content/drive; later cells read their input files from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

In [None]:
#!unzip '/content/drive/MyDrive/all_the_news.zip'

In [4]:
from glob import glob, iglob

# Locate the cleaned YouTube JSON export somewhere under the mounted Drive.
# iglob is lazy, so the recursive walk stops at the first matching path instead
# of listing the entire Drive before picking element [0].
fp = next(
    (
        path
        for path in iglob('/content/drive/**', recursive=True)
        if 'youtube_clean_' in path and '.json' in path
    ),
    None,
)
if fp is None:
    # Fail with an explicit message rather than an opaque IndexError.
    raise FileNotFoundError(
        "No file containing 'youtube_clean_' and '.json' in its path was found "
        "under /content/drive; is Google Drive mounted?"
    )

# Location of the "All the News" CSV on the mounted Drive.
fp_news = '/content/drive/MyDrive/all-the-news-2-1.csv'

In [5]:
import sparknlp
from pyspark.sql import SparkSession

# Start the Spark session with Spark NLP available.
#
# The original cell followed this call with a second
# SparkSession.builder...getOrCreate() chain (local[4], 16G driver memory,
# spark-nlp_2.12:3.3.2 jar). Because a session already exists at that point,
# getOrCreate() simply returned this same session: the master, driver memory
# and jar settings were never applied, and the hard-coded jar version could
# disagree with the installed Python package. The dead builder is removed so
# the cell reflects the session that is actually used.
# To customise resources, pass arguments to sparknlp.start() or build the
# session manually *instead of* calling sparknlp.start(), not in addition to it.
spark = sparknlp.start()

In [6]:
from pyspark.sql.functions import col, year, month, udf, concat, explode
from pyspark.sql.types import StringType, ArrayType

In [7]:
# Spark UDF that forwards its input to `pre_process` and is declared to return
# an array of strings.
# NOTE(review): `pre_process` is not defined in any visible cell. The lambda defers
# the name lookup until the UDF is executed, so this line succeeds, but applying
# the UDF would fail unless `pre_process` is defined elsewhere -- confirm or remove.
preprocess_article = udf(lambda x: pre_process(x), ArrayType(StringType()) )

In [8]:
# Read the news CSV into a Spark DataFrame: header row, inferred column types,
# comma-separated, double-quote used for both quoting and escaping, and the
# multiline option enabled.
news = (
    spark.read
    .format("org.apache.spark.csv")
    .option("multiline", "true")
    .csv(
        fp_news,
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='\"',
    )
)

# Quick look at the first three rows.
news.show(3)

+---+----------+-------------------+----+-----+---+-----------+--------------------+--------------------+--------------------+-------+----------------+
|_c0|Unnamed: 0|               date|year|month|day|     author|               title|             article|                 url|section|     publication|
+---+----------+-------------------+----+-----+---+-----------+--------------------+--------------------+--------------------+-------+----------------+
|  0|         0|2016-12-09 18:31:00|2016| 12.0|  9|Lee Drutman|We should take co...|This post is part...|https://www.vox.c...|   null|             Vox|
|  1|         1|2016-10-07 21:26:46|2016| 10.0|  7|Scott Davis|Colts GM Ryan Gri...| The Indianapolis...|https://www.busin...|   null|Business Insider|
|  2|         2|2018-01-26 00:00:00|2018|  1.0| 26|       null|Trump denies repo...|DAVOS, Switzerlan...|https://www.reute...|  Davos|         Reuters|
+---+----------+-------------------+----+-----+---+-----------+--------------------+----

In [9]:
from sparknlp.annotator import *
from sparknlp.base import *

In [10]:
# Stage 1: turn the raw 'article' text column into a 'document' annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol('article')
    .setOutputCol('document')
)

# Stage 2: split each document into sentences.
sentence_detector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

# Stage 3: tokenize the sentences, with an explicit list of context characters.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentences'])
    .setOutputCol('tokens')
    .setContextChars(['(', ')', '?', '!', '.', ','])
)

# Stage 4: YAKE keyword extraction over the tokens, configured for 2- to 5-gram
# candidates, 5 keywords, and the default stop-word list of StopWordsCleaner.
default_stop_words = StopWordsCleaner().getStopWords()
keywords = (
    YakeKeywordExtraction()
    .setInputCols(['tokens'])
    .setOutputCol('keywords')
    .setMinNGrams(2)
    .setMaxNGrams(5)
    .setNKeywords(5)
    .setStopWords(default_stop_words)
)

# Assemble the stages in order, fit on the news DataFrame, then apply the fitted
# pipeline to the same DataFrame.
news_stages = [document_assembler, sentence_detector, tokenizer, keywords]
pipeline_news = Pipeline(stages=news_stages)
model_pipeline = pipeline_news.fit(news)
news_transformed = model_pipeline.transform(news)

In [11]:
def _distinct_results(annotations):
    """Return the de-duplicated `result` values of a sequence of annotations."""
    return list({annotation.result for annotation in annotations})


# UDF that reduces the 'keywords' annotation array to its unique keyword strings.
extract_keywords = udf(_distinct_results, ArrayType(StringType()))

# Add the plain keyword list, cast 'month' to integer, and build a 'year_month'
# key by concatenating year and the (already cast) month.
news_transformed = (
    news_transformed
    .withColumn('keywords_only', extract_keywords(col('keywords')))
    .withColumn('month', col('month').cast('integer'))
    .withColumn('year_month', concat(col('year'), col('month')))
)

# One row per (year_month, keyword); explode() names its output column 'col'.
news_transformed_exploded = news_transformed.select(
    col('year_month'),
    explode(col('keywords_only')),
)

# Count how many articles carry each keyword in each year_month bucket.
news_transformed_keyword_count = (
    news_transformed_exploded
    .groupBy('year_month', 'col')
    .count()
)

In [12]:
# Preview three rows of the per-(year_month, keyword) counts.
news_transformed_keyword_count.show(3)

+----------+------------------+-----+
|year_month|               col|count|
+----------+------------------+-----+
|     20196|benoit coeure said|    5|
|     20162| republican senate|   11|
|     20174|   united airlines|  208|
+----------+------------------+-----+
only showing top 3 rows



In [None]:
# Write the keyword counts to Drive as JSON. coalesce(1) reduces the DataFrame to a
# single partition before writing.
# NOTE(review): no save mode is set, so a re-run with the target path already
# present is expected to fail under Spark's default mode -- confirm desired behaviour.
(
    news_transformed_keyword_count
    .coalesce(1)
    .write
    .format('json')
    .save('/content/drive/MyDrive/news_keywords_count.json')
)