In [1]:
#innstall java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark 
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# install findspark using pip
!pip install -q findspark


### create a spark session.spark session allow us to access the features of spark through python

In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

### importing necessary spark functinality and nltk tool-kit for text preprocessing

In [4]:
from pyspark.sql.functions import udf, col, lower, regexp_replace,concat,lit,split
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import nltk
import string
import re
from nltk.stem.snowball import SnowballStemmer 
from pyspark.sql.functions import udf
import pyspark.sql.types as T
import pyspark.sql.functions as F
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk.corpus import wordnet as wn


### loading data

In [5]:
!wget -q !wget  https://www.dropbox.com/s/cn2utnr5ipathhh/all-the-news-2-1.zip?dl=0


In [6]:
! unzip /content/all-the-news-2-1.zip?dl=0


Archive:  /content/all-the-news-2-1.zip?dl=0
  inflating: all-the-news-2-1.csv    


In [7]:
data= spark.read.csv("/content/all-the-news-2-1.csv",header=True,inferSchema=True)


In [8]:
data.show()

+--------------------+--------------------+-------------------+----------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _c0|          Unnamed: 0|               date|            year|month| day|              author|               title|             article|                 url|             section|         publication|
+--------------------+--------------------+-------------------+----------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                   0|                   0|2016-12-09 18:31:00|            2016| 12.0|   9|         Lee Drutman|We should take co...|"This post is par...|             however| several critics ...|         for example|
|                   1|                   1|2016-10-07 21:26:46|            2016| 10.0|   7|         Scott Davis|Colts GM Ryan Gr

### concatenet columns containg text for easy evaluation

In [9]:
data = data.withColumn('text', F.concat(F.col('title'), F.col('article'))) 

In [10]:
data.show()

+--------------------+--------------------+-------------------+----------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _c0|          Unnamed: 0|               date|            year|month| day|              author|               title|             article|                 url|             section|         publication|                text|
+--------------------+--------------------+-------------------+----------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                   0|                   0|2016-12-09 18:31:00|            2016| 12.0|   9|         Lee Drutman|We should take co...|"This post is par...|             however| several critics ...|         for example|We should take co...|
|                   1|                   1|2

### droping another columns and keeps only combined text column

In [11]:
data = data.drop(*data.columns[:12])

In [12]:
data.show(truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                                text|
+----------------------------------------------------------------------------------------------------+
|We should take concerns about the health of liberal democracy seriously"This post is part of Poly...|
|Colts GM Ryan Grigson says Andrew Luck's contract makes it difficult to build the team The Indian...|
|                                                                                                null|
|Trump denies report he ordered Mueller firedDAVOS, Switzerland (Reuters) - U.S. President Donald ...|
|France's Sarkozy reveals his 'Passions' but insists no come-back on cardsPARIS (Reuters) - Former...|
|Paris Hilton: Woman In Black For Uncle Monty's Funeral"Paris Hilton arrived at LAX Wednesday dres...|
|ECB's Coeure: If we decide to cut rates, we'd have to consider tieringBE

### droping null values within the column

In [13]:
data = data.dropna()

In [14]:
data.show()

+--------------------+
|                text|
+--------------------+
|We should take co...|
|Colts GM Ryan Gri...|
|Trump denies repo...|
|France's Sarkozy ...|
|Paris Hilton: Wom...|
|ECB's Coeure: If ...|
|Venezuela detains...|
|You Can Trick You...|
|How to watch the ...|
|China is dismissi...|
|“Elizabeth Warren...|
|Hudson's Bay's ch...|
|Joakim Noah's Vic...|
|Jermaine Jackson ...|
|UK PM May presses...|
|Nancy Pelosi says...|
|The government of...|
|Mark Zuckerberg’s...|
|Girl Scouts Are T...|
|An Animated Maste...|
+--------------------+
only showing top 20 rows



In [15]:
returnType=T.ArrayType(T.StringType())

###text cleaning

In [16]:
#Clean text
df_clean = data.select (lower(regexp_replace('text', "[^a-zA-Z\\s]", "")).alias('text'))



In [17]:
df_clean.show()

+--------------------+
|                text|
+--------------------+
|we should take co...|
|colts gm ryan gri...|
|trump denies repo...|
|frances sarkozy r...|
|paris hilton woma...|
|ecbs coeure if we...|
|venezuela detains...|
|you can trick you...|
|how to watch the ...|
|china is dismissi...|
|elizabeth warren ...|
|hudsons bays chai...|
|joakim noahs vict...|
|jermaine jackson ...|
|uk pm may presses...|
|nancy pelosi says...|
|the government of...|
|mark zuckerbergs ...|
|girl scouts are t...|
|an animated maste...|
+--------------------+
only showing top 20 rows



In [18]:
# Tokenize text
tokenizer = Tokenizer(inputCol='text', outputCol='words_token')
df_words_token = tokenizer.transform(df_clean).select('words_token')


In [19]:

# Remove stop words
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
df_words_no_stopw = remover.transform(df_words_token).select('words_clean')


In [20]:
#stem_text
stemmer = SnowballStemmer(language='english')
stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], returnType)
df_stemmed = df_words_no_stopw.withColumn("words_stemmed",stemmer_udf("words_clean")).select('words_stemmed')


In [21]:

# Filter length word > 3
filter_length_udf = udf(lambda row: [x for x in row if len(x) >= 3], returnType)
df_final_words = df_stemmed.withColumn('words', filter_length_udf(col('words_stemmed')))


In [22]:
df_final_words.show()

+--------------------+--------------------+
|       words_stemmed|               words|
+--------------------+--------------------+
|[take, concern, h...|[take, concern, h...|
|[colt, gm, ryan, ...|[colt, ryan, grig...|
|[trump, deni, rep...|[trump, deni, rep...|
|[franc, sarkozi, ...|[franc, sarkozi, ...|
|[pari, hilton, wo...|[pari, hilton, wo...|
|[ecb, coeur, deci...|[ecb, coeur, deci...|
|[venezuela, detai...|[venezuela, detai...|
|[trick, brain, fo...|[trick, brain, fo...|
|[watch, googl, io...|[watch, googl, ke...|
|[china, dismiss, ...|[china, dismiss, ...|
|[elizabeth, warre...|[elizabeth, warre...|
|[hudson, bay, cha...|[hudson, bay, cha...|
|[joakim, noah, vi...|[joakim, noah, vi...|
|[jermain, jackson...|[jermain, jackson...|
|[uk, pm, may, pre...|[may, press, bid,...|
|[nanci, pelosi, s...|[nanci, pelosi, s...|
|[govern, offici, ...|[govern, offici, ...|
|[mark, zuckerberg...|[mark, zuckerberg...|
|[girl, scout, tra...|[girl, scout, tra...|
|[anim, masterpiec...|[anim, mas

In [23]:
df = df_final_words.drop("words_stemmed")


In [24]:
df.show()

+--------------------+
|               words|
+--------------------+
|[take, concern, h...|
|[colt, ryan, grig...|
|[trump, deni, rep...|
|[franc, sarkozi, ...|
|[pari, hilton, wo...|
|[ecb, coeur, deci...|
|[venezuela, detai...|
|[trick, brain, fo...|
|[watch, googl, ke...|
|[china, dismiss, ...|
|[elizabeth, warre...|
|[hudson, bay, cha...|
|[joakim, noah, vi...|
|[jermain, jackson...|
|[may, press, bid,...|
|[nanci, pelosi, s...|
|[govern, offici, ...|
|[mark, zuckerberg...|
|[girl, scout, tra...|
|[anim, masterpiec...|
+--------------------+
only showing top 20 rows



In [25]:
from pyspark.sql.functions import explode,col
words = df.withColumn('exploded_words', explode(col('words')))


In [26]:
data=words.drop('words')

In [27]:
data.show()

+--------------+
|exploded_words|
+--------------+
|          take|
|       concern|
|        health|
|         liber|
|     democraci|
|  seriouslythi|
|          post|
|          part|
|     polyarchi|
|      independ|
|          blog|
|        produc|
|         polit|
|        reform|
|       program|
|           new|
|       america|
|    washington|
|         think|
|          tank|
+--------------+
only showing top 20 rows



### grouping the words 

In [28]:
words_count = data.groupby("exploded_words").count()

### top 100 most occuring word

In [29]:
most_frequent_words=words_count.orderBy("count",ascending=False).show(100)


+--------------+-------+
|exploded_words|  count|
+--------------+-------+
|          said|4942494|
|          year|2566917|
|           new|2364457|
|         trump|2213562|
|           one|2118739|
|          like|1944089|
|        report|1904708|
|          time|1755639|
|         peopl|1722846|
|       compani|1708532|
|          also|1705723|
|           say|1671660|
|         state|1609934|
|         first|1329130|
|        presid|1324947|
|          make|1314710|
|       percent|1294333|
|          last|1272743|
|          work|1251001|
|           get|1242233|
|           two|1225066|
|       million|1164004|
|           use|1157855|
|        reuter|1064547|
|           day|1052805|
|          week|1019983|
|        includ|1002771|
|          take| 967602|
|          even| 951963|
|          show| 940870|
|        govern| 929440|
|         month| 923949|
|          call| 907596|
|          want| 904137|
|          back| 893424|
|          hous| 893167|
|           may| 872264|
