In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Download and unpack the Spark 3.2.0 (Hadoop 3.2 build) archive into the
# current directory, then install findspark for locating that Spark install.
# NOTE(review): the Java 8 install below is disabled, yet JAVA_HOME is later
# pointed at a Java 8 JDK path -- presumably the JDK is already present on the
# machine; confirm before relying on a fresh environment.
#!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os
# Tell Spark where Java and the unpacked Spark distribution live.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Store SPARK_HOME as an absolute path so it keeps resolving to the same
# directory even if the notebook's working directory changes after this cell.
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.2.0-bin-hadoop3.2")

In [None]:
import findspark
# Called with no arguments; presumably this picks up the Spark install from the
# SPARK_HOME environment variable set in the previous cell and makes pyspark
# importable -- confirm against the findspark README.
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Get (or create) a Spark session using the "local[*]" master, and keep a
# handle on its SparkContext for the RDD API used in the rest of the notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext
sc

In [None]:
import pyspark.sql
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import functions

In [None]:
# Load everything under the data directory as an RDD of raw text lines
articles = sc.textFile('assignment2/data')

# Parse every line as a JSON document (json.loads is applied to each line)
data = articles.map(json.loads)

In [None]:
# Peek at a single parsed record to see which fields an article carries
data.take(1)

[{'city': 'dibrugarh',
  'code': '14817500',
  'title': 'Six militants lay down arms at Jairampur',
  'text': "DIBRUGARH: Six militants, four from NSCN (K), one each from the NDFB and Ulfa laid down arms at the 18 Assam Rifles base camp at Jairampur in Changlang district of Arunachal Pradesh on Tuesday.The surrendered militants were identified as self-styled private Hokato Sema, Nongya Langching, Limnong Mossang and corporal Panba Wangsu of the NSCN (K), Bijoy Basumatary (NDFB) and Bhaben Hazarika of the Ulfa's anti-talks faction. They surrendered three 9 mm pistols, two 7.65 mm pistols, a .38 revolver and 2.9 kg of RDX.Brigadier Paritosh Pant, the commander of 25 sector Assam Rifles, who oversaw the surrender, said security forces have succeeded in creating a secure environment in Arunachal Pradesh's Changlang district and nearby areas by their relentless and sustained operation against militants groups. He also assured proper rehabilitation of the surrendered militants and appealed t

In [None]:
def preprocessing(x):
    """Count how often each space-separated token occurs in the articles.

    x is an RDD-like collection of article records. Each record's 'text'
    value is lower-cased and split on single space characters.

    Returns the (token, count) pairs produced by reduceByKey.
    """
    # taking only the text part of each article, lower-cased and split at spaces
    tokens = x.flatMap(lambda article: article['text'].lower().split(' '))

    # pair every token with 1, then add the 1s up per token
    token_ones = tokens.map(lambda token: (token, 1))
    return token_ones.reduceByKey(lambda left, right: left + right)


In [None]:
# Preprocessing all the articles by month and saving as a dictionary.
# Keys are the month numbers 1-12; values are the per-month word-frequency
# RDDs returned by preprocessing().
freq_month = {}

for i in range(1, 13):
    # Zero-pad the month so one format covers both '2012-01' and '2012-10'
    month_prefix = '2012-{:02d}'.format(i)

    # Bind the prefix as a default argument so the lambda carries its own copy
    # of the value and does not depend on when the closure gets serialized
    # relative to the loop variable changing.
    data_month = data.filter(
        lambda x, prefix=month_prefix: x['date'].startswith(prefix)
    )
    freq_word = preprocessing(data_month)
    freq_month[i] = freq_word

freq_month

{1: PythonRDD[453] at RDD at PythonRDD.scala:53,
 2: PythonRDD[454] at RDD at PythonRDD.scala:53,
 3: PythonRDD[455] at RDD at PythonRDD.scala:53,
 4: PythonRDD[456] at RDD at PythonRDD.scala:53,
 5: PythonRDD[457] at RDD at PythonRDD.scala:53,
 6: PythonRDD[458] at RDD at PythonRDD.scala:53,
 7: PythonRDD[459] at RDD at PythonRDD.scala:53,
 8: PythonRDD[460] at RDD at PythonRDD.scala:53,
 9: PythonRDD[461] at RDD at PythonRDD.scala:53,
 10: PythonRDD[462] at RDD at PythonRDD.scala:53,
 11: PythonRDD[463] at RDD at PythonRDD.scala:53,
 12: PythonRDD[464] at RDD at PythonRDD.scala:53}

In [None]:
# Merging each month's RDD to get the total word frequency counts.
# cache() because freq_data is queried again by several later cells.
freq_data = (
    sc.union(list(freq_month.values()))
    .reduceByKey(lambda x, y: x + y)
    .cache()
)

# Keeping only the words that occur at least MIN_WORD_FREQ times
# (i.e. dropping the words with frequency below the threshold)
MIN_WORD_FREQ = 10
freq_count = freq_data.filter(lambda x: x[1] >= MIN_WORD_FREQ).collect()

# Show only a small preview; the collected list is far too long to display
freq_count[:20]

[('also', 150597),
 ("ashwini's", 13),
 ('less', 8316),
 ('state', 88619),
 ('discipline', 683),
 ('facility', 3703),
 ('valve', 145),
 ('chamber', 1371),
 ('food,', 901),
 ('ministers', 4705),
 ('members,', 1987),
 ('organize', 1569),
 ('designer', 954),
 ("jnu's", 14),
 ('coimbatore', 2174),
 ('&amp;', 5617),
 ('trainers.', 15),
 ('pradhan', 737),
 ('agreeing', 340),
 ('venue', 1757),
 ('blood', 5607),
 ('allegedly,', 28),
 ('adding:', 178),
 ('permissions', 530),
 ('cheer', 429),
 ('landscape', 351),
 ('farms', 437),
 ('match-fixing', 43),
 ('children,', 2155),
 ('refreshingly', 13),
 ('beijing', 371),
 ('1966', 56),
 ('(pbd)', 11),
 ('predictable', 83),
 ('ladies', 680),
 ('age-old', 247),
 ('nodal', 831),
 ('18%,', 20),
 ('lankans', 55),
 ('acceptable', 431),
 ('persons.', 722),
 ('mockery', 159),
 ("microsoft's", 75),
 ('printers', 82),
 ('taluka', 1072),
 ('nscn', 248),
 ('user', 919),
 ('(27)', 196),
 ('guilty,"', 85),
 ('mgm', 151),
 ('away,', 810),
 ('assam,', 509),
 ('costly

In [None]:
# 1 . Total size of the output data (after the filtering)

# number of (word, count) pairs left in the collected, filtered list
n_filtered_words = len(freq_count)
print('Total size of the output data (after the filtering) is ', n_filtered_words)

Total size of the output data (after the filtering) is  110622


In [None]:
# 2. Frequency of the following words – congress, london, washington, football

# keep only the total-frequency pairs whose word is one of the requested words
words_of_interest = ['congress', 'london', 'washington', 'football']
freq_specific_words = freq_data.filter(lambda pair: pair[0] in words_of_interest)
freq_specific_words.collect()

[('congress', 28093),
 ('washington', 546),
 ('london', 2389),
 ('football', 1387)]

In [None]:

# 3. The word with maximum frequency for each month

# for every month, pick the (word, count) pair with the largest count
for month_no, month_freq in freq_month.items():
    top_pair = month_freq.max(key=lambda pair: pair[1])
    print(month_no, top_pair)


1 ('the', 245413)
2 ('the', 266950)
3 ('the', 283107)
4 ('the', 234165)
5 ('the', 309020)
6 ('the', 267383)
7 ('the', 294923)
8 ('the', 282393)
9 ('the', 264447)
10 ('the', 279876)
11 ('the', 305296)
12 ('the', 305414)


In [None]:
# 4. List of words appeared on ‘2012-09-01’ but not on ‘2012-08-01’

# filtering the articles that belong to 2012-08-01 and 2012-09-01 dates
data_august = data.filter(lambda x: x['date'] == '2012-08-01')
data_sept = data.filter(lambda x: x['date'] == '2012-09-01')

# preprocessing the data to get the distinct words used on each date
words_august = preprocessing(data_august).map(lambda x: x[0]).collect()
words_sept = preprocessing(data_sept).map(lambda x: x[0]).collect()

# A set gives constant-time membership tests; scanning the august list once
# per september word would be quadratic in the vocabulary size.
august_vocab = set(words_august)

# words that are in the 2012-09-01 articles but not in the 2012-08-01 articles,
# kept in the order they were collected
words = [word for word in words_sept if word not in august_vocab]

words


['valencia',
 'pmch,',
 "session's",
 'bhiwapur,',
 'tearful',
 'conventional',
 'age-old',
 "finger'",
 'regroup',
 'sachdev',
 'open:',
 'occupied.lda',
 'shrikanth,',
 'viswanathan.in',
 '(nvda),',
 'studies),',
 'mpakvn,',
 'forklift',
 'box)."forty',
 'waste)',
 'dangi,',
 'cyanide,',
 'speak,',
 'friday.kakodkar',
 'madina-tul-ilm,',
 'rukmani',
 'infantry',
 'hafizabad',
 'guarantees',
 'andrea',
 'everything."',
 'bureaucracy.',
 'narration',
 'officals',
 'paperwork',
 'cautious.',
 'boyfriends,',
 'case".full',
 'consciousness.on',
 'perspective."extra',
 'ton.',
 'apwuj)',
 'venkateshwarlu',
 'cmcm',
 'crore.emerging',
 'prabodhini',
 'that.in',
 '(beach',
 'talwar?',
 '(illegal',
 'gyanganga',
 'trainers.',
 'september.but',
 'argued.the',
 'jasmer',
 'bennett,',
 'india\'."tikekar,',
 'unauthorised,',
 'maqbool',
 'mockery',
 'birthday."we',
 'allahabad.the',
 'parkings',
 '978.54',
 'unsafe',
 'conserved',
 'imprisonment.',
 'railings',
 'interrogation.cases',
 'completio

In [None]:
# 5. Frequency of the word ‘monsoon’ for all months

def _is_monsoon(pair):
    """Return True for a (word, count) pair whose word is 'monsoon'."""
    return pair[0] == 'monsoon'

# per-month frequency of the word
for month_no, month_freq in freq_month.items():
    print(month_no, month_freq.filter(_is_monsoon).collect())

# frequency of the word in the merged, all-months counts
print('Total frequency of word monsoon in all months is :', freq_data.filter(_is_monsoon).collect())

1 [('monsoon', 42)]
2 [('monsoon', 52)]
3 [('monsoon', 81)]
4 [('monsoon', 121)]
5 [('monsoon', 334)]
6 [('monsoon', 934)]
7 [('monsoon', 904)]
8 [('monsoon', 505)]
9 [('monsoon', 410)]
10 [('monsoon', 252)]
11 [('monsoon', 113)]
12 [('monsoon', 59)]
Total frequency of word monsoon in all months is : [('monsoon', 3807)]
