Setting up the environment

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

Setting the environment variables

In [0]:
import os

# Tell the PySpark launcher where the Java 8 runtime and the unpacked Spark
# distribution live, and ask for a local master with two worker threads.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
        "PYSPARK_SUBMIT_ARGS": "--master local[2] pyspark-shell",
    }
)

Initializing SparkContext

In [0]:
import findspark
# With no argument, findspark takes the Spark location from the SPARK_HOME
# environment variable rather than from a second, relative copy of the path.
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkContext
# getOrCreate is a classmethod: called on the class it reuses a running
# context. `SparkContext().getOrCreate()` constructs a new context first and
# therefore raises when this cell is executed a second time.
sc = SparkContext.getOrCreate()

Importing text file newdata.txt

In [0]:
# Read newdata.txt into an RDD of strings, one element per line of the file,
# and display its full contents.
lines = sc.textFile("newdata.txt")
lines.collect()

['1.414213562373095048801688724209698078569671875376948073176679737990732478462107038850387534327641572735013846230912297024924836055850737212644121497099935831413222665927505592755799950501152782060571470109559971605970274534596862014728517418640889198609552329230484308714321450839762603627995251407989687253396546331808829640620615258352395054745750287759961729835575220337531857011354374603408498847160386899970699004150305440277903164542478230684929369186215805784631115966687130130156185689872372352885092648612494977154218334204285686060146824720771435854874155657069677653720226485447015858801620758474922657226002085584466521458398893944370926591800311388246468157082630100594858704003186480342194897278290641045072636881313739855256117322040245091227700226941127573627280495738108967504018369868368450725799364729060762996941380475654823728997180326802474420629269124859052181004459842150591120249441341728531478105803603371077309182869314710171111683916581726889419758716582152128229518488

Calculating Bigrams

In [0]:
def _line_bigrams(line):
    """Return the adjacent-character pairs of one input line.

    Surrounding whitespace is stripped and every "." is removed first, so the
    characters on either side of a decimal point count as adjacent. A line
    with fewer than two remaining characters yields an empty list.
    """
    chars = line.strip().replace(".", "")
    # zip already yields 2-tuples, so no extra tuple() conversion is needed.
    return list(zip(chars, chars[1:]))


# Lazy RDD holding one (first, second) character tuple per adjacent pair,
# computed independently for each line of the file.
bigrams = lines.flatMap(_line_bigrams)


PythonRDD[24] at RDD at PythonRDD.scala:53

Printing Bigrams

In [0]:
# Materialize the bigram RDD on the driver and display every pair, in order.
bigrams.collect()

[('1', '4'),
 ('4', '1'),
 ('1', '4'),
 ('4', '2'),
 ('2', '1'),
 ('1', '3'),
 ('3', '5'),
 ('5', '6'),
 ('6', '2'),
 ('2', '3'),
 ('3', '7'),
 ('7', '3'),
 ('3', '0'),
 ('0', '9'),
 ('9', '5'),
 ('5', '0'),
 ('0', '4'),
 ('4', '8'),
 ('8', '8'),
 ('8', '0'),
 ('0', '1'),
 ('1', '6'),
 ('6', '8'),
 ('8', '8'),
 ('8', '7'),
 ('7', '2'),
 ('2', '4'),
 ('4', '2'),
 ('2', '0'),
 ('0', '9'),
 ('9', '6'),
 ('6', '9'),
 ('9', '8'),
 ('8', '0'),
 ('0', '7'),
 ('7', '8'),
 ('8', '5'),
 ('5', '6'),
 ('6', '9'),
 ('9', '6'),
 ('6', '7'),
 ('7', '1'),
 ('1', '8'),
 ('8', '7'),
 ('7', '5'),
 ('5', '3'),
 ('3', '7'),
 ('7', '6'),
 ('6', '9'),
 ('9', '4'),
 ('4', '8'),
 ('8', '0'),
 ('0', '7'),
 ('7', '3'),
 ('3', '1'),
 ('1', '7'),
 ('7', '6'),
 ('6', '6'),
 ('6', '7'),
 ('7', '9'),
 ('9', '7'),
 ('7', '3'),
 ('3', '7'),
 ('7', '9'),
 ('9', '9'),
 ('9', '0'),
 ('0', '7'),
 ('7', '3'),
 ('3', '2'),
 ('2', '4'),
 ('4', '7'),
 ('7', '8'),
 ('8', '4'),
 ('4', '6'),
 ('6', '2'),
 ('2', '1'),
 ('1', '0'),

Counting bigram occurrences

In [0]:
# Pair every bigram with a 1, then add up the ones for each distinct bigram.
counts = (
    bigrams
    .map(lambda bigram: (bigram, 1))
    .reduceByKey(lambda total, increment: total + increment)
)
counts.collect()

[(('1', '4'), 14),
 (('4', '1'), 13),
 (('3', '5'), 8),
 (('5', '6'), 8),
 (('6', '2'), 12),
 (('2', '3'), 9),
 (('3', '7'), 12),
 (('7', '3'), 11),
 (('0', '9'), 10),
 (('0', '4'), 11),
 (('4', '8'), 13),
 (('8', '8'), 13),
 (('8', '0'), 13),
 (('0', '1'), 14),
 (('7', '2'), 19),
 (('9', '8'), 10),
 (('6', '7'), 4),
 (('1', '8'), 14),
 (('7', '5'), 13),
 (('5', '3'), 7),
 (('7', '6'), 6),
 (('9', '4'), 9),
 (('6', '6'), 5),
 (('9', '9'), 14),
 (('9', '0'), 6),
 (('3', '2'), 7),
 (('8', '4'), 13),
 (('1', '0'), 9),
 (('2', '7'), 12),
 (('5', '7'), 13),
 (('9', '1'), 9),
 (('2', '2'), 11),
 (('4', '9'), 9),
 (('3', '6'), 9),
 (('5', '5'), 9),
 (('2', '6'), 14),
 (('4', '4'), 8),
 (('6', '5'), 10),
 (('1', '1'), 11),
 (('5', '2'), 11),
 (('4', '0'), 8),
 (('0', '8'), 8),
 (('8', '9'), 9),
 (('1', '9'), 3),
 (('2', '5'), 5),
 (('3', '3'), 5),
 (('6', '3'), 4),
 (('7', '7'), 7),
 (('0', '0'), 7),
 (('8', '1'), 6),
 (('4', '2'), 9),
 (('2', '1'), 13),
 (('1', '3'), 11),
 (('3', '0'), 10),
 

Sorting based on occurrences

In [0]:
# Order the (bigram, count) pairs by count, smallest first.
count_sorted = counts.sortBy(
    keyfunc=lambda bigram_count: bigram_count[1],
    ascending=True,
)
count_sorted.collect()

[(('1', '9'), 3),
 (('5', '1'), 3),
 (('6', '7'), 4),
 (('6', '3'), 4),
 (('6', '6'), 5),
 (('2', '5'), 5),
 (('3', '3'), 5),
 (('9', '3'), 5),
 (('6', '1'), 5),
 (('7', '6'), 6),
 (('9', '0'), 6),
 (('8', '1'), 6),
 (('3', '4'), 6),
 (('4', '3'), 6),
 (('5', '3'), 7),
 (('3', '2'), 7),
 (('7', '7'), 7),
 (('0', '0'), 7),
 (('7', '8'), 7),
 (('1', '7'), 7),
 (('7', '9'), 7),
 (('3', '8'), 7),
 (('7', '4'), 7),
 (('3', '9'), 7),
 (('3', '5'), 8),
 (('5', '6'), 8),
 (('4', '4'), 8),
 (('4', '0'), 8),
 (('0', '8'), 8),
 (('9', '5'), 8),
 (('1', '6'), 8),
 (('2', '3'), 9),
 (('9', '4'), 9),
 (('1', '0'), 9),
 (('9', '1'), 9),
 (('4', '9'), 9),
 (('3', '6'), 9),
 (('5', '5'), 9),
 (('8', '9'), 9),
 (('4', '2'), 9),
 (('4', '6'), 9),
 (('1', '2'), 9),
 (('9', '2'), 9),
 (('2', '8'), 9),
 (('5', '4'), 9),
 (('0', '9'), 10),
 (('9', '8'), 10),
 (('6', '5'), 10),
 (('3', '0'), 10),
 (('2', '9'), 10),
 (('0', '2'), 10),
 (('8', '3'), 10),
 (('0', '6'), 10),
 (('4', '5'), 10),
 (('8', '6'), 10),


The least frequent bigrams are (1, 9) and (5, 1); each occurs 3 times.


The most frequent bigram is (7, 2); it occurs 19 times.