In [None]:
 import pandas as pd
 import numpy as np


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os
# Locations of the Java runtime and of the Spark distribution unpacked by the setup cell.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Resolve to an absolute path now, so the setting stays valid even if the
# working directory changes later in the session.
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.2.0-bin-hadoop3.2")

In [None]:
import findspark
# Make the pyspark package importable; relies on the SPARK_HOME variable set above
findspark.init()

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session; "local[*]" runs Spark locally with one worker thread per core
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
# Bare last expression: show the SparkContext as the cell output
sc

In [None]:
# Read the stop-word file as an RDD with one element per line
# (presumably one stop word per line -- TODO confirm against the file)
stopwords = sc.textFile('/content/stopwords.txt')

In [None]:
# Bring the stop words back to the driver as a plain Python list
stopwords_list = stopwords.collect()

In [None]:
# Read all file names in the folder

In [None]:
# Folder holding the input files. Keep the trailing slash: the index builders
# below join it to each file name by plain string concatenation.
path = '/content/'

In [None]:
# Names of every entry in the folder (files and sub-directories alike, in arbitrary order)
file_list = os.listdir(path)

In [None]:
# Keep only the '.txt.txt' input files. Build a new list instead of calling
# remove() while iterating: removing during iteration shifts the remaining
# items left, so the entry right after each removed one is never examined
# and unwanted names can survive the filter.
file_list = [name for name in file_list if '.txt.txt' in name]

In [None]:
# Sort in place so the files are processed in name order
file_list.sort()

In [None]:
# Functions that read each text file in turn and build a dictionary: word -> [(file number, count), ...]
# Version with lambda functions

def word_dict_lambda(path,file_list,stopwords_list):
  """Build an inverted index of word counts across text files (lambda version).

  Each file is read through the notebook-level SparkContext ``sc``, lower-cased,
  split on whitespace, filtered against the stop words and counted with
  ``reduceByKey``. The per-file counts are then merged on the driver.

  Args:
    path: Prefix prepended verbatim to each file name (no separator is added).
    file_list: File names to index. The text before the first '.' of each
      name is used as that file's identifier.
    stopwords_list: Iterable of words to leave out of the index.

  Returns:
    dict mapping each word to a list of ``(file_num, count)`` tuples, one per
    file that contains the word, in ``file_list`` order.
  """
  # Set membership is O(1); a list would be scanned linearly for every word.
  stop_words = frozenset(stopwords_list)

  dict_words = {}
  for file_name in file_list:
    file_num = file_name.split('.')[0]
    word_counts = (
        sc.textFile(path + file_name)
        .flatMap(lambda line: line.lower().split())
        .filter(lambda word: word not in stop_words)
        .map(lambda word: (word, 1))
        .reduceByKey(lambda x, y: x + y)
        .collect()
    )
    # Merge this file's counts into the index (distinct loop names, so the
    # outer file variable is not shadowed).
    for word, count in word_counts:
      dict_words.setdefault(word, []).append((file_num, count))

  return dict_words


# Version without lambda functions

def word_dict_nolambda(path,file_list,stopwords_list):
  """Build an inverted index of word counts across text files (named-function version).

  Each file is read through the notebook-level SparkContext ``sc``, lower-cased,
  split on whitespace, filtered against the stop words and counted with
  ``reduceByKey``. The per-file counts are then merged on the driver. Every
  Spark transformation is given a named helper instead of a lambda.

  Args:
    path: Prefix prepended verbatim to each file name (no separator is added).
    file_list: File names to index. The text before the first '.' of each
      name is used as that file's identifier.
    stopwords_list: Iterable of words to leave out of the index.

  Returns:
    dict mapping each word to a list of ``(file_num, count)`` tuples, one per
    file that contains the word, in ``file_list`` order.
  """
  # Set membership is O(1); a list would be scanned linearly for every word.
  stop_words = frozenset(stopwords_list)

  def lower_split(x):
    # One line of text -> its lower-cased, whitespace-separated words
    return x.lower().split()
  def filter_words(x):
    # Predicate for RDD.filter: True for words that should be kept
    return x not in stop_words
  def mapping(x):
    # Pair each word with an initial count of 1
    return (x,1)
  def frequency(x,y):
    # Combine two partial counts of the same word
    return x+y

  dict_words = {}
  for file_name in file_list:
    file_num = file_name.split('.')[0]
    file_read = sc.textFile(path+file_name)
    file_lower = file_read.flatMap(lower_split)
    file_filter = file_lower.filter(filter_words)
    file_map = file_filter.map(mapping)
    file_reduce = file_map.reduceByKey(frequency)
    reduce_list = file_reduce.collect()
    # Merge this file's counts into the index (distinct loop names, so the
    # outer file variable is not shadowed).
    for word, count in reduce_list:
      dict_words.setdefault(word, []).append((file_num, count))

  return dict_words

In [None]:
# Build the word -> [(file number, count), ...] index using the lambda-based builder
dict_words = word_dict_lambda(path, file_list, stopwords_list)

In [None]:
# Display the entire index as the cell output (one entry per distinct word, so this can be long)
dict_words

{'10': [('01', 1), ('08', 1), ('20', 1)],
 'days,': [('01', 1)],
 "nasa's": [('01', 1)],
 'cassini': [('01', 2), ('09', 3)],
 'step': [('01', 1), ('06', 1)],
 'months-long': [('01', 1)],
 'rings': [('01', 1)],
 'scientists': [('01', 1), ('04', 1), ('14', 1)],
 'unprecedented': [('01', 1)],
 'sixth': [('01', 1)],
 'sun.': [('01', 1)],
 'end': [('01', 2), ('06', 1), ('10', 1)],
 'mission': [('01', 1)],
 'opened': [('01', 1)],
 'eyes': [('01', 1)],
 'two': [('01', 1)],
 'worlds': [('01', 1)],
 'home': [('01', 1), ('10', 1), ('16', 1)],
 'alien': [('01', 1)],
 'moons': [('01', 1), ('09', 1)],
 'titan': [('01', 1)],
 'really': [('01', 1)],
 'era.': [('01', 1)],
 'fans': [('01', 1)],
 'devastated.': [('01', 1)],
 'spacecraft': [('01', 1)],
 'will': [('01', 1),
  ('02', 1),
  ('03', 2),
  ('07', 1),
  ('08', 1),
  ('14', 2),
  ('16', 1),
  ('19', 1),
  ('20', 1)],
 'nose-dive': [('01', 1)],
 'saturn': [('01', 2), ('09', 1)],
 'burn': [('01', 1)],
 "planet's": [('01', 1), ('09', 1)],
 'atmosph

In [None]:
# search words

def search(a,b,c):
  """Rank files by how often they contain the words of a query.

  Args:
    a: Query string; words are separated by any run of whitespace and are
      matched case-insensitively (the query is lower-cased).
    b: Index mapping a word to a list of ``(file_num, count)`` tuples.
    c: Iterable of stop words to drop from the query.

  Returns:
    A one-entry dict ``{a: ranking}`` keyed by the query exactly as given,
    where ``ranking`` is a list of ``(file_num, total_count)`` tuples sorted
    by ``total_count`` in descending order. Query words that are absent from
    the index contribute nothing, so the ranking may be empty.
  """
  stop_words = set(c)
  # split() with no argument ignores repeated, leading and trailing whitespace,
  # so no empty-string "words" are produced. dict.fromkeys drops repeated
  # query words while keeping their first-seen order.
  query_terms = dict.fromkeys(
      term for term in a.lower().split() if term not in stop_words)

  joint_frequencies = {}
  for term in query_terms:
    # .get: a word missing from the index is skipped instead of raising KeyError
    for file_num, count in b.get(term, ()):
      joint_frequencies[file_num] = joint_frequencies.get(file_num, 0) + count

  freq = sorted(joint_frequencies.items(), key = lambda item: item[1], reverse = True)

  return {a : freq}


In [None]:
# Prompt for a query, look it up in dict_words (ignoring stop words) and print the resulting dict
print(search(input('Please input words seperated by space: '),dict_words,stopwords_list))

Please input words seperated by space: many years
{'many years': [('16', 2), ('08', 2), ('14', 2), ('18', 2), ('05', 1), ('07', 1), ('12', 1), ('09', 1), ('15', 1)]}
