## <i><u>Import the Packages</u></i>

In [1]:
import getpass as gp
from pyspark.sql import SparkSession

In [2]:
# Name the application after the current OS user so it is easy to find in the YARN UI.
user_name = gp.getuser()

# Parenthesised builder chain: reuses an existing session if one is already running.
spark = (
    SparkSession.builder
    .appName(f"{user_name}-word-count")
    .master('yarn')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook displays the session object as the cell output.
spark

In [4]:
# The RDD API used below hangs off the SparkContext, so keep a short handle to it.
sc = spark.sparkContext

In [5]:
# Bare expression: the notebook displays the context object as the cell output.
sc

### Copy Files:

* From /data/trendytech to local home directory
* From local home directory to hdfs home directory

In [6]:
# List the local home directory before anything is copied into it.
!ls -l ~/

total 4
drwxr-xr-x 3 itv005077 students 4096 Apr 22 12:22 notebooks


In [7]:
!mkdir ~/data
!ls -l ~/

total 8
drwxr-xr-x 2 itv005077 students 4096 Apr 22 12:28 data
drwxr-xr-x 3 itv005077 students 4096 Apr 22 12:22 notebooks


In [8]:
# Recursively copy the shared dataset directory into ~/data ...
!cp -r /data/trendytech ~/data
# ... and list ~/data to confirm the copy.
!ls -l ~/data

total 4
drwxr-xr-x 3 itv005077 students 4096 Apr 22 12:28 trendytech


In [9]:
!ls -l /home/itv005077/data/trendytech/

total 415600
-rw-r--r-- 1 itv005077 students 365001114 Apr 22 12:28 bigLog.txt
-rw-r--r-- 1 itv005077 students     79180 Apr 22 12:28 boringwords.txt
-rw-r--r-- 1 itv005077 students    136855 Apr 22 12:28 customer-orders.csv
-rw-r--r-- 1 itv005077 students      8254 Apr 22 12:28 friends-data.csv
-rw-r--r-- 1 itv005077 students     95947 Apr 22 12:28 google-ads-data.csv
-rw-r--r-- 1 itv005077 students      5812 Apr 22 12:28 kv1.txt
drwxr-xr-x 2 itv005077 students      4096 Apr 22 12:28 mapreduce_jars
-rw-r--r-- 1 itv005077 students       159 Apr 22 12:28 samplefile.txt
-rw-r--r-- 1 itv005077 students     18322 Apr 22 12:28 search_data.txt
-rwxr-xr-x 1 itv005077 students  60201900 Apr 22 12:28 students.csv


In [10]:
# No path argument is given, so this lists the current user's HDFS home directory.
!hadoop fs -ls 

Found 2 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-04-22 12:28 .sparkStaging


In [11]:
!hadoop fs -mkdir data
!hadoop fs -copyFromLocal ~/data/trendytech/search_data.txt data

In [12]:
# Confirm the file landed in the HDFS data/ directory.
!hadoop fs -ls data/

Found 1 items
-rw-r--r--   3 itv005077 supergroup      18322 2023-04-22 12:28 data/search_data.txt


## Create RDDs

In [13]:
# Build the HDFS path from the current user instead of hard-coding one account name,
# so the notebook runs unchanged for any user. Each line of the file becomes one record.
file_input_rdd = sc.textFile(f'/user/{user_name}/data/search_data.txt')

#### Check the format of the input RDD

In [14]:
# Peek at the first 5 records to see what one element of the RDD looks like.
file_input_rdd.take(5)

['best big data hadoop spark training',
 'kelly technologies courses',
 'intellepaat',
 'cloudera training partner india',
 'microsoft certification courses in big data']

### Split each line into a list of words on whitespace

e.g. Item[0] = 'best big data hadoop spark training' would be split into a list ['best', 'big', 'data', 'hadoop', 'spark', 'training']

In [15]:
# Tokenise every line: each record becomes the list of its whitespace-separated words.
split_input_rdd = file_input_rdd.map(lambda line: line.split())

In [16]:
# Peek at the first 5 records: each should now be a list of words.
split_input_rdd.take(5)

[['best', 'big', 'data', 'hadoop', 'spark', 'training'],
 ['kelly', 'technologies', 'courses'],
 ['intellepaat'],
 ['cloudera', 'training', 'partner', 'india'],
 ['microsoft', 'certification', 'courses', 'in', 'big', 'data']]

### Flatten the list of lists

In [17]:
# Unnest: emit every word of each per-line list as its own record.
flattern_input_rdd = split_input_rdd.flatMap(lambda words: words)

In [18]:
# Peek at the first 10 records: each should now be a single word.
flattern_input_rdd.take(10)

['best',
 'big',
 'data',
 'hadoop',
 'spark',
 'training',
 'kelly',
 'technologies',
 'courses',
 'intellepaat']

### Instead of two transformations, the same result can be achieved with a single `flatMap`:

In [19]:
# Split and flatten in one pass: every word of every line becomes its own record.
mod_flattern_input_rdd = file_input_rdd.flatMap(lambda line: line.split())

In [20]:
# Peek at the first 10 words to compare with the two-step result.
mod_flattern_input_rdd.take(10)

['best',
 'big',
 'data',
 'hadoop',
 'spark',
 'training',
 'kelly',
 'technologies',
 'courses',
 'intellepaat']

### Approach-1:

* Let `reduceByKey()` handle the shuffle, the grouping by key and the reduce step in a single transformation.

In [21]:
# Pair every word with an initial count of 1, giving (word, 1) key/value records.
reduce_map_input = mod_flattern_input_rdd.map(lambda word: (word, 1))

In [22]:
# Peek at the first 10 (word, 1) pairs.
reduce_map_input.take(10)

[('best', 1),
 ('big', 1),
 ('data', 1),
 ('hadoop', 1),
 ('spark', 1),
 ('training', 1),
 ('kelly', 1),
 ('technologies', 1),
 ('courses', 1),
 ('intellepaat', 1)]

In [23]:
# Sum the 1s per word: values sharing a key are combined pairwise by addition.
reduce_map_out = reduce_map_input.reduceByKey(lambda left, right: left + right)

In [24]:
# Peek at the first 10 (word, total) pairs.
reduce_map_out.take(10)

[('best', 29),
 ('hadoop', 96),
 ('training', 111),
 ('intellepaat', 1),
 ('microsoft', 3),
 ('in', 169),
 ('online', 58),
 ('preparation', 1),
 ('mining', 2),
 ('programs', 4)]

In [25]:
# Bring every (word, count) pair back to the driver as a Python list.
output_var = reduce_map_out.collect()

In [26]:
# Size the word column from the data so the colons line up. A fixed width of 10
# let longer words (e.g. 'intellepaat', 'professional') push their row out of
# alignment. The floor of 10 keeps the previous layout for short vocabularies,
# and default=0 keeps the cell working when there are no results at all.
word_width = max(10, max((len(word) for word, _ in output_var), default=0))

for word, count in output_var:
    # Right-align the word, left-align the count in a 5-character field.
    print(f"{word: >{word_width}}:{count: <5}")

      best:29   
    hadoop:96   
  training:111  
intellepaat:1    
 microsoft:3    
        in:169  
    online:58   
preparation:1    
    mining:2    
  programs:4    
    module:1    
         1:5    
    PYTHON:2    
     delhi:8    
    HADOOP:4    
 analytics:35   
professional:2    
 institute:19   
      good:2    
 questions:3    
  contents:1    
         b:3    
        sc:1    
     third:1    
     class:6    
        of:27   
    master:4    
    basics:2    
      days:2    
        is:12   
  starting:1    
      5www:1    
        im:1    
    cloudx:2    
       lab:4    
credentials:1    
    coarse:1    
ecosystems:1    
intoduction:1    
     flair:4    
       www:4    
intellipaat:10   
      pass:2    
        do:2    
   hacking:1    
 congitive:1    
     sheet:1    
    groups:1    
   analyst:11   
    python:8    
   bigdata:19   
  linkedin:1    
   centres:2    
   project:3    
     title:1    
engineering:9    
  coaching:6    
  engineer:9    
eligib

### Approach-2:

* Sort the words first (which triggers the shuffle), then do the counting as a separate step with `countByValue()`.

In [27]:
# Order the words using the word itself as the sort key.
sort_input_rdd = mod_flattern_input_rdd.sortBy(lambda word: word)

In [28]:
# Peek at the first 10 words in sorted order.
sort_input_rdd.take(10)

['*', '1', '1', '1', '1', '1', '10', '100', '1000', '101']

In [29]:
# Count how many times each distinct word occurs; the result is a mapping of
# word -> count (its .items() is used in the following cells).
output_var1 = sort_input_rdd.countByValue()

In [30]:
# Inspect the (word, count) pairs held in the mapping.
output_var1.items()

dict_items([('*', 1), ('1', 5), ('10', 1), ('100', 1), ('1000', 1), ('101', 1), ('12', 2), ('13', 1), ('175', 1), ('1st', 1), ('1ywar', 1), ('2', 4), ('2.4', 1), ('20', 2), ('2020', 5), ('21', 1), ('3', 3), ('3000', 1), ('4', 2), ('5', 1), ('5www', 1), ('ARCHITECTURE', 1), ('BANGALORE', 2), ('BIG', 6), ('BIGDATA', 1), ('COURSE', 1), ('DATA', 7), ('DELHI', 1), ('HADOOP', 4), ('IN', 2), ('MICROSOFT', 1), ('PGP', 1), ('PYTHON', 2), ('SCALA', 1), ('SIMPLILEARN', 1), ('STUDENT', 1), ('TRAINING', 3), ('UDACITY', 1), ('_', 2), ('a', 5), ('about', 1), ('acadamy', 1), ('academy', 5), ('acadgild', 3), ('acadglid', 1), ('access', 1), ('acharya', 1), ('acknowledge', 1), ('action', 1), ('adminstartor', 1), ('advance', 1), ('affiliations', 1), ('after', 1), ('algorithm', 1), ('all', 1), ('am', 1), ('ameerpet', 6), ('amity', 1), ('amounts', 1), ('an', 3), ('analysis', 5), ('analyst', 11), ('analytic', 1), ('analytical', 1), ('analytics', 35), ('analytixlabs', 1), ('and', 22), ('anlyst', 1), ('answer'

In [31]:
# Size the word column from the data so the colons line up. A fixed width of 10
# let longer words (e.g. 'ARCHITECTURE', 'SIMPLILEARN') push their row out of
# alignment. The floor of 10 keeps the previous layout for short vocabularies,
# and default=0 keeps the cell working when the mapping is empty.
word_width = max(10, max((len(word) for word in output_var1), default=0))

for word, count in output_var1.items():
    # Right-align the word, left-align the count in a 10-character field.
    print(f"{word: >{word_width}}:{count: <10}")

         *:1         
         1:5         
        10:1         
       100:1         
      1000:1         
       101:1         
        12:2         
        13:1         
       175:1         
       1st:1         
     1ywar:1         
         2:4         
       2.4:1         
        20:2         
      2020:5         
        21:1         
         3:3         
      3000:1         
         4:2         
         5:1         
      5www:1         
ARCHITECTURE:1         
 BANGALORE:2         
       BIG:6         
   BIGDATA:1         
    COURSE:1         
      DATA:7         
     DELHI:1         
    HADOOP:4         
        IN:2         
 MICROSOFT:1         
       PGP:1         
    PYTHON:2         
     SCALA:1         
SIMPLILEARN:1         
   STUDENT:1         
  TRAINING:3         
   UDACITY:1         
         _:2         
         a:5         
     about:1         
   acadamy:1         
   academy:5         
  acadgild:3         
  acadglid:1         
    acc

### Save the output to a file

In [32]:
reduce_map_out.saveAsTextFile('/user/itv005077/output/word-count')

In [33]:
# List the files written to the HDFS output directory.
!hadoop fs -ls output/word-count

Found 3 items
-rw-r--r--   3 itv005077 supergroup          0 2023-04-22 12:28 output/word-count/_SUCCESS
-rw-r--r--   3 itv005077 supergroup       4687 2023-04-22 12:28 output/word-count/part-00000
-rw-r--r--   3 itv005077 supergroup       5056 2023-04-22 12:28 output/word-count/part-00001


In [34]:
# Show the beginning of the first part file to check the saved record format.
!hadoop fs -head output/word-count/part-00000

('best', 29)
('hadoop', 96)
('training', 111)
('intellepaat', 1)
('microsoft', 3)
('in', 169)
('online', 58)
('preparation', 1)
('mining', 2)
('programs', 4)
('module', 1)
('1', 5)
('PYTHON', 2)
('delhi', 8)
('HADOOP', 4)
('analytics', 35)
('professional', 2)
('institute', 19)
('good', 2)
('questions', 3)
('contents', 1)
('b', 3)
('sc', 1)
('third', 1)
('class', 6)
('of', 27)
('master', 4)
('basics', 2)
('days', 2)
('is', 12)
('starting', 1)
('5www', 1)
('im', 1)
('cloudx', 2)
('lab', 4)
('credentials', 1)
('coarse', 1)
('ecosystems', 1)
('intoduction', 1)
('flair', 4)
('www', 4)
('intellipaat', 10)
('pass', 2)
('do', 2)
('hacking', 1)
('congitive', 1)
('sheet', 1)
('groups', 1)
('analyst', 11)
('python', 8)
('bigdata', 19)
('linkedin', 1)
('centres', 2)
('project', 3)
('title', 1)
('engineering', 9)
('coaching', 6)
('engineer', 9)
('eligibility', 3)
('scala', 10)
('chahie', 1)
('month', 1)
('2020', 5)
('20', 2)
('placement', 11)
('full', 4)
('stack', 1)
('thane', 1)
('congnitive', 1)


### <b><u>Remember to stop the SparkContext and SparkSession</u></b>

In [35]:
# Shut down the SparkContext first, then the SparkSession it was obtained from,
# so the application does not keep running after the notebook is finished.
sc.stop()
spark.stop()