## <b><u>Import the Modules</u></b>

In [1]:
import getpass as gp
from pyspark.sql import SparkSession

In [2]:
# Tag the application name with the current OS user.
user_name = gp.getuser()
spark = (
    SparkSession.builder
    .appName(f'{user_name}-word-count-program')
    .master('yarn')
    .getOrCreate()  # reuses an existing session if there is one, otherwise creates it
)

In [3]:
spark

In [4]:
# Grab the SparkContext behind the session; the RDD calls in this notebook (e.g. sc.textFile) go through it.
sc = spark.sparkContext

In [5]:
sc

### Copy Files:

* From /data/trendytech to local home directory
* From local home directory to hdfs home directory

In [6]:
!ls -l ~/

total 4
drwxr-xr-x 3 itv005077 students 4096 Apr 29 05:39 notebooks


In [7]:
!mkdir -p ~/data/trendytech

In [8]:
!ls -l ~/data/trendytech

total 0


In [9]:
# Recursively copies the whole dataset directory into ~/data, although only search_data.txt is used later in this notebook.
!cp -r /data/trendytech ~/data

In [10]:
!ls -l ~/data/trendytech

total 415600
-rw-r--r-- 1 itv005077 students 365001114 Apr 29 05:40 bigLog.txt
-rw-r--r-- 1 itv005077 students     79180 Apr 29 05:40 boringwords.txt
-rw-r--r-- 1 itv005077 students    136855 Apr 29 05:40 customer-orders.csv
-rw-r--r-- 1 itv005077 students      8254 Apr 29 05:40 friends-data.csv
-rw-r--r-- 1 itv005077 students     95947 Apr 29 05:40 google-ads-data.csv
-rw-r--r-- 1 itv005077 students      5812 Apr 29 05:40 kv1.txt
drwxr-xr-x 2 itv005077 students      4096 Apr 29 05:40 mapreduce_jars
-rw-r--r-- 1 itv005077 students       159 Apr 29 05:40 samplefile.txt
-rw-r--r-- 1 itv005077 students     18322 Apr 29 05:40 search_data.txt
-rwxr-xr-x 1 itv005077 students  60201900 Apr 29 05:40 students.csv


In [11]:
# Same listing through the absolute path. IPython expands {user_name} before running the
# command, so the cell is no longer tied to one account (assumes home directories live under /home).
!ls -l /home/{user_name}/data/trendytech/

total 415600
-rw-r--r-- 1 itv005077 students 365001114 Apr 29 05:40 bigLog.txt
-rw-r--r-- 1 itv005077 students     79180 Apr 29 05:40 boringwords.txt
-rw-r--r-- 1 itv005077 students    136855 Apr 29 05:40 customer-orders.csv
-rw-r--r-- 1 itv005077 students      8254 Apr 29 05:40 friends-data.csv
-rw-r--r-- 1 itv005077 students     95947 Apr 29 05:40 google-ads-data.csv
-rw-r--r-- 1 itv005077 students      5812 Apr 29 05:40 kv1.txt
drwxr-xr-x 2 itv005077 students      4096 Apr 29 05:40 mapreduce_jars
-rw-r--r-- 1 itv005077 students       159 Apr 29 05:40 samplefile.txt
-rw-r--r-- 1 itv005077 students     18322 Apr 29 05:40 search_data.txt
-rwxr-xr-x 1 itv005077 students  60201900 Apr 29 05:40 students.csv


In [12]:
!hadoop fs -ls 

Found 2 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-04-29 05:39 .sparkStaging


In [13]:
!hadoop fs -mkdir -p data/trendytech

In [14]:
!hadoop fs -ls data/trendytech

In [15]:
# -f overwrites an existing copy, so re-running this cell does not fail because the file is already in HDFS.
!hadoop fs -copyFromLocal -f ~/data/trendytech/search_data.txt data/trendytech

In [16]:
!hadoop fs -ls data/trendytech

Found 1 items
-rw-r--r--   3 itv005077 supergroup      18322 2023-04-29 05:40 data/trendytech/search_data.txt


## Create RDDs

In [17]:
# Base data directory under this user's /user/<user_name> directory in HDFS.
FILE_PATH = f'/user/{user_name}/data'

In [18]:
# Read the search terms from HDFS as an RDD with one element per line of the file.
rdd_file_input = sc.textFile(f'{FILE_PATH}/trendytech/search_data.txt')

#### Quick check of the file input RDD format

In [19]:
rdd_file_input.take(5)

['best big data hadoop spark training',
 'kelly technologies courses',
 'intellepaat',
 'cloudera training partner india',
 'microsoft certification courses in big data']

### Split each item in the list on whitespace

e.g. Item[0] = 'best big data hadoop spark training' would be split into a list ['best', 'big', 'data', 'hadoop', 'spark', 'training']

In [20]:
# Turn every line into the list of its whitespace-separated words.
rdd_split_file_input = rdd_file_input.map(lambda line: line.split())

In [21]:
rdd_split_file_input.take(5)

[['best', 'big', 'data', 'hadoop', 'spark', 'training'],
 ['kelly', 'technologies', 'courses'],
 ['intellepaat'],
 ['cloudera', 'training', 'partner', 'india'],
 ['microsoft', 'certification', 'courses', 'in', 'big', 'data']]

### Flatten the list of lists -> list

In [22]:
# An identity flatMap unrolls each per-line word list into individual word elements.
rdd_flat_file_input = rdd_split_file_input.flatMap(lambda words: words)

In [23]:
rdd_flat_file_input.take(10)

['best',
 'big',
 'data',
 'hadoop',
 'spark',
 'training',
 'kelly',
 'technologies',
 'courses',
 'intellepaat']

### Instead of writing it as two transformations, the same thing can be achieved as below:

In [24]:
# Split and flatten in a single pass: flatMap emits each word of a line as its own element.
rdd_flat_file_input = rdd_file_input.flatMap(lambda line: line.split())

In [25]:
rdd_flat_file_input.take(10)

['best',
 'big',
 'data',
 'hadoop',
 'spark',
 'training',
 'kelly',
 'technologies',
 'courses',
 'intellepaat']

### Approach-1:

* using reduceByKey()

In [26]:
# Pair every word with an initial count of 1, ready to be summed per key.
rdd_map_file_input = rdd_flat_file_input.map(lambda word: (word, 1))

In [27]:
rdd_map_file_input.take(10)

[('best', 1),
 ('big', 1),
 ('data', 1),
 ('hadoop', 1),
 ('spark', 1),
 ('training', 1),
 ('kelly', 1),
 ('technologies', 1),
 ('courses', 1),
 ('intellepaat', 1)]

In [28]:
# Add up the 1s of identical words to get one (word, total) pair per distinct word.
rdd_reduce_map_file_input = rdd_map_file_input.reduceByKey(
    lambda total, increment: total + increment
)

In [29]:
rdd_reduce_map_file_input.take(10)

[('best', 29),
 ('hadoop', 96),
 ('training', 111),
 ('intellepaat', 1),
 ('microsoft', 3),
 ('in', 169),
 ('online', 58),
 ('preparation', 1),
 ('mining', 2),
 ('programs', 4)]

### Save the output to a file

In [30]:
# Write the (word, count) pairs to HDFS as text part files under the given directory.
# NOTE(review): this presumably fails if the output directory already exists — confirm, and remove it before re-running.
rdd_reduce_map_file_input.saveAsTextFile(f'{FILE_PATH}/trendytech/output/word-count')

In [31]:
!hadoop fs -ls data/trendytech/output/word-count

Found 3 items
-rw-r--r--   3 itv005077 supergroup          0 2023-04-29 05:40 data/trendytech/output/word-count/_SUCCESS
-rw-r--r--   3 itv005077 supergroup       4687 2023-04-29 05:40 data/trendytech/output/word-count/part-00000
-rw-r--r--   3 itv005077 supergroup       5056 2023-04-29 05:40 data/trendytech/output/word-count/part-00001


### File exploration for a quick check

In [32]:
!hadoop fs -head data/trendytech/output/word-count/part-00000

('best', 29)
('hadoop', 96)
('training', 111)
('intellepaat', 1)
('microsoft', 3)
('in', 169)
('online', 58)
('preparation', 1)
('mining', 2)
('programs', 4)
('module', 1)
('1', 5)
('PYTHON', 2)
('delhi', 8)
('HADOOP', 4)
('analytics', 35)
('professional', 2)
('institute', 19)
('good', 2)
('questions', 3)
('contents', 1)
('b', 3)
('sc', 1)
('third', 1)
('class', 6)
('of', 27)
('master', 4)
('basics', 2)
('days', 2)
('is', 12)
('starting', 1)
('5www', 1)
('im', 1)
('cloudx', 2)
('lab', 4)
('credentials', 1)
('coarse', 1)
('ecosystems', 1)
('intoduction', 1)
('flair', 4)
('www', 4)
('intellipaat', 10)
('pass', 2)
('do', 2)
('hacking', 1)
('congitive', 1)
('sheet', 1)
('groups', 1)
('analyst', 11)
('python', 8)
('bigdata', 19)
('linkedin', 1)
('centres', 2)
('project', 3)
('title', 1)
('engineering', 9)
('coaching', 6)
('engineer', 9)
('eligibility', 3)
('scala', 10)
('chahie', 1)
('month', 1)
('2020', 5)
('20', 2)
('placement', 11)
('full', 4)
('stack', 1)
('thane', 1)
('congnitive', 1)


### Check the block information of the output file

In [33]:
# Build the path from FILE_PATH (interpolated by IPython) instead of hard-coding one user's HDFS directory.
!hdfs fsck {FILE_PATH}/trendytech/output/word-count/part-00001 -files -blocks -locations

Connecting to namenode via http://m01.itversity.com:9870/fsck?ugi=itv005077&files=1&blocks=1&locations=1&path=%2Fuser%2Fitv005077%2Fdata%2Ftrendytech%2Foutput%2Fword-count%2Fpart-00001
FSCK started by itv005077 (auth:SIMPLE) from /172.16.1.102 for path /user/itv005077/data/trendytech/output/word-count/part-00001 at Sat Apr 29 05:40:31 EDT 2023

/user/itv005077/data/trendytech/output/word-count/part-00001 5056 bytes, replicated: replication=3, 1 block(s):  OK
0. BP-1685381103-172.16.1.103-1609223169030:blk_1084778273_11045255 len=5056 Live_repl=3  [DatanodeInfoWithStorage[172.16.1.105:9866,DS-cd1d8ab0-7d77-4607-98bf-961a7ad81f45,DISK], DatanodeInfoWithStorage[172.16.1.107:9866,DS-53639da4-6786-42af-a4a6-5021150dddf3,DISK], DatanodeInfoWithStorage[172.16.1.106:9866,DS-3cdd1a86-1122-4b3f-9d9d-c9fe36cab433,DISK]]


Status: HEALTHY
 Number of data-nodes:	3
 Number of racks:		1
 Total dirs:			0
 Total symlinks:		0

Replicated Blocks:
 Total size:	5056 B
 Total files:	1
 Total blocks (validat

### Collect the rdd into a local variable and print it in a desired format
* collect() to capture the entire rdd into a local variable
* take(n) to capture n items from rdd into a local variable

In [34]:
# take(10) brings only the first 10 (word, count) pairs back to the driver as a Python list.
output = rdd_reduce_map_file_input.take(10)

In [35]:
# Tuple unpacking: each element of `output` is a (word, count) pair.
# Size the word column to the longest word, so entries longer than a fixed width
# (e.g. 'intellepaat', 'preparation') no longer push the ':' separator out of line.
word_width = max((len(word) for word, _ in output), default=0)
for word, count in output:
    # Right-align the word in the computed width; left-align the count in 5 columns.
    print(f"{word: >{word_width}}:{count: <5}")

      best:29   
    hadoop:96   
  training:111  
intellepaat:1    
 microsoft:3    
        in:169  
    online:58   
preparation:1    
    mining:2    
  programs:4    


### Approach-2:

* using the countByValue() action, which brings the counts of all distinct values back to the driver as a local dictionary.

**May result in an Out Of Memory error when the number of distinct values is large.**

In [36]:
# Order the words by plain string comparison (symbols, digits and upper case sort before lower case).
rdd_sort_flat_file_input = rdd_flat_file_input.sortBy(lambda word: word)

In [37]:
rdd_sort_flat_file_input.take(10)

['*', '1', '1', '1', '1', '1', '10', '100', '1000', '101']

In [38]:
# countByValue() is an action: the per-word counts come back to the driver as a local dict.
# NOTE: this rebinds `output`, which previously held the list returned by take(10).
output = rdd_sort_flat_file_input.countByValue()

In [39]:
output

defaultdict(int,
            {'*': 1,
             '1': 5,
             '10': 1,
             '100': 1,
             '1000': 1,
             '101': 1,
             '12': 2,
             '13': 1,
             '175': 1,
             '1st': 1,
             '1ywar': 1,
             '2': 4,
             '2.4': 1,
             '20': 2,
             '2020': 5,
             '21': 1,
             '3': 3,
             '3000': 1,
             '4': 2,
             '5': 1,
             '5www': 1,
             'ARCHITECTURE': 1,
             'BANGALORE': 2,
             'BIG': 6,
             'BIGDATA': 1,
             'COURSE': 1,
             'DATA': 7,
             'DELHI': 1,
             'HADOOP': 4,
             'IN': 2,
             'MICROSOFT': 1,
             'PGP': 1,
             'PYTHON': 2,
             'SCALA': 1,
             'SIMPLILEARN': 1,
             'STUDENT': 1,
             'TRAINING': 3,
             'UDACITY': 1,
             '_': 2,
             'a': 5,
             'about

### <b><u>*** Remember to stop the SparkContext and SparkSession ***</u></b>

In [40]:
# Shut down the Spark application started at the top of the notebook.
# spark.stop() also stops the underlying SparkContext, so the explicit sc.stop() is redundant but harmless.
sc.stop()
spark.stop()

### Deleting working directories from both local and hdfs

In [41]:
!ls -l ~/

total 8
drwxr-xr-x 3 itv005077 students 4096 Apr 29 05:39 data
drwxr-xr-x 3 itv005077 students 4096 Apr 29 05:39 notebooks


In [42]:
!rm -r ~/data

In [43]:
!ls -l ~/

total 4
drwxr-xr-x 3 itv005077 students 4096 Apr 29 05:39 notebooks


In [44]:
!hadoop fs -ls 

Found 3 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-04-29 05:40 .sparkStaging
drwxr-xr-x   - itv005077 supergroup          0 2023-04-29 05:40 data


In [45]:
# Without -skipTrash the directory is moved into the user's HDFS .Trash rather than deleted outright.
!hadoop fs -rm -r data

2023-04-29 05:40:36,868 INFO fs.TrashPolicyDefault: Moved: 'hdfs://m01.itversity.com:9000/user/itv005077/data' to trash at: hdfs://m01.itversity.com:9000/user/itv005077/.Trash/Current/user/itv005077/data1682761236852


In [46]:
!hadoop fs -ls

Found 2 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-04-29 05:40 .sparkStaging
