In [1]:
# Inspect the local home directory before any working folders are created.
!ls -l ~/

total 4
drwxr-xr-x 3 itv005077 students 4096 Apr 29 05:40 notebooks


In [3]:
# Create the local staging directory; -p also creates the parent and does not fail if it already exists.
!mkdir -p ~/data/input

In [4]:
# Confirm that the input directory now exists under ~/data.
!ls -l ~/data

total 4
drwxr-xr-x 2 itv005077 students 4096 Apr 29 05:42 input


### Go to terminal:
1. cd ~/data/input
2. vi linkedin_views.csv
3. press 'i'
4. paste the following rows:
1,Manasa,Sumit
2,Deepa,Sumit
3,Sumit,Manasa
4,Manasa,Deepa
5,Deepa,Manasa
6,Shilpy,Manasa
5. press 'Esc', then type ':wq!' and press 'Enter' to save and quit


In [6]:
# Verify that the CSV created in the terminal exists at the expected local path.
!ls ~/data/input/linkedin_views.csv

/home/itv005077/data/input/linkedin_views.csv


In [7]:
# Print the file to check that the pasted rows were saved.
!cat ~/data/input/linkedin_views.csv

1,Manasa,Sumit
2,Deepa,Sumit
3,Sumit,Manasa
4,Manasa,Deepa
5,Deepa,Manasa
6,Shilpy,Manasa


In [8]:
# List the HDFS home directory (no path given) before creating anything in it.
!hadoop fs -ls

Found 2 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-04-29 05:40 .sparkStaging


In [9]:
# Create the HDFS input directory; the relative path resolves under the user's HDFS home directory.
!hadoop fs -mkdir -p data/input

In [10]:
# Confirm that data/input now exists in HDFS.
!hadoop fs -ls data

Found 1 items
drwxr-xr-x   - itv005077 supergroup          0 2023-04-29 06:02 data/input


In [11]:
# Upload the local CSV to HDFS. -f overwrites an existing copy so this cell can be
# re-run without failing with "File exists", matching the idempotent mkdir -p cells.
!hadoop fs -copyFromLocal -f ~/data/input/linkedin_views.csv data/input

In [12]:
# Check that the CSV landed in the HDFS input directory.
!hadoop fs -ls data/input

Found 1 items
-rw-r--r--   3 itv005077 supergroup         90 2023-04-29 06:03 data/input/linkedin_views.csv


## Find how many times each person's profile is viewed 

In [13]:
import getpass as gp
from pyspark.sql import SparkSession

In [14]:
# Build (or reuse) a YARN-backed session whose application name is prefixed
# with the current OS login, so the job is easy to attribute to its owner.
user_name = gp.getuser()
spark = (
    SparkSession.builder
    .appName(f'{user_name}-linkedin-view-program')
    .master('yarn')
    .getOrCreate()
)

In [15]:
# Display the session object to confirm it was created.
spark

In [16]:
# Take the SparkContext from the session; it is used below to create RDDs.
sc = spark.sparkContext

In [17]:
# Display the context to confirm it is available.
sc

### Create RDDs

In [19]:
# Base HDFS directory for this exercise, under the current user's /user/<login> directory.
FILE_PATH = f'/user/{user_name}/data'

In [20]:
# Read the uploaded CSV as an RDD with one string element per line of the file.
rdd_file_read = sc.textFile(f'{FILE_PATH}/input/linkedin_views.csv')

In [21]:
# Peek at the first five raw lines.
rdd_file_read.take(5)

['1,Manasa,Sumit',
 '2,Deepa,Sumit',
 '3,Sumit,Manasa',
 '4,Manasa,Deepa',
 '5,Deepa,Manasa']

In [22]:
# Split every CSV line on commas so each element becomes a list of fields.
rdd_split_file_read = rdd_file_read.map(lambda line: line.split(','))

In [23]:
# Preview the first five parsed rows.
rdd_split_file_read.take(5)

[['1', 'Manasa', 'Sumit'],
 ['2', 'Deepa', 'Sumit'],
 ['3', 'Sumit', 'Manasa'],
 ['4', 'Manasa', 'Deepa'],
 ['5', 'Deepa', 'Manasa']]

In [24]:
# Keep only the third field (index 2) of each row: the name this analysis
# counts as the profile that was viewed.
rdd_profile_file_read = rdd_split_file_read.map(lambda fields: fields[2])

In [25]:
# Preview the first five viewed-profile names.
rdd_profile_file_read.take(5)

['Sumit', 'Sumit', 'Manasa', 'Deepa', 'Manasa']

In [27]:
# countByValue() runs the job and returns the number of occurrences of each
# profile name as a dictionary-like object (a defaultdict, as shown below).
output = rdd_profile_file_read.countByValue()

In [28]:
# Show the per-profile view counts.
output

defaultdict(int, {'Sumit': 2, 'Manasa': 3, 'Deepa': 1})

In [29]:
# Report each profile's view count on its own line.
# NOTE(review): the wording reads "1 times" when a profile has a single view.
for profile, no_views in output.items():
    print(f"{profile}'s profile has been viewed {no_views} times")

Sumit's profile has been viewed 2 times
Manasa's profile has been viewed 3 times
Deepa's profile has been viewed 1 times


### Alternate approach: `reduceByKey` instead of `countByValue`

In [30]:
# Pair every viewed profile with a count of 1 so the counts can be summed per name.
rdd_individual_view = rdd_profile_file_read.map(lambda viewed: (viewed, 1))

In [31]:
# Preview the first five (profile, 1) pairs.
rdd_individual_view.take(5)

[('Sumit', 1), ('Sumit', 1), ('Manasa', 1), ('Deepa', 1), ('Manasa', 1)]

In [32]:
# Add up the 1s for each profile name to get its total number of views.
rdd_reduce_individual_view = rdd_individual_view.reduceByKey(lambda left, right: left + right)

In [33]:
# Preview up to five aggregated (profile, views) pairs.
rdd_reduce_individual_view.take(5)

[('Sumit', 2), ('Deepa', 1), ('Manasa', 3)]

In [34]:
# collect() returns every (profile, views) pair as a Python list.
# Note: this rebinds `output`, which previously held the countByValue() dictionary.
output = rdd_reduce_individual_view.collect()

In [36]:
# Show the collected (profile, views) pairs.
output

[('Sumit', 2), ('Deepa', 1), ('Manasa', 3)]

In [37]:
# Report each profile's view count; here `output` is a list of (profile, views) tuples.
# NOTE(review): the wording reads "1 times" when a profile has a single view.
for profile, no_views in output:
    print(f"{profile}'s profile has been viewed {no_views} times")

Sumit's profile has been viewed 2 times
Deepa's profile has been viewed 1 times
Manasa's profile has been viewed 3 times


In [38]:
# Release the cluster resources held by this application.
# SparkSession.stop() also stops the underlying SparkContext (`sc`),
# so a separate sc.stop() call is not needed.
spark.stop()

### Delete the working directories from both the local file system and HDFS

In [39]:
# Check what is in the local working directory before removing it.
!ls -l ~/data

total 4
drwxr-xr-x 2 itv005077 students 4096 Apr 29 06:01 input


In [40]:
# Remove the local working directory and everything inside it.
!rm -r ~/data

In [42]:
# Confirm that ~/data no longer appears in the home directory.
!ls -l ~/

total 4
drwxr-xr-x 3 itv005077 students 4096 Apr 29 06:17 notebooks


In [43]:
# List the HDFS home directory; the data directory created earlier is still there.
!hadoop fs -ls 

Found 3 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-04-29 06:18 .sparkStaging
drwxr-xr-x   - itv005077 supergroup          0 2023-04-29 06:02 data


In [44]:
# Recursively remove the HDFS data directory. As the log line below shows,
# it is moved into .Trash rather than deleted outright.
!hadoop fs -rm -r data

2023-04-29 06:19:49,657 INFO fs.TrashPolicyDefault: Moved: 'hdfs://m01.itversity.com:9000/user/itv005077/data' to trash at: hdfs://m01.itversity.com:9000/user/itv005077/.Trash/Current/user/itv005077/data1682763589631


In [45]:
# Confirm that data no longer appears in the HDFS home directory.
!hadoop fs -ls

Found 2 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-04-29 06:18 .sparkStaging
