## Import Modules

In [1]:
import getpass as gp
from pyspark.sql import SparkSession

In [2]:
# Tag the Spark application with the OS user running the notebook so it is
# easy to identify among other applications on the cluster.
user = gp.getuser()
spark = (
    SparkSession.builder
    .appName(f'{user}-campaing-data-analysis')
    .getOrCreate()
)

In [3]:
spark

### Copy the file from the local file system to HDFS

In [25]:
!hadoop fs -ls

Found 2 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-05-01 05:47 .sparkStaging


In [26]:
# Create the target directory; the relative path resolves under the user's HDFS home directory.
# -p creates any missing parent directories and does not fail if the directory already exists.
!hadoop fs -mkdir -p data/input

In [27]:
!hadoop fs -ls data/

Found 1 items
drwxr-xr-x   - itv005077 supergroup          0 2023-05-01 06:03 data/input


In [28]:
!hadoop fs -copyFromLocal /data/trendytech/google-ads-data.csv data/input

In [29]:
!hadoop fs -ls data/input

Found 1 items
-rw-r--r--   3 itv005077 supergroup      95947 2023-05-01 06:04 data/input/google-ads-data.csv


### Read the file from HDFS

In [30]:
# Absolute path of the uploaded CSV under the user's HDFS home directory.
# To read from the local file system instead, prefix the file path with "file://".
INPUT_FILE_NAME = f'/user/{user}/data/input/google-ads-data.csv'

In [31]:
# Load the file as an RDD of strings, one element per line of text.
rdd_file_load = spark.sparkContext.textFile(INPUT_FILE_NAME)

In [32]:
rdd_file_load.take(5)

['big data contents,Broad match,None,TrendyTech Search India,Broad Match #3,1,1,100%,INR,24.06,24.06,0,0,0%,Search',
 'spark training with lab access,Broad match,None,TrendyTech Search India,Broad Match #3,1,2,200%,INR,29.97,59.94,0,0,0%,Search',
 'online hadoop training institutes in hyderabad,Broad match,None,TrendyTech Search India,Broad Match #3,1,1,100%,INR,28.45,28.45,0,0,0%,Search',
 'coursera data analytics,Broad match,None,TrendyTech Search India,Broad Match #3,1,1,100%,INR,24.64,24.64,0,0,0%,Search',
 'ameerpet big data training cost,Broad match,None,TrendyTech Search India,Broad Match #3,2,1,50%,INR,34.86,34.86,0,0,0%,Search']

In [34]:
# Break every line into its list of column values.
# NOTE(review): a plain split assumes no field contains an embedded comma - confirm against the data.
rdd_load_file_split = rdd_file_load.map(lambda line: line.split(','))

In [35]:
rdd_load_file_split.take(2)

[['big data contents',
  'Broad match',
  'None',
  'TrendyTech Search India',
  'Broad Match #3',
  '1',
  '1',
  '100%',
  'INR',
  '24.06',
  '24.06',
  '0',
  '0',
  '0%',
  'Search'],
 ['spark training with lab access',
  'Broad match',
  'None',
  'TrendyTech Search India',
  'Broad Match #3',
  '1',
  '2',
  '200%',
  'INR',
  '29.97',
  '59.94',
  '0',
  '0',
  '0%',
  'Search']]

In [45]:
# Reduce each row to an (amount, text) pair: column 10 parsed as a float and
# column 0 lower-cased so that words differing only in case are merged.
# The amount is placed first because the next step uses flatMapValues, which
# only splits the value part of a pair.
rdd_split_col_rev = rdd_load_file_split.map(
    lambda cols: (float(cols[10]), cols[0].lower())
)

In [46]:
rdd_split_col_rev.take(5)

[(24.06, 'big data contents'),
 (59.94, 'spark training with lab access'),
 (28.45, 'online hadoop training institutes in hyderabad'),
 (24.64, 'coursera data analytics'),
 (34.86, 'ameerpet big data training cost')]

In [47]:
# flatMapValues() splits each value on whitespace and emits one (amount, word)
# pair per word, repeating the original key for every word.
rdd_value_flat_map = rdd_split_col_rev.flatMapValues(lambda text: text.split())

In [48]:
rdd_value_flat_map.take(10)

[(24.06, 'big'),
 (24.06, 'data'),
 (24.06, 'contents'),
 (59.94, 'spark'),
 (59.94, 'training'),
 (59.94, 'with'),
 (59.94, 'lab'),
 (59.94, 'access'),
 (28.45, 'online'),
 (28.45, 'hadoop')]

In [49]:
# Swap each pair back to (word, amount) so that the word becomes the key.
rdd_orig_col_order = rdd_value_flat_map.map(lambda pair: (pair[1], pair[0]))

In [50]:
rdd_orig_col_order.take(10)

[('big', 24.06),
 ('data', 24.06),
 ('contents', 24.06),
 ('spark', 59.94),
 ('training', 59.94),
 ('with', 59.94),
 ('lab', 59.94),
 ('access', 59.94),
 ('online', 28.45),
 ('hadoop', 28.45)]

In [51]:
# Total the amounts of every row in which a word appears.
# The values are floats already, so plain addition yields a float.
rdd_sum_amount = rdd_orig_col_order.reduceByKey(lambda total, amount: total + amount)

In [52]:
rdd_sum_amount.take(10)

[('contents', 24.06),
 ('training', 4099.37),
 ('lab', 135.57999999999998),
 ('online', 3484.42),
 ('hadoop', 4818.34),
 ('institutes', 437.14000000000004),
 ('in', 5774.84),
 ('analytics', 1458.5099999999998),
 ('ameerpet', 184.94),
 ('good', 83.86)]

In [55]:
# Order the words by their summed amount, largest first.
rdd_sort_top_spend = rdd_sum_amount.sortBy(lambda pair: pair[1], ascending=False)

In [56]:
rdd_sort_top_spend.take(20)

[('data', 16394.64),
 ('big', 12889.279999999999),
 ('in', 5774.84),
 ('hadoop', 4818.34),
 ('course', 4191.59),
 ('training', 4099.37),
 ('online', 3484.42),
 ('courses', 2565.7800000000007),
 ('intellipaat', 2081.22),
 ('analytics', 1458.5099999999998),
 ('tutorial', 1383.37),
 ('hyderabad', 1118.1600000000003),
 ('spark', 1078.72),
 ('best', 1047.7),
 ('bangalore', 1039.2699999999998),
 ('and', 985.8),
 ('certification', 967.44),
 ('for', 967.05),
 ('of', 871.4199999999998),
 ('to', 848.3299999999999)]

In [57]:
# Clean up: recursively remove the data directory from the user's HDFS home.
# On this cluster the directory is moved to .Trash rather than deleted immediately.
!hadoop fs -rm -r data

2023-05-01 06:15:41,452 INFO fs.TrashPolicyDefault: Moved: 'hdfs://m01.itversity.com:9000/user/itv005077/data' to trash at: hdfs://m01.itversity.com:9000/user/itv005077/.Trash/Current/user/itv005077/data1682936141425


In [58]:
!hadoop fs -ls

Found 2 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-05-01 05:47 .sparkStaging


In [59]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()

In [60]:
# end of file