In [1]:
# start-dfs starts hadoop nodes
# for it to work properly, Hadoop must be setup correctly

!start-dfs

# Question 1

In [2]:
# -mkdir -> creates folder if does not exist

!hdfs dfs -mkdir /myfiles

mkdir: `/myfiles': File exists


In [3]:
# -ls -> lists contents of directory, / means root directory
# use it to see that folder is created correctly

!hdfs dfs -ls /

Found 1 items
drwxr-xr-x   - ilyas supergroup          0 2022-08-23 15:40 /myfiles


# Question 2.A

In [4]:
# -touch -> create file in hdfs, type full path of file that you want to create

!hdfs dfs -touch /myfiles/any_file.py

In [5]:
!hdfs dfs -ls /myfiles

Found 2 items
-rw-r--r--   1 ilyas supergroup          0 2022-08-23 15:45 /myfiles/any_file.py
-rw-r--r--   1 ilyas supergroup          0 2022-08-23 15:40 /myfiles/any_file.r


# Question 2.B

In [6]:
# -rm -> removes file

!hdfs dfs -rm /myfiles/any_file.py

Deleted /myfiles/any_file.py


In [7]:
!hdfs dfs -ls /myfiles

Found 1 items
-rw-r--r--   1 ilyas supergroup          0 2022-08-23 15:40 /myfiles/any_file.r


# Question 3

In [8]:
# write your job class definition in a string
# so that it can be moved to a seperate file
# Reason is that jobs can only be run from another files

job_as_str = """

from mrjob.job import MRJob

class MRExtreme(MRJob):
    def mapper(self, keys_, line):
        comma_sep = line.split(",")
        gender = comma_sep[1]
        gender = gender[0]
        space_sep = line.split()
        n_project = space_sep[1]
        yield gender, int(n_project)

class MRMax(MRExtreme):
    def reducer(self, keys_, values):
        yield keys_, ("Max: ", max(values))

class MRMin(MRExtreme):
    def reducer(self, keys_, values):
        yield keys_, ("Min: ", min(values))

if __name__ == "__main__":
    MRMax().run()
    MRMin().run()
        
"""

In [9]:
# put job string to python file so that it can be run

with open("MRExtreme.py", "w") as f:
    print(job_as_str, file=f)

In [36]:
# now, job definition is in seperate .py file,
# it can be run by giving projects.txt as input

!python MRExtreme.py projects.txt

"f"	["Max: ", 8]
"m"	["Max: ", 9]
"f"	["Min: ", 2]
"m"	["Min: ", 0]


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\ilyas\AppData\Local\Temp\MRExtreme.ilyas.20220823.113355.216414
Running step 1 of 1...
job output is in C:\Users\ilyas\AppData\Local\Temp\MRExtreme.ilyas.20220823.113355.216414\output
Streaming final output from C:\Users\ilyas\AppData\Local\Temp\MRExtreme.ilyas.20220823.113355.216414\output...
Removing temp directory C:\Users\ilyas\AppData\Local\Temp\MRExtreme.ilyas.20220823.113355.216414...
No configs found; falling back on auto-configuration
No configs found; falling back on auto-configuration
No configs specified for inline runner
No configs specified for inline runner
Creating temp directory C:\Users\ilyas\AppData\Local\Temp\MRExtreme.ilyas.20220823.113355.411755
Creating temp directory C:\Users\ilyas\AppData\Local\Temp\MRExtreme.ilyas.20220823.113355.411755
Running step 1 of 1...
Running step 1 of 1...
job output is in C:\Users\ilyas\AppData\Local\Temp\MRExt

# Question 4

In [10]:
job_as_str = """

import mrjob.job as job

class MRAverager(job.MRJob):

    def mapper(self, keys, line):
        comma_sep = line.split(",")
        tab_sep = line.split()
        city = comma_sep[1][:-3]
        n_people = int(tab_sep[1])
        yield city, (n_people, 1)

    def reducer(self, keys, values):
        sum_n_people = 0
        sum_count = 0
        for n_people, count in values:
            sum_n_people += n_people
            sum_count += count
        yield keys, sum_n_people / sum_count


if __name__ == "__main__":
    MRAverager().run()

"""

In [11]:
with open("MRAverager.py", "w") as f:
    print(job_as_str, file=f)

In [12]:
# give demography.txt as input

!python MRAverager.py demography.txt

"Baku"	27.0
"Ganja"	26.6
"Sumgayit"	31.8


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\ilyas\AppData\Local\Temp\MRAverager.ilyas.20220823.114623.456658
Running step 1 of 1...
job output is in C:\Users\ilyas\AppData\Local\Temp\MRAverager.ilyas.20220823.114623.456658\output
Streaming final output from C:\Users\ilyas\AppData\Local\Temp\MRAverager.ilyas.20220823.114623.456658\output...
Removing temp directory C:\Users\ilyas\AppData\Local\Temp\MRAverager.ilyas.20220823.114623.456658...


# Question 5

In [13]:
# create r file in myfiles with "-touch"

!hdfs dfs -touch /myfiles/any_file.r

In [14]:
!hdfs dfs -ls /myfiles

Found 1 items
-rw-r--r--   1 ilyas supergroup          0 2022-08-23 15:46 /myfiles/any_file.r


In [15]:
# -copyToLocal copies file in HDFS to local
# first argument to it is path of file in HDFS: "/myfiles/any_file.r"
# second argument is destination in local to copy the file: "." means current directory

!hdfs dfs -copyToLocal /myfiles/any_file.r .

copyToLocal: `any_file.r': File exists


In [16]:
# to see that file is in current directory
!dir

 Volume in drive C has no label.
 Volume Serial Number is 9C99-77C9

 Directory of C:\Users\ilyas\Desktop\test

23/08/2022  15:45    <DIR>          .
23/08/2022  15:45    <DIR>          ..
23/08/2022  15:03    <DIR>          .ipynb_checkpoints
23/08/2022  15:42                 0 any_file.r
23/08/2022  15:45            11,887 bd10_cs.ipynb
23/08/2022  14:18            12,733 bd12_cs.ipynb
23/08/2022  15:39               291 demography.txt
23/08/2022  15:46               577 MRAverager.py
23/08/2022  15:46               600 MRExtreme.py
23/08/2022  15:14               194 projects.txt
               7 File(s)         26,282 bytes
               3 Dir(s)  41,520,660,480 bytes free
