![image](https://user-images.githubusercontent.com/57321948/196933065-4b16c235-f3b9-4391-9cfe-4affcec87c35.png)

# Submitted by: Mohammad Wasiq

## Email: `gl0427@myamu.ac.in`

# Pre-Placement Training Assignment - `Big Data` 

## Hadoop

**Q1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.**

**Ans :** 

In [None]:
import configparser

# Read the Hadoop configuration file.
# NOTE(review): configparser only parses INI syntax, so 'hadoop.conf' is
# assumed to be an INI-style file with [core-site], [hdfs-site] and
# [yarn-site] sections -- confirm; Hadoop's stock *-site.xml files are XML.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means nothing was loaded.
# Fail here with a clear message instead of a KeyError on the first lookup.
if not config.read('hadoop.conf'):
    raise FileNotFoundError("Hadoop configuration file 'hadoop.conf' could not be read")

# Display the core components of Hadoop
core_components = config['core-site']['fs.defaultFS']
print(f"fs.defaultFS: {core_components}")

hdfs_components = config['hdfs-site']['dfs.nameservices']
print(f"dfs.nameservices: {hdfs_components}")

yarn_components = config['yarn-site']['yarn.resourcemanager.hostname']
print(f"yarn.resourcemanager.hostname: {yarn_components}")

**Q2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.**

**Ans :**

In [None]:
import pyarrow.hdfs

def calculate_total_file_size(hdfs_host, hdfs_port, hdfs_directory):
    """Return the combined size, in bytes, of the entries listed in an HDFS directory.

    Only the entries returned by a single ``ls`` call on ``hdfs_directory``
    are summed; sub-directories are not descended into.

    Args:
        hdfs_host: Host name passed to ``pyarrow.hdfs.connect``.
        hdfs_port: Port passed to ``pyarrow.hdfs.connect``.
        hdfs_directory: HDFS path whose entries are summed.

    Returns:
        The sum of the ``size`` field of every listed entry.
    """
    # Connect to HDFS
    hdfs = pyarrow.hdfs.connect(host=hdfs_host, port=hdfs_port)
    try:
        # With detail=True each entry is a dict of file attributes, so the
        # size must be read by key; attribute access (entry.size) fails.
        file_statuses = hdfs.ls(hdfs_directory, detail=True)
        return sum(status['size'] for status in file_statuses)
    finally:
        # Release the connection even when the listing raises.
        hdfs.close()

# Example usage: size up /user/data on the HDFS endpoint at localhost:9000
hdfs_host, hdfs_port, hdfs_directory = 'localhost', 9000, '/user/data'

total_size = calculate_total_file_size(
    hdfs_host=hdfs_host,
    hdfs_port=hdfs_port,
    hdfs_directory=hdfs_directory,
)
print(f"Total file size in HDFS directory '{hdfs_directory}': {total_size} bytes")

**Q3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.**

**Ans :**

In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class TopNWords(MRJob):
    """Two-step job that counts words and emits the N most frequent ones.

    Step one tokenizes each line and totals the occurrences of every word.
    Step two gathers all (count, word) pairs under one key, sorts them in
    descending order and yields the first N as ``word, count``.
    """

    def configure_args(self):
        """Register the ``--N`` option (int, default 10) on top of the base options."""
        super().configure_args()
        self.add_passthru_arg(
            '--N',
            type=int,
            default=10,
            help='Number of top words to display',
        )

    def mapper_get_words(self, _, line):
        """Yield ``(word, 1)`` for every ``\\w+`` token of the lower-cased line."""
        for token in re.findall(r'\w+', line.lower()):
            yield token, 1

    def combiner_count_words(self, word, counts):
        """Pre-aggregate the counts seen for ``word``."""
        partial_total = sum(counts)
        yield word, partial_total

    def reducer_count_words(self, word, counts):
        """Total the counts for ``word`` and re-key the result under ``None``.

        The shared ``None`` key routes every pair to the same reducer call in
        the next step.
        """
        total = sum(counts)
        yield None, (total, word)

    def reducer_find_top_words(self, _, word_count_pairs):
        """Yield ``word, count`` for the N largest (count, word) pairs."""
        limit = self.options.N
        ranked = list(word_count_pairs)
        # Descending order: highest count first, ties broken by word descending.
        ranked.sort(reverse=True)
        for total, token in ranked[:limit]:
            yield token, total

    def steps(self):
        """Describe the pipeline: a counting step followed by a ranking step."""
        counting_step = MRStep(
            mapper=self.mapper_get_words,
            combiner=self.combiner_count_words,
            reducer=self.reducer_count_words,
        )
        ranking_step = MRStep(reducer=self.reducer_find_top_words)
        return [counting_step, ranking_step]

if __name__ == "__main__":
    # Script entry point: hand control to the job's own runner.
    TopNWords.run()

**Q4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.**

**Ans :** 

In [None]:
import requests

# Hadoop cluster URL
# The <namenode_host> and <port> placeholders must be replaced with real
# values before running; the check functions append a "/jmx?qry=..." query
# to this base address.
hadoop_url = "http://<namenode_host>:<port>"

def check_namenode_status():
    """Print the NameNode's state and the number of live DataNodes it reports.

    ``State`` is read from the ``NameNodeStatus`` JMX bean. The live-node
    count is not an attribute of that bean, so it is read from the
    ``FSNamesystemState`` bean instead. Any failure (connection problem,
    non-200 answer, malformed or incomplete JSON) is reported with the same
    error line rather than raised.
    """
    error_message = "Error checking NameNode status."
    bean_urls = (
        hadoop_url + "/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus",
        hadoop_url + "/jmx?qry=Hadoop:service=NameNode,name=FSNamesystemState",
    )

    attributes = {}
    try:
        for bean_url in bean_urls:
            # A timeout keeps the health check from hanging on a dead host.
            response = requests.get(bean_url, timeout=10)
            if response.status_code != 200:
                print(error_message)
                return
            # "beans" may be empty when the query matches nothing, so merge
            # whatever was returned instead of indexing [0] blindly.
            for bean in response.json().get('beans', []):
                attributes.update(bean)
    except (requests.RequestException, ValueError):
        print(error_message)
        return

    state = attributes.get('State')
    live_nodes = attributes.get('NumLiveDataNodes')
    if state is None or live_nodes is None:
        print(error_message)
        return

    print("NameNode Status:")
    print("State: ", state)
    print("Live Nodes: ", live_nodes)

def check_datanode_status():
    """Print how many DataNodes the NameNode currently sees as live and dead.

    ``hadoop_url`` addresses the NameNode, and the live/dead counters are
    attributes of its ``FSNamesystemState`` JMX bean, so that bean is queried
    rather than a DataNode-side bean. Any failure (connection problem,
    non-200 answer, malformed or incomplete JSON) is reported with an error
    line rather than raised.
    """
    error_message = "Error checking DataNode status."
    datanode_url = hadoop_url + "/jmx?qry=Hadoop:service=NameNode,name=FSNamesystemState"

    try:
        # A timeout keeps the health check from hanging on a dead host.
        response = requests.get(datanode_url, timeout=10)
        if response.status_code != 200:
            print(error_message)
            return
        beans = response.json().get('beans', [])
    except (requests.RequestException, ValueError):
        print(error_message)
        return

    # Guard against an empty bean list and missing attributes instead of
    # letting IndexError/KeyError escape.
    bean = beans[0] if beans else {}
    live_nodes = bean.get('NumLiveDataNodes')
    dead_nodes = bean.get('NumDeadDataNodes')
    if live_nodes is None or dead_nodes is None:
        print(error_message)
        return

    print("DataNode Status:")
    print("Live Nodes: ", live_nodes)
    print("Dead Nodes: ", dead_nodes)

if __name__ == "__main__":
    # NameNode report first, then a blank separator line, then the DataNode report.
    check_namenode_status()
    print()
    check_datanode_status()

**Q5. Develop a Python program that lists all the files and directories in a specific HDFS path.**

**Ans :**

In [None]:
import pyarrow.hdfs as hdfs

def list_hdfs_path(hdfs_path):
    """Print every entry that ``ls`` returns for an HDFS path.

    Args:
        hdfs_path: HDFS path to list.
    """
    fs = hdfs.connect()
    try:
        contents = fs.ls(hdfs_path)
    finally:
        # Release the connection even when the listing raises.
        fs.close()

    print(f"Contents of {hdfs_path}:")
    for item in contents:
        print(item)

if __name__ == "__main__":
    # Replace with your desired HDFS path
    hdfs_path = '/path/to/hdfs/directory'
    list_hdfs_path(hdfs_path=hdfs_path)

**Q6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.**

**Ans :**

In [None]:
import requests

def analyze_data_node_storage_utilization():
    """Report cluster storage utilization and the largest/smallest DataNode.

    Reads the NameNode's ``NameNodeInfo`` JMX bean at ``localhost:9870``. Its
    ``LiveNodes`` attribute is a JSON-encoded string mapping each live
    DataNode to its statistics, which is what allows a per-node comparison.
    Prints the overall used/capacity percentage plus the nodes with the
    highest and lowest capacity.

    Raises:
        requests.HTTPError: If the NameNode answers with an error status.
    """
    import json  # local import: only this function needs it

    # Retrieve per-DataNode information from the NameNode's JMX endpoint
    response = requests.get(
        'http://localhost:9870/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo',
        timeout=10,
    )
    response.raise_for_status()
    beans = response.json().get('beans', [])
    if not beans:
        print("NameNodeInfo bean not found; cannot analyze DataNode storage.")
        return

    # LiveNodes is a JSON document stored inside a string attribute:
    # {"<datanode>": {"used": ..., "capacity": ..., "remaining": ...}, ...}
    live_nodes = json.loads(beans[0]['LiveNodes'])
    if not live_nodes:
        print("No live DataNodes reported.")
        return

    # Extract storage utilization details
    capacity_by_node = {name: stats['capacity'] for name, stats in live_nodes.items()}
    used_storage = sum(stats['used'] for stats in live_nodes.values())
    capacity_storage = sum(capacity_by_node.values())

    # Calculate percentage utilization (guard against a zero-capacity cluster)
    utilization_percentage = (used_storage / capacity_storage) * 100 if capacity_storage else 0.0

    # Identify DataNodes with highest and lowest storage capacities
    sorted_datanodes = sorted(capacity_by_node, key=capacity_by_node.get)
    highest_storage_node = sorted_datanodes[-1]
    lowest_storage_node = sorted_datanodes[0]

    # Display storage utilization information
    print(f"Storage Utilization: {utilization_percentage:.2f}%")
    print(f"Highest Storage Capacity Node: {highest_storage_node}")
    print(f"Lowest Storage Capacity Node: {lowest_storage_node}")

if __name__ == "__main__":
    # Script entry point: run the storage report once.
    analyze_data_node_storage_utilization()

**Q7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.**

**Ans :**

In [None]:
import requests
import time

def submit_hadoop_job(jar_path, input_path, output_path,
                      rm_url='http://localhost:8088',
                      namenode_url='http://localhost:9870',
                      poll_interval=5, timeout=30):
    """Submit a WordCount job through YARN's REST API, wait for it, print its output.

    The flow is: reserve an application id, submit the application, poll its
    state until it stops, then read the job's result files from
    ``output_path`` through WebHDFS.

    Args:
        jar_path: Path of the examples jar used in the launch command.
        input_path: Job input path.
        output_path: Absolute HDFS path the job writes to; also the directory
            the result is read back from.
        rm_url: Base URL of the ResourceManager web service.
        namenode_url: Base URL of the NameNode web service (WebHDFS).
        poll_interval: Seconds to sleep between state checks.
        timeout: Per-request timeout in seconds.

    Raises:
        requests.HTTPError: If any REST call answers with an error status.
    """
    apps_url = f'{rm_url}/ws/v1/cluster/apps'

    # Step 1: ask the ResourceManager for a fresh application id.
    response = requests.post(f'{apps_url}/new-application', timeout=timeout)
    response.raise_for_status()
    app_id = response.json()['application-id']

    # Step 2: submit. The submission context is POSTed to the apps collection
    # and carries the reserved id in its body.
    # NOTE(review): the launch command assumes `yarn` is on the container's
    # PATH and that running the client this way suits the cluster -- confirm.
    data = {
        'application-id': app_id,
        'application-name': 'WordCount',
        'am-container-spec': {
            'commands': {
                'command': (
                    f'yarn jar {jar_path} org.apache.hadoop.examples.WordCount '
                    f'{input_path} {output_path}'
                )
            }
        },
        'resource': {
            'vCores': 1,
            'memory': 1024
        }
    }
    response = requests.post(apps_url, json=data, timeout=timeout)
    response.raise_for_status()

    # Monitor job progress
    while True:
        response = requests.get(f'{apps_url}/{app_id}', timeout=timeout)
        response.raise_for_status()
        app = response.json()['app']
        status = app['state']
        if status == 'FINISHED':
            break
        if status in ('FAILED', 'KILLED'):
            print('Job execution failed.')
            return
        time.sleep(poll_interval)

    # FINISHED only says the application stopped; finalStatus says how it ended.
    if app.get('finalStatus') != 'SUCCEEDED':
        print('Job execution failed.')
        return

    # Retrieve final output: the job's result lives in output_path on HDFS,
    # so list that directory and read every part-* file via WebHDFS.
    output_dir_url = f'{namenode_url}/webhdfs/v1{output_path}'
    response = requests.get(output_dir_url, params={'op': 'LISTSTATUS'}, timeout=timeout)
    response.raise_for_status()
    part_files = sorted(
        entry['pathSuffix']
        for entry in response.json()['FileStatuses']['FileStatus']
        if entry['type'] == 'FILE' and entry['pathSuffix'].startswith('part-')
    )

    chunks = []
    for part_file in part_files:
        response = requests.get(
            f'{output_dir_url}/{part_file}', params={'op': 'OPEN'}, timeout=timeout
        )
        response.raise_for_status()
        chunks.append(response.text)
    output = ''.join(chunks)

    print('Final Output:')
    print(output)

if __name__ == "__main__":
    # Example invocation with placeholder jar, input and output paths.
    submit_hadoop_job(
        jar_path='/path/to/hadoop-examples.jar',
        input_path='/input/path',
        output_path='/output/path',
    )

**Q8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.**

**Ans :**

In [None]:
import requests
import time

def submit_hadoop_job(jar_path, input_path, output_path, num_containers, container_memory):
    """Submit a WordCount job with resource requirements and log its resource usage.

    Reserves an application id, submits the application with the requested
    resources, then polls the application every five seconds, printing the
    vCores and memory currently allocated to it until it stops.

    Args:
        jar_path: Path of the examples jar used in the launch command.
        input_path: Job input path.
        output_path: Job output path.
        num_containers: Sent as the ``vCores`` of the requested resource.
        container_memory: Sent as the ``memory`` (MB) of the requested resource.

    Raises:
        requests.HTTPError: If any REST call answers with an error status.
    """
    apps_url = 'http://localhost:8088/ws/v1/cluster/apps'
    request_timeout = 30

    # Step 1: ask the ResourceManager for a fresh application id.
    response = requests.post(f'{apps_url}/new-application', timeout=request_timeout)
    response.raise_for_status()
    app_id = response.json()['application-id']

    # Step 2: submit. The submission context is POSTed to the apps collection,
    # carries the reserved id, and nests the launch command under
    # 'am-container-spec'; the resource request is a top-level field.
    # NOTE(review): num_containers is forwarded as vCores, as in the original
    # request body -- confirm that this is the intended mapping.
    data = {
        'application-id': app_id,
        'application-name': 'WordCount',
        'am-container-spec': {
            'commands': {
                'command': 'yarn jar {} WordCount {} {}'.format(jar_path, input_path, output_path)
            }
        },
        'resource': {
            'vCores': num_containers,
            'memory': container_memory
        }
    }
    response = requests.post(apps_url, json=data, timeout=request_timeout)
    response.raise_for_status()

    # Monitor job progress and resource usage
    while True:
        response = requests.get(f'{apps_url}/{app_id}', timeout=request_timeout)
        response.raise_for_status()
        app = response.json()['app']
        status = app['state']
        if status == 'FINISHED':
            # FINISHED only says the application stopped; finalStatus says how.
            if app.get('finalStatus') != 'SUCCEEDED':
                print('Job execution failed.')
            break
        if status in ('FAILED', 'KILLED'):
            print('Job execution failed.')
            return
        # Current allocation is reported as two flat fields on the app record.
        allocated_vcores = app.get('allocatedVCores')
        allocated_memory = app.get('allocatedMB')
        print(f'Resources used: vCores - {allocated_vcores}, Memory - {allocated_memory}')
        time.sleep(5)

if __name__ == "__main__":
    # Example invocation with placeholder paths and a 4 / 2048 resource request.
    submit_hadoop_job(
        jar_path='/path/to/hadoop-examples.jar',
        input_path='/input/path',
        output_path='/output/path',
        num_containers=4,
        container_memory=2048,
    )

**Q9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.**

**Ans :**

In [None]:
from mrjob.job import MRJob
import time

class WordCountJob(MRJob):
    """Word-count job whose input split size can be set from the command line.

    ``--split-size`` (in MB) is translated into Hadoop's input-split
    properties through :meth:`jobconf`, so runs with different values
    actually differ in how the input is divided.
    """

    def configure_args(self):
        """Register ``--split-size`` (int, MB, default 64) on top of the base options."""
        super().configure_args()
        # type=int keeps the value an int whether it comes from the default
        # or from the command line (which would otherwise yield a string).
        self.add_passthru_arg('--split-size', type=int, default=64,
                              help='Input split size in MB')

    def jobconf(self):
        """Return the job configuration with the requested split size applied.

        NOTE(review): these properties are expected to matter only on runners
        that launch real Hadoop jobs -- confirm for the runner in use.
        """
        conf = dict(super().jobconf())
        split_bytes = self.options.split_size * 1024 * 1024
        conf['mapreduce.input.fileinputformat.split.minsize'] = str(split_bytes)
        conf['mapreduce.input.fileinputformat.split.maxsize'] = str(split_bytes)
        return conf

    def mapper(self, _, line):
        """Yield ``(word, 1)`` for every whitespace-separated token of the line."""
        for word in line.strip().split():
            yield word, 1

    def reducer(self, word, counts):
        """Yield the total number of occurrences of ``word``."""
        yield word, sum(counts)

if __name__ == '__main__':
    def _time_job(job_args):
        """Run WordCountJob with ``job_args`` and return the elapsed seconds."""
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        WordCountJob(args=job_args).run_job()
        return time.perf_counter() - started

    # Baseline: no --split-size given, so the option's default applies.
    execution_time = _time_job(['input.txt'])
    print(f'Execution time with default split size: {execution_time:.2f} seconds')

    # Run the job with different split sizes
    split_sizes = [64, 128, 256, 512]
    for split_size in split_sizes:
        execution_time = _time_job(['--split-size', str(split_size), 'input.txt'])
        print(f'Execution time with split size {split_size} MB: {execution_time:.2f} seconds')