# Analyzing loan application data stored in HDFS (Hadoop Distributed File System). The HDFS cluster currently has two live DataNodes.
### Author: Xingjian (James) Tian

In [1]:
import requests
import pyarrow as pa
import pyarrow.fs
import io
import re

In [2]:
# Q1: How many live DataNodes are in the cluster?
# `hdfs dfsadmin -report` prints a status report for the file system passed via -fs;
# the answer is the N in the report's "Live datanodes (N):" line.
!hdfs dfsadmin -fs hdfs://boss:9000 -report

Configured Capacity: 51642105856 (48.10 GB)
Present Capacity: 15416885248 (14.36 GB)
DFS Remaining: 15416827904 (14.36 GB)
DFS Used: 57344 (56 KB)
DFS Used%: 0.00%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (2):

Name: 172.19.0.2:9866 (project_hdfs-dn-2.project_hdfs_default)
Hostname: 1427c7fc7f7b
Decommission Status : Normal
Configured Capacity: 25821052928 (24.05 GB)
DFS Used: 28672 (28 KB)
Non DFS Used: 18095833088 (16.85 GB)
DFS Remaining: 7708413952 (7.18 GB)
DFS Used%: 0.00%
DFS Remaining%: 29.85%
Configured Cache

##### The output above contains the line **"Live datanodes (2):"**, which means that the cluster currently has two live DataNodes.

### Next, I downloaded a file that is very similar to the one we worked with in CS544. It can be accessed at https://ffiec.cfpb.gov/v2/data-browser-api/view/csv?states=CA&years=2023

### The dataset includes detailed information about mortgage loan applications, originations, denials, and more, making it useful for advanced analysis of mortgage lending trends and financial compliance. I have chosen to analyze loan applications from California for the year 2023.

In [3]:
#!hdfs dfs -rm -f hdfs://boss:9000/single.csv # Uncomment if needed
#!hdfs dfs -rm -f hdfs://boss:9000/double.csv # Uncomment if needed

# Downloads the dataset for California (state=CA) from the HMDA database for the year 2023.
# The file is saved as "state_CA.csv" in the local directory.
!wget -nc -O state_CA.csv "https://ffiec.cfpb.gov/v2/data-browser-api/view/csv?states=CA&years=2023"

# Uploads the local file "state_CA.csv" to HDFS at "hdfs://boss:9000/single.csv".
# `-D dfs.block.size=1048576`: Sets the block size to 1 MB for this upload.
# `-D dfs.replication=1`: Sets the replication factor to 1 (no additional replicas).
# The file is now stored in HDFS with one block replica.
!hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=1 -cp state_CA.csv hdfs://boss:9000/single.csv

# Uploads the local file "state_CA.csv" to HDFS at "hdfs://boss:9000/double.csv".
# `-D dfs.block.size=1048576`: Sets the block size to 1 MB for this upload.
# `-D dfs.replication=2`: Sets the replication factor to 2 (two replicas for fault tolerance).
# The file is now stored in HDFS with two block replicas.
!hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=2 -cp state_CA.csv hdfs://boss:9000/double.csv
!hdfs dfs -ls hdfs://boss:9000/

--2024-12-25 04:19:02--  https://ffiec.cfpb.gov/v2/data-browser-api/view/csv?states=CA&years=2023
Resolving ffiec.cfpb.gov (ffiec.cfpb.gov)... 23.12.13.114, 2600:1407:3c00:1583::31d7, 2600:1407:3c00:1580::31d7
Connecting to ffiec.cfpb.gov (ffiec.cfpb.gov)|23.12.13.114|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cfpb-hmda-public.s3.amazonaws.com/prod/data-browser/2023/filtered-queries/snapshot/9fce30c1c3dac145d3783c944f4beb1e.csv [following]
--2024-12-25 04:19:03--  https://cfpb-hmda-public.s3.amazonaws.com/prod/data-browser/2023/filtered-queries/snapshot/9fce30c1c3dac145d3783c944f4beb1e.csv
Resolving cfpb-hmda-public.s3.amazonaws.com (cfpb-hmda-public.s3.amazonaws.com)... 3.5.28.212, 52.216.222.49, 16.182.38.1, ...
Connecting to cfpb-hmda-public.s3.amazonaws.com (cfpb-hmda-public.s3.amazonaws.com)|3.5.28.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 360608549 (344M) [text/csv]
Saving to: ‘state_CA.cs

In [4]:
# Q2: what are the logical and physical sizes of the CSV files?
# `-du -h` reports sizes in human-readable units. In the output below, the first column is the
# logical file size and the second is the space consumed once replication is counted.
!hdfs dfs -du -h hdfs://boss:9000/double.csv
!hdfs dfs -du -h hdfs://boss:9000/single.csv

343.9 M  687.8 M  hdfs://boss:9000/double.csv
343.9 M  343.9 M  hdfs://boss:9000/single.csv


##### Answer for **Q2**: The first two columns show the logical and physical sizes, respectively.
##### **single.csv**: Stored with a replication factor of 1, so physical size = logical size (343.9 M).
##### **double.csv**: Stored with a replication factor of 2, so physical size = 2 × logical size (687.8 M).

# WebHDFS

In [5]:
# Q3: what is the file status for single.csv?

# By default, WebHDFS runs on port 9870.
# The GETFILESTATUS operation retrieves metadata about a specific file in HDFS, such as its size,
# replication factor, block size, etc.
# timeout: requests waits forever by default, so fail fast if the NameNode does not answer.
r = requests.get("http://boss:9870/webhdfs/v1/single.csv?op=GETFILESTATUS", timeout=30)
r.raise_for_status()  # surface HTTP errors instead of trying to parse an error page as JSON
r.json()

{'FileStatus': {'accessTime': 1735100369146,
  'blockSize': 1048576,
  'childrenNum': 0,
  'fileId': 16386,
  'group': 'supergroup',
  'length': 360608549,
  'modificationTime': 1735100380817,
  'owner': 'root',
  'pathSuffix': '',
  'permission': '644',
  'replication': 1,
  'storagePolicy': 0,
  'type': 'FILE'}}

##### Key fields explanation: 
##### blockSize: The block size of the file, which is **1 MB (1048576 bytes)**.
##### length: The logical size of the file in bytes, which is about **343 MB**.
##### replication: The replication factor of the file, which is 1 (no additional replicas).

In [6]:
# Q4: what is the location for the first block of single.csv?

# The OPEN operation returns a URL that points to the DataNode hosting the requested data.
# Query parameters:
#   offset=0         start reading at the beginning of the file, i.e. in its first block.
#   noredirect=true  tells the NameNode not to redirect to the DataNode; instead it returns the
#                    DataNode URL in the JSON response under "Location".
# timeout: requests waits forever by default, so fail fast if the NameNode does not answer.
r = requests.get(
    "http://boss:9870/webhdfs/v1/single.csv?op=OPEN&offset=0&noredirect=true",
    timeout=30,
)
r.raise_for_status()  # surface HTTP errors instead of failing later on a missing key
r.json()['Location']

'http://1427c7fc7f7b:9864/webhdfs/v1/single.csv?op=OPEN&namenoderpcaddress=boss:9000&offset=0'

In [7]:
# Q5: how are the blocks of single.csv distributed across the two DataNode containers?

# GETFILEBLOCKLOCATIONS returns one entry per block, each listing the hosts that store it.
# timeout: requests waits forever by default, so fail fast if the NameNode does not answer.
r = requests.get("http://boss:9870/webhdfs/v1/single.csv?op=GETFILEBLOCKLOCATIONS", timeout=30)
r.raise_for_status()
info = r.json()["BlockLocations"]["BlockLocation"]

# Tally the number of blocks per DataNode host.
answer = {}
# The loop variable is deliberately not called `dict`: in a notebook that name would leak into the
# global namespace and shadow the builtin for every later cell.
for block in info:
    # Only the first listed host is counted for each block.
    host = block["hosts"][0]
    answer[host] = answer.get(host, 0) + 1

answer

{'1427c7fc7f7b': 150, 'dcf2a6d93ee2': 194}

In [8]:
# Open a PyArrow connection to HDFS at boss:9000 so that files stored there can be
# read directly from Python in the cells below.
hdfs = pa.fs.HadoopFileSystem(host="boss", port=9000)

2024-12-25 04:20:18,008 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
# Q6: what are the first 10 bytes of single.csv?

# Open "single.csv" from HDFS inside a `with` block so the file handle is closed as soon as
# the read finishes instead of staying open for the lifetime of the kernel.
with hdfs.open_input_file("hdfs://boss:9000/single.csv") as f:
    # read_at(nbytes, offset): read 10 bytes starting at offset 0 (the beginning of the file).
    first_bytes = f.read_at(10, 0)

first_bytes

b'activity_y'

In [10]:
# Q7: how many lines of single.csv contain the string "Single Family"?

with hdfs.open_input_file("hdfs://boss:9000/single.csv") as f:
    # io.BufferedReader reads the HDFS file in chunks; io.TextIOWrapper decodes those bytes and
    # yields one text line at a time. The encoding is set explicitly so the result does not
    # depend on the locale of the machine running the notebook.
    reader = io.TextIOWrapper(io.BufferedReader(f), encoding="utf-8")
    # Count each matching line exactly once. Counting occurrences (e.g. with re.findall) would
    # over-count any line that mentions "Single Family" more than once.
    count = sum(1 for line in reader if "Single Family" in line)
count

940991