# National Family Health Survey, India - NFHS-5 

### Web Scraping 

The National Family Health Survey 2019-21 (NFHS-5), the fifth in the NFHS series, provides information on
population, health, and nutrition for India and each state/union territory (UT).

The National Family Health Survey 5 (2019-21) fact sheets provide information and trends for India and all the States/UTs. The fact sheet tables contain the NFHS-5 and NFHS-4 values of 131 key indicators. These fact sheets are published as PDF files.

The code below scrapes and extracts data from 37 PDF files (i.e., India and all the States/UTs). The output dataset contains the country, state, district, households, women, and men values along with the 131 key indicator values of India and all the States/UTs. The values in the final dataset are given in JSON format.


In [1]:
# Import all the necessary libraries
from bs4 import BeautifulSoup
import requests
import PyPDF2
import io
import re
import csv

In [2]:
# National Family Health Survey India webpage URL
url = "http://rchiips.org/nfhs/factsheet_NFHS-5.shtml"

# Get the HTML file from the given url.
# A timeout is set so that an unresponsive server cannot hang the notebook forever.
html_doc = requests.get(url, timeout=30)

# Fail fast on a 4xx/5xx answer instead of parsing an error page further down.
html_doc.raise_for_status()

# Show the status of the request (displayed as the cell output)
html_doc.status_code

200

In [8]:
# Build the list of fact-sheet PDF URLs from the NFHS web page.
# The page is parsed with BeautifulSoup; the PDF paths are read from the
# 'value' attribute of the elements with target="new" inside the first <select>.

soup = BeautifulSoup(html_doc.text, 'html.parser')

state_selector = soup.find("select")
pdf_options = state_selector.find_all(target="new")

# Each 'value' is appended to the site's base path.
pdf_list = ['http://rchiips.org/nfhs/' + option['value'] for option in pdf_options]


pdf_list

['http://rchiips.org/nfhs/NFHS-5_FCTS/India.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Andhra_Pradesh.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Arunachal_Pradesh.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Assam.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Bihar.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Chhattisgarh.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Goa.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Gujarat.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Haryana.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Himachal_Pradesh.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Jharkhand.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Karnataka.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Kerala.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Madhya_Pradesh.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Maharashtra.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Manipur.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Meghalaya.pdf',
 'http://rchiips.org/nfhs/NFHS-5_FCTS/Mizoram.pdf',
 'http://rchiips.org/nfhs/N

In [4]:
# Header of the output table: the six fixed columns come first, the key
# indicator names are appended while the first PDF is parsed.
columns_list = ['Country', 'State', 'District', 'Households', 'Women', 'Men']

# Values of the row that is currently being assembled (one row per PDF).
rows_list = []

# Most recently seen main category / sub-category heading.
main_category = sub_category = ""

# True while indicator names still have to be collected into columns_list.
set_up_columns = True

In [5]:
def start_match(text):
    """Return a match object if ``text`` starts like a numbered key indicator.

    A line qualifies when it begins with one of:
      * three digits (first digit 0-1, second digit 0-3), optional whitespace,
        a period, one whitespace character and an upper-case letter;
      * two digits, a period, one whitespace character and an upper-case letter;
      * one digit, a period, one whitespace character and an upper-case letter.

    Returns the ``re.Match`` object, or ``None`` when the line does not qualify.
    """
    # Raw string: the pattern contains "\s" and "\.", which are invalid escape
    # sequences in a normal string literal.
    return re.search(
        r"^[0-1][0-3][0-9]\s*\.\s[A-Z]|^[0-9][0-9]\.\s[A-Z]|^[0-9]\.\s[A-Z]",
        text,
    )

def middle_match(text):
    """Locate the "(%)" marker inside ``text``.

    The marker only counts when it is not immediately followed by a word
    character (the pattern ends with ``\\B``), e.g. when a space or the end of
    the string comes next.

    Returns the ``re.Match`` object, or ``None`` when no such marker exists.
    """
    percent_marker = re.compile(r"\(\%\)\B")
    return percent_marker.search(text)

def end_match(text):
    """Return a match object if ``text`` ends with exactly four value tokens.

    Trailing whitespace is ignored. Every token must be preceded by whitespace
    and be one of: an integer, ``*``, ``na``, a number with a thousands comma,
    a decimal number, or a parenthesised integer / decimal / comma number
    (a single whitespace before the decimal point is tolerated inside
    parentheses).

    Returns the ``re.Match`` covering the four tokens (including the whitespace
    in front of the first one), or ``None``.
    """
    # Raw string: the pattern is full of regex escapes that are invalid escape
    # sequences in a normal string literal.
    return re.search(
        r"(\s+(\d+|\*|na|\d+,\d+|\d+\.\d+|\(\d+\)|\(\d+\.\d+\)|\(\d+\,\d+\)|\(\d+\s\.\d+\))){4}$",
        text.rstrip(),
    )

def format_column_value(value):
    """Strip the leading indicator number ("12. ") from a column name.

    The prefix is: digits, optional whitespace, a period and one whitespace
    character. The optional whitespace mirrors ``start_match``, which also
    accepts lines such as "101 . Name".

    Returns the text after the prefix. When ``value`` carries no such prefix it
    is returned unchanged (previously ``None`` was returned, which made the
    caller's string concatenation fail with ``TypeError``).
    """
    prefix = re.match(r"\d+\s*\.\s", value)
    if prefix is None:
        return value
    return value[prefix.end():]

def format_row_value(value):
    """Normalise the value part of an indicator line and split it into tokens.

    The substitutions are applied in this order:
      1. drop one optional whitespace on either side of a period;
      2. drop one optional whitespace on either side of a comma;
      3. remove commas and parentheses altogether.

    Returns the whitespace-separated tokens as a list of strings.
    """
    # Raw strings: "\s", "\." and "\," are invalid escape sequences in normal
    # string literals.
    substitutions = (
        (r"\s?\.\s?", "."),
        (r"\s?\,\s?", ","),
        (r",|\(|\)", ""),
    )
    for pattern, replacement in substitutions:
        value = re.sub(pattern, replacement, value)

    return value.split()

def remove_start_end_spaces(text):
    """Return ``text`` without its leading and trailing whitespace."""
    return text.strip()

def remove_all_spaces(string):
    """Return ``string`` with every whitespace character removed."""
    chunks = string.split()  # split() without arguments cuts on any whitespace run
    return "".join(chunks)

def remove_all_extra_spaces(text):
    """Collapse every whitespace run in ``text`` to one space and trim the ends."""
    words = text.split()  # split() without arguments cuts on any whitespace run
    return " ".join(words)

def sub_category_match(text):
    """Return a match object if ``text`` ends with a sub-category heading.

    Trailing whitespace is ignored. The recognised endings are "Women", "Men"
    and "Maternity Care (for last birth in the 5 years before the survey)".
    Only the end of the line is tested, so any line that merely ends with one
    of these strings matches as well.

    Returns the ``re.Match`` object, or ``None``.
    """
    # Raw string: "\(" and "\)" are invalid escape sequences in a normal
    # string literal.
    return re.search(
        r"Women$|Men$|Maternity Care \(for last birth in the 5 years before the survey\)$",
        text.rstrip(),
    )

def key_category_match(current_line, next_line):
    """Decide whether ``current_line`` is a main-category heading.

    The checks run in this order:
      1. ``current_line`` ends with four "Urban"/"Rural"/"Total" words: the
         text in front of those four words is returned.
      2. ``next_line`` (after whitespace normalisation) starts like a numbered
         key indicator: ``current_line`` without trailing whitespace is returned.
      3. ``next_line`` ends with a sub-category heading: ``current_line``
         without trailing whitespace is returned.

    Returns the heading text, or ``False`` when none of the checks succeeds.
    Only ``next_line`` is normalised here; ``current_line`` is used as given.
    """
    next_line = remove_all_extra_spaces(next_line)

    # Raw string: "\s" is an invalid escape sequence in a normal string literal.
    urt_match = re.search(r"(\s+(Urban|Rural|Total)){4}$", current_line.rstrip())
    if urt_match:
        return current_line[:urt_match.span()[0]]

    if start_match(next_line):
        return current_line.rstrip()

    if sub_category_match(next_line):
        return current_line.rstrip()

    return False
    
def get_hwm(text):
    """Get the number of households, women, and men interviewed.

    Looks for a phrase of the form
    "<n> households, <n> women, and <n> men", where each <n> is made of
    digits, commas and whitespace.

    Returns a three-item list ``[households, women, men]``. Each item is the
    number exactly as written in the text with all whitespace removed (commas
    are kept), or ``[None, None, None]`` when the phrase is not found.
    """
    # Capture groups pick the three numbers directly, so the result no longer
    # depends on the separators being written with single spaces (the former
    # split-based parsing raised IndexError otherwise).
    match = re.search(
        r"\s([\d,\s]*)\shouseholds,\s([\d,\s]*)\swomen,\sand\s([\d,\s]*)\smen",
        text,
    )
    if match is None:
        return [None, None, None]

    # Drop any whitespace that was captured inside a number.
    return ["".join(number.split()) for number in match.groups()]

def get_csd(text):
    """Get the Country, State and District values of one fact sheet.

    ``text`` is a two-item sequence ``[url, content]``: the URL of the PDF and
    the text of its first page.

    Returns a three-item list ``[country, state, district]``:
      * URL ends with "India.pdf": ``["India", "na", "na"]``.
      * URL ends with "/XX/<name>.pdf" (two upper-case letters as folder): the
        district is ``<name>`` with underscores turned into spaces, and the
        state is whatever follows the first occurrence of the district name in
        ``content`` ("na" when the district name does not occur there).
      * URL ends with "/<name>.pdf": the state is ``<name>`` with underscores
        turned into spaces, the district is "na".
      * anything else: ``["na", "na", "na"]``.
    """
    url, content = text[0], text[1]

    # The dot of ".pdf" is escaped everywhere so that it only matches a literal
    # period.
    if re.search(r"India\.pdf$", url):
        return ["India", "na", "na"]

    district_match = re.search(r"/[A-Z]{2}/[A-Za-z_]*\.pdf$", url)
    if district_match:
        district = re.sub(
            r"/[A-Z]{2}/|_|\.pdf$", " ", district_match.group()
        ).strip()
        # str.split("") raises ValueError, and a district that is missing from
        # the page text yields a single piece: fall back to "na" in both cases
        # instead of crashing.
        pieces = content.split(district) if district else []
        state = pieces[1] if len(pieces) > 1 else "na"
        return ["India", state, district]

    state_match = re.search(r"/[A-Za-z_]*\.pdf$", url)
    if state_match:
        state = re.sub(r"/|_|\.pdf$", " ", state_match.group()).strip()
        return ["India", state, "na"]

    return ["na", "na", "na"]
    

def row_value(text):
    """Convert the value part of an indicator line into a JSON object string.

    ``text`` is normalised with ``format_row_value``; its first four tokens
    become the NFHS-5 Urban / Rural / Total values and the NFHS-4 Total value.
    The module-level ``main_category`` is stored under "Category".

    Tokens that are valid JSON numbers are written as numbers. Every other
    token (for example "*" or "na") is written as a JSON string, and the
    category is escaped, so the result is always parseable JSON.

    Raises ``IndexError`` when ``text`` yields fewer than four tokens.
    """
    import json  # local import: only this function needs it

    def as_json(token):
        # Keep plain numbers byte-for-byte; quote anything else.
        if re.fullmatch(r"-?(?:0|[1-9]\d*)(?:\.\d+)?", token):
            return token
        return json.dumps(token, ensure_ascii=False)

    values = [as_json(token) for token in format_row_value(text)]
    category = json.dumps(main_category, ensure_ascii=False)

    str_format = (
        '{ "Category" : ' + category
        + ', "NFHS-5" : { "Urban" : ' + values[0]
        + ', "Rural" : ' + values[1]
        + ', "Total" : ' + values[2]
        + ' }, "NFHS-4" : { "Total" : ' + values[3]
        + ' } }'
    )
    return str_format
    

In [6]:
#  Define Columns
# ------------------
#  The first row of the final csv file is the header (the columns of the dataset).
#  The header includes the 131 key indicators along with country, state, district, households, women, and men.
#  The 131 key indicators are fetched from the pdf file.
#  Country, state, district, households, women, and men are inserted manually via a user-defined list (columns_list).
#  The manually entered columns and the 131 key indicators are concatenated into a single list (columns_list),
#  which is then written to the final csv file.
# 
#  Some of the column names(key indicator names) have a sub-category value:
#  For example:
#  Blood Sugar Level among Adults (age 15 years and above ) --> Main Category of Key Indicator
#  Women                                                    --> Sub-category of Key Indicator
#  99. Blood sugar level - very high (>160 mg/dl)23 (%)     --> Key Indicator (Column name)
# 
#  The main category of the key indicator is passed into the row value (see "Define Rows" below).
#  The sub-category of the key indicator is concatenated with the column name like this:
#  Blood sugar level - very high (>160 mg/dl)23 (%) - Women
#  
# 
#  Define Rows 
# ---------------
#  Rows (key indicator values) are extracted and formatted using the functions defined in the cell above (In [5]).
#  All 131 key indicator values of India and of each state/UT are added to the rows in json format.
# 
#  Sample json formatted row value of the key indicator - Female population age 6 years and above who ever attended school (%): 
# 
#  { "Category" : "Population and Household Profile",  # Main category of the key indicator
#    "NFHS-5" : {                                      
#                  "Urban" : 82.5,        # NFHS-5 Urban value of Female population age 6 years and above who ever attended school (%)
#                  "Rural" : 66.8,        # NFHS-5 Rural value of Female population age 6 years and above who ever attended school (%)
#                  "Total" : 71.8 },      # NFHS-5 Total value of Female population age 6 years and above who ever attended school (%)
#    "NFHS-4" : { 
#                  "Total" : 68.8 }       # NFHS-4 Total value of Female population age 6 years and above who ever attended school (%)
#  }
# 
# 

def extract_data(pdf_content):
    """Parse the indicator pages of one fact sheet, line by line.

    For every numbered key-indicator line the JSON row value is appended to the
    module-level ``rows_list``; while the module-level ``set_up_columns`` flag
    is true, the indicator name (plus the current sub-category) is appended to
    ``columns_list`` as well. Heading lines update the module-level
    ``main_category`` and ``sub_category``.

    The last line of ``pdf_content`` is never visited on its own, because each
    step may need to look at the line that follows.
    """
    global main_category, sub_category
    lines = pdf_content.splitlines()

    for idx in range(len(lines) - 1):
        lines[idx] = remove_all_extra_spaces(lines[idx])
        current = lines[idx]
        following = lines[idx + 1]  # not normalised yet at this point

        if start_match(current):
            # Work out where the indicator name ends and the values begin.
            percent = middle_match(current)
            if percent:
                # Name ends right after the "(%)" marker.
                cut = percent.span()[1]
                label, values = current[:cut], current[cut:]
            else:
                trailing = end_match(current)
                if trailing:
                    # Name is everything in front of the four trailing values.
                    cut = trailing.span()[0]
                    label, values = current[:cut], current[cut:]
                else:
                    next_trailing = end_match(following)
                    if next_trailing is None:
                        # Neither this line nor the next one carries the values.
                        print("-----------------------------------------------")
                        continue
                    # Name continues on the next line, which also holds the values.
                    cut = next_trailing.span()[0]
                    label = current + " " + following[:cut]
                    values = following[cut:]

            if set_up_columns:
                columns_list.append(format_column_value(label) + sub_category)
            rows_list.append(row_value(values))
            continue

        # Not a numbered indicator line. The order of the checks below matters.

        # 1. Second line of a two-line indicator (ends with four values): skip.
        if end_match(current):
            continue

        # 2. Sub-category heading.
        if sub_category_match(current):
            sub_category = " - " + current
            continue

        # 3. Main category heading; a new main category resets the sub-category.
        heading = key_category_match(current, following)
        if heading:
            main_category = heading
            sub_category = ""
            

def write_csv(data):
    """Create (or overwrite) 'state-nfhs5.csv' with ``data`` as its only row.

    The file is written as UTF-8 with a byte-order mark ('utf_8_sig').
    """
    with open('state-nfhs5.csv', 'w', encoding='utf_8_sig', newline='') as csv_file:
        # write the column data
        csv.writer(csv_file).writerow(data)
        

def add_row(data):
    """Append ``data`` as one more row at the end of 'state-nfhs5.csv'.

    The file is opened in append mode with the 'utf_8_sig' encoding.
    """
    with open('state-nfhs5.csv', 'a', encoding='utf_8_sig', newline='') as csv_file:
        # write the data
        csv.writer(csv_file).writerow(data)


In [7]:
# Download every fact sheet, parse it and write one csv row per PDF.
# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy PyPDF2 names; newer PyPDF2 releases appear to have dropped them -
# confirm which version is installed before upgrading.
for n, pdf_url in enumerate(pdf_list):
    print(pdf_url)

    # A timeout keeps one unresponsive download from blocking the whole run;
    # raise_for_status stops on an HTTP error instead of handing an error page
    # to the PDF parser.
    response = requests.get(pdf_url, timeout=60)
    response.raise_for_status()

    indicator_pages = []
    with io.BytesIO(response.content) as open_pdf_file:
        read_pdf = PyPDF2.PdfFileReader(open_pdf_file)
        num_pages = read_pdf.getNumPages()

        for i in range(num_pages):
            print(i)
            page_content = read_pdf.getPage(i).extractText()

            if i == 0:
                # First page: source of the country / state / district values.
                text = remove_all_extra_spaces(page_content)
                csd = get_csd([pdf_url, text])
                rows_list.extend(csd)
            elif i == 1:
                # Second page: source of the households / women / men counts.
                hwm = get_hwm(remove_all_extra_spaces(page_content))
                rows_list.extend(hwm)
            elif i < num_pages - 1:
                # Pages from the third up to, but excluding, the last one are
                # handed to extract_data.
                indicator_pages.append(page_content)

    # Same text as repeatedly appending "\n" + page, built in one pass.
    full_page_content = "".join("\n" + page for page in indicator_pages)
    extract_data(full_page_content)

    # write column data (only once, after the first PDF has filled columns_list)
    if n == 0:
        write_csv(columns_list)
        set_up_columns = False

    # write row data, then start the next PDF with an empty row
    add_row(rows_list)
    rows_list.clear()


http://rchiips.org/nfhs/NFHS-5_FCTS/India.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Andhra_Pradesh.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Arunachal_Pradesh.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Assam.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Bihar.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Chhattisgarh.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Goa.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Gujarat.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Haryana.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Himachal_Pradesh.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Jharkhand.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Karnataka.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Kerala.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Madhya_Pradesh.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS/Maharashtra.pdf
0
1
2
3
4
5
6
http://rchiips.org/nfhs/NFHS-5_FCTS