In [None]:
pip install pdfplumber

For PDF link

In [13]:
import pdfplumber
import pandas as pd
from pathlib import Path
import openpyxl
import re

def remove_illegal_characters(text):
    """
    Remove characters that are not allowed in Excel.
    """
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')
    return ILLEGAL_CHARACTERS_RE.sub('', text)


def clean_text(text):
    """
    Clean text by removing extra spaces and newlines
    """
    return ' '.join(text.split())

def find_section(text, section):
    """
    Find section considering possible line breaks and variations
    Returns the index of the section if found, -1 otherwise
    """
    if section == "Key words":
        idx = text.find("Key words")
        if idx != -1:
            return idx
        idx = text.find("Keywords")
        if idx != -1:
            return idx
    
    idx = text.find(section)
    if idx != -1:
        return idx
    
    section_space = ' '.join(section.split())
    idx = text.find(section_space)
    if idx != -1:
        return idx
    
    parts = section.split()
    if len(parts) > 1:
        first_part = parts[0]
        idx = text.find(first_part)
        if idx != -1:
            next_text = text[idx:idx+len(section)+10]
            next_text_clean = clean_text(next_text)
            if section_space in next_text_clean:
                return idx
    
    return -1

def extract_sections_from_pdf(pdf_path, sections):
    """
    Extract sections from PDF using a simple linear search approach.
    Handles multi-line section headers.
    """
    extracted_sections = {section: "" for section in sections}
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = " ".join([page.extract_text() for page in pdf.pages])
            text = clean_text(text)
            
            for i, section in enumerate(sections):
                start_idx = find_section(text, section)
                if start_idx != -1:
                    if section == "Key words" and text[start_idx:start_idx+8] == "Keywords":
                        content_start = start_idx + 8
                    else:
                        content_start = start_idx + len(section)
                    
                    content_end = len(text)
                    for next_section in sections[i+1:]:
                        next_idx = find_section(text, next_section)
                        if next_idx != -1:
                            content_end = next_idx
                            break
                    
                    content = text[content_start:content_end].strip()
                    extracted_sections[section] = remove_illegal_characters(content)

    
    except Exception as e:
        print(f"Error reading PDF: {str(e)} in file: {pdf_path}")  
        return None
    
    return extracted_sections

def process_pdfs_to_excel(pdf_folder, output_excel, sections, num_files=None):
    """
    Process PDFs in a folder and create an Excel file.
    """
    pdf_files = list(Path(pdf_folder).glob('*.pdf'))
    if num_files:
        pdf_files = pdf_files[:num_files]
    
    all_data = []
    total_files = len(pdf_files)
    
    print(f"Processing {total_files} files...")
    
    for i, pdf_file in enumerate(pdf_files, 1):
        if i % 400 == 0 or i == total_files:
            print(f"Processing ({i}/{total_files}) files...")
        
        sections_data = extract_sections_from_pdf(pdf_file, sections)
        
        if sections_data:
            sections_data['Filename'] = pdf_file.name
            all_data.append(sections_data)
    
    if not all_data:
        print("No data was successfully extracted from any PDF.")
        return
    
    df = pd.DataFrame(all_data)
    
    for col in ['Filename'] + sections:
        if col not in df.columns:
            df[col] = ""
    
    columns = ['Filename'] + sections
    df = df[columns]
    
    df.to_excel(output_excel, index=False)
    print(f"\nProcessed {len(all_data)} files successfully")
    print(f"Saved to: {output_excel}")
    return df

sections = [
    "Overview",
    "Performance criteria",
    "Knowledge and understanding",
    "Additional Information",
    "Developed by",
    "Key words"  
]

pdf_folder = r"C:\\Users\\amith\\Kenpath\\NOS"
output_excel = r"C:\\Users\\amith\\Kenpath\\parsed_nos.xlsx"
df=process_pdfs_to_excel(pdf_folder, output_excel, sections)


Processing 15295 files...
Processing (400/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Apply_species_identification_skills__Legacy.pdf
Processing (800/15295) files...
Processing (1200/15295) files...
Processing (1600/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Carry_out_tremie_operations.pdf
Processing (2000/15295) files...
Processing (2400/15295) files...
Processing (2800/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Contribute_to_the_management_of_finances_for_properties_in_England,_Wales_and_Northern_Ireland.pdf
Processing (3200/15295) files...
Processing (3600/15295) files...


Data-loss while decompressing corrupted data


Processing (4000/15295) files...
Processing (4400/15295) files...
Processing (4800/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Engage_participants_in_community_arts_activities_.pdf
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Erect_fire_resisting_walls_and_wall_linings.pdf
Processing (5200/15295) files...
Processing (5600/15295) files...
Processing (6000/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Identify_opportunities_to_increase_retail_sales_of_particular_products_Legacy.pdf




Processing (6400/15295) files...
Processing (6800/15295) files...
Processing (7200/15295) files...
Processing (7600/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Maintain_an_asset_register_and_service_history_information_in_healthcare.pdf


Data-loss while decompressing corrupted data


Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Maintain_the_performance_of_electronic_security_systems_LEGACY.pdf




Processing (8000/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Make_sure_your_own_actions_reduce_risks_to_health_and_safety_Legacy.pdf
Processing (8400/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Manage_relationships_during_change_management_programmes_for_IT_enabled_systems.pdf


Data-loss while decompressing corrupted data


Processing (8800/15295) files...
Processing (9200/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Optimise_and_control_contract_progress_and_costs_in_construction_contracting_operations_management_Legacy.pdf
Processing (9600/15295) files...


Data-loss while decompressing corrupted data


Processing (10000/15295) files...
Processing (10400/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Prepare_for_and_carry_out_industrial_pressure_cleaning_operations.pdf
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Prepare_raw_materials_and_equipment_for_processing.pdf
Processing (10800/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Process_applications_for_membership_to_a_collecting_society.pdf
Processing (11200/15295) files...
Processing (11600/15295) files...
Processing (12000/15295) files...
Processing (12400/15295) files...
Processing (12800/15295) files...
Processing (13200/15295) files...
Processing (13600/15295) files...
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Support_children_and_young_people_through_major_transitions_.pdf
Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Support_children’s_play_and_learning.pdf
Processing (140

Data-loss while decompressing corrupted data


Error reading PDF: Unexpected EOF in file: C:\Users\amith\Kenpath\NOS\Work_safely_and_follow_reporting_procedures.pdf
Processing (15200/15295) files...
Processing (15295/15295) files...

Processed 15278 files successfully
Saved to: C:\\Users\\amith\\Kenpath\\parsed_nos.xlsx


For WEB Link

In [2]:
import pdfplumber
import pandas as pd
from pathlib import Path
import openpyxl
import re

def remove_illegal_characters(text):
    """
    Remove characters that are not allowed in Excel.
    """
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')
    return ILLEGAL_CHARACTERS_RE.sub('', text)


def clean_text(text):
    """
    Clean text by removing extra spaces and newlines
    """
    return ' '.join(text.split())

def find_section(text, section):
    """
    Find section considering possible line breaks and variations
    Returns the index of the section if found, -1 otherwise
    """
    # Handle "Key words" variations
    if section == "Key words":
        idx = text.find("Key words")
        if idx != -1:
            return idx
        idx = text.find("Keywords")
        if idx != -1:
            return idx
            
    # Handle "Additional Information" variations
    if section == "Additional Information":
        variations = [
            "Additional Information",
            "Behaviors",
            "Glossary",
            "Scope/range related to performance criteria",
            "Scope/range",
            "Skills",
            "Scope/range related to knowledge and understanding"
        ]
        # Find the earliest occurrence of any variation
        indices = []
        for variation in variations:
            idx = text.find(variation)
            if idx != -1:
                indices.append(idx)
        
        # Return the earliest index if any variation was found
        if indices:
            return min(indices)
    
    # Original section name
    idx = text.find(section)
    if idx != -1:
        return idx
    
    # Try with space instead of newline
    section_space = ' '.join(section.split())
    idx = text.find(section_space)
    if idx != -1:
        return idx
    
    # Try with parts of split sections
    parts = section.split()
    if len(parts) > 1:
        # Try finding first part followed by second part with possible line break
        first_part = parts[0]
        idx = text.find(first_part)
        if idx != -1:
            # Check if next part follows within reasonable distance
            next_text = text[idx:idx+len(section)+10]  # Add some buffer for line breaks
            next_text_clean = clean_text(next_text)
            if section_space in next_text_clean:
                return idx
    
    return -1

def extract_sections_from_pdf(pdf_path, sections):
    """
    Extract sections from PDF using a simple linear search approach.
    Handles multi-line section headers.
    """
    extracted_sections = {section: "" for section in sections}
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = " ".join([page.extract_text() for page in pdf.pages])
            text = clean_text(text)
            
            for i, section in enumerate(sections):
                start_idx = find_section(text, section)
                if start_idx != -1:
                    if section == "Key words" and text[start_idx:start_idx+8] == "Keywords":
                        content_start = start_idx + 8
                    else:
                        content_start = start_idx + len(section)
                    
                    content_end = len(text)
                    for next_section in sections[i+1:]:
                        next_idx = find_section(text, next_section)
                        if next_idx != -1:
                            content_end = next_idx
                            break
                    
                    content = text[content_start:content_end].strip()
                    extracted_sections[section] = remove_illegal_characters(content)

    
    except Exception as e:
        print(f"Error reading PDF: {str(e)} in file: {pdf_path}")  
        return None
    
    return extracted_sections

def process_pdfs_to_excel(pdf_folder, output_excel, sections, num_files=None):
    """
    Process PDFs in a folder and create an Excel file.
    """
    pdf_files = list(Path(pdf_folder).glob('*.pdf'))
    if num_files:
        pdf_files = pdf_files[:num_files]
    
    all_data = []
    total_files = len(pdf_files)
    
    print(f"Processing {total_files} files...")
    
    for i, pdf_file in enumerate(pdf_files, 1):
        if i % 400 == 0 or i == total_files:
            print(f"Processing ({i}/{total_files}) files...")
        
        sections_data = extract_sections_from_pdf(pdf_file, sections)
        
        if sections_data:
            sections_data['Filename'] = pdf_file.name
            all_data.append(sections_data)
    
    if not all_data:
        print("No data was successfully extracted from any PDF.")
        return
    
    df = pd.DataFrame(all_data)
    
    for col in ['Filename'] + sections:
        if col not in df.columns:
            df[col] = ""
    
    columns = ['Filename'] + sections
    df = df[columns]
    
    df.to_excel(output_excel, index=False)
    print(f"\nProcessed {len(all_data)} files successfully")
    print(f"Saved to: {output_excel}")
    return df

sections = [
    "Overview",
    "Performance criteria",
    "Knowledge and understanding",
    "Additional Information",
    "Developed by",
    "Key words"  
]

pdf_folder = r"C:\Users\amith\Kenpath\NOS from web"
output_excel = r"C:\\Users\\amith\\Kenpath\\parsed_nos(web).xlsx"
df=process_pdfs_to_excel(pdf_folder, output_excel, sections)


Processing 7360 files...
Processing (400/7360) files...
Processing (800/7360) files...
Processing (1200/7360) files...
Processing (1600/7360) files...
Processing (2000/7360) files...
Processing (2400/7360) files...
Processing (2800/7360) files...
Processing (3200/7360) files...
Processing (3600/7360) files...
Processing (4000/7360) files...
Processing (4400/7360) files...
Processing (4800/7360) files...
Processing (5200/7360) files...
Processing (5600/7360) files...
Processing (6000/7360) files...
Processing (6400/7360) files...
Processing (6800/7360) files...
Processing (7200/7360) files...
Processing (7360/7360) files...

Processed 7360 files successfully
Saved to: C:\\Users\\amith\\Kenpath\\parsed_nos(web).xlsx


In [6]:
import pandas as pd

# Load the first Excel file
df1 = pd.read_excel("C:\\Users\\amith\\Kenpath\\parsed_nos(pdf).xlsx")

# Load the second Excel file
df2 = pd.read_excel("C:\\Users\\amith\\Kenpath\\parsed_nos(web).xlsx")





In [7]:
df1

Unnamed: 0,Filename,Overview,Performance criteria,Knowledge and understanding,Additional Information,Developed by,Key words
0,2D_Drawing_and_planning_software_LEGACY.pdf,,,,,,
1,Abrasive_wheel_operations_in_the_wood_industry...,This standard covers the skills and knowledge ...,You must be able to: 1. carry out risk assessm...,You need to know and 1. relevant health and sa...,,NSAFD Version Number 2 Date Approved March 201...,Timber; wood; abrasive; wheel; saw; cut; sawmi...
2,"Accept,_and_return,_responsibility_for_the_con...",This unit is about accepting and returning res...,You must be able to: P1 work safely at all tim...,You need to know and K1 the relevant health an...,"Glossary Health and safety legislation, regula...",People 1st Version number 1 Date approved Apri...,Responsibility; control; handover PPLRETRS17 A...
3,"Accept,_verify_and_store_materials_required_fo...",This unit is for those with responsibility for...,You must be able to: P1 work safely at all tim...,You need to know and K1 how to access and inte...,Glossary Key terms Examples Regulations and gu...,Cogent Version number 1 Date approved January ...,"accept, verify, store, materials, process, ope..."
4,"Acceptance_test_products,_equipment,_medical_d...",This standard relates to the critical examinat...,You must be able to: P1 work within your level...,You need to know and K1 your own level of comp...,External Links This standard links with the fo...,Skills for Health Version number 1 Date approv...,"calibrate, evaluate, procurement SFHCHS202 Acc..."
...,...,...,...,...,...,...,...
15273,Write_technical_content_for_inclusion_in_nucle...,This unit is about activity in the production ...,You must be able to: P1 confirm the technical ...,You need to know and K1 communication and pres...,,Cogent Version number 1 Date approved October ...,"write, technical content, safety case, data, p..."
15274,Write_text-based_material_for_multi-platform_u...,This standard is about writing text-based mate...,You must be able to: 1. write in a style that ...,You need to know and 1. the conventions and re...,,Creative Skillset Version Number 2 Date Approv...,"Radio, Audio, Multiplatform, Content, Style, A..."
15275,Write_varied_text_on_a_range_of_work_topics.pdf,You can write text (1-3 pages) for work and so...,You must be able to: P1 write formal and infor...,,Scope/range You can do this: 1. writing clearl...,Skills CFA Version number 1 Date approved Janu...,literacy; language; work; understand; communic...
15276,Writing_a_design_brief.pdf,This unit is about writing a design brief and ...,You must be able to: P1 define the profile and...,You need to know and K1 what your organisation...,,Creative and Cultural Skills Version number 1 ...,"design, brie, designer, business, objectives, ..."


In [8]:
df2

Unnamed: 0,Filename,Overview,Performance criteria,Knowledge and understanding,Additional Information,Developed by,Key words
0,Access_information_with_and_for_young_people_f...,This standard is about supporting young people...,You must be able to: 1. establish and assess t...,You need to know and understand: 1. why it is ...,,CLD Standards Council Scotland Version Number ...,Youth; young people; information; decisions; s...
1,Access_overhead_line_equipment_construction_si...,This standard identifies the competences you n...,You must be able to: P1 maintain safe working ...,You need to know and understand: K1 the releva...,performance criteria 1. Types of site access r...,NSAR Version Number 2 Date Approved 30 Apr 202...,Rail engineering; safety; access; overhead lin...
2,Access_trees_by_climbing_to_carry_out_arboricu...,This standard is about accessing trees by clim...,You must be able to: 1. perform a hazard evalu...,You need to know and understand: 1. how to ide...,,Lantra Version Number 1 Date Approved 28 Feb 2...,Tree; climb LANTw77L Access trees by climbing ...
3,Access_trees_by_climbing_to_carry_out_treework...,This standard covers all the activities that m...,You must be able to: 1. obtain the relevant in...,You need to know and understand: 1. how to ide...,equired to carry out work activities could inc...,Lantra Version Number 2 Date Approved 30 Apr 2...,trees; climb LANTw23 Access trees by climbing ...
4,"Acquire,_store_and_issue_resources_to_provide_...",As this standard applies across a range of wor...,You must be able to: Monitor and acquire resou...,You need to know and understand: Health and Sa...,ions are provided to explain how key words and...,Skills for Justice Version Number 3 Date Appro...,"Obtain, purchase, stock, monitor, supply, orde..."
...,...,...,...,...,...,...,...
7355,_Form_rolled_products_in_food_operations_.pdf,This standard is about the skills and knowledg...,You must be able to: 1. operate to the legal o...,You need to know and understand: 1. the potent...,,Improve Version Number 4 Date Approved 30 Mar ...,Food; drink; manufacturing; operations; rolled...
7356,_Improve_Environmental_Performance_in_Support_...,This standard describes the skill and knowledg...,You must be able to: P1 confirm that the resou...,You need to know and understand: K1 the reason...,wner/Manager; Environmental Manager; Environme...,Cogent Version Number 2 Date Approved 30 Mar 2...,environmental; performance; sustainable; busin...
7357,"_Pin,_block_and_shape_dough_in_bakery_operatio...",This standard covers the skills and knowledge ...,You must be able to: 1. check the available do...,You need to know and understand: 1. the standa...,,Improve Version Number 4 Date Approved 30 Mar ...,Dough; Pin; Block; Shape; Craft; Bakery; Baker...
7358,_Plan_the_transportation_of_goods_in_the_suppl...,This standard is about planning the transporta...,You must be able to: 1. identify the current t...,You need to know and understand: 1. your organ...,"permanent, temporary, agency staff, external •...",Lantra Version Number 2 Date Approved 30 Mar 2...,plan; transportation; goods; supply chain SFLS...


In [14]:
# Merge the two DataFrames row-wise
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save to a new Excel file
merged_df.to_excel("parsed_nos(pdf+web).xlsx", index=False)

print("Merge completed successfully!")


Merge completed successfully!


In [20]:

merged_df.columns = merged_df.columns.str.lower().str.replace(" ", "_")

# Rename specific column "file_name" to "nos_title"
merged_df.rename(columns={"filename": "nos_title"}, inplace=True)

# Save the updated DataFrame to a new Excel file
merged_df.to_excel("C:\\Users\\amith\\Kenpath\\parsed_nos(pdf+web).xlsx", index=False)

print("Column names updated successfully!")


Column names updated successfully!


In [23]:
import pandas as pd
import re

def standardize_title(title, remove_pdf=True):
    """
    Standardize title format by:
    1. Removing special characters
    2. Replacing spaces with underscores
    3. Optionally removing .pdf extension
    """
    # Remove special characters and replace spaces with underscores
    standardized = re.sub(r'[\\/*?:"<>|]', "", title).replace(" ", "_")
    
    # Remove .pdf extension if present and requested
    if remove_pdf and standardized.endswith('.pdf'):
        standardized = standardized[:-4]
        
    return standardized

def convert_to_space_format(title):
    """
    Convert underscore title to space format and remove .pdf
    """
    # Remove .pdf if present
    if title.endswith('.pdf'):
        title = title[:-4]
    # Replace underscores with spaces
    return title.replace("_", " ")

def merge_excel_files(file1_path, file2_path, output_path):
    """
    Merge two Excel files based on nos_title column with format matching
    """
    # Read both Excel files
    df1 = pd.read_excel(file1_path)
    df2 = pd.read_excel(file2_path)
    
    # Create standardized title columns for matching
    df1['standardized_title'] = df1['nos_title'].apply(lambda x: standardize_title(x, remove_pdf=False))
    df2['standardized_title'] = df2['nos_title'].apply(lambda x: standardize_title(x, remove_pdf=True))
    
    # Store the space-formatted titles from df1
    title_mapping = dict(zip(df1['standardized_title'], df1['nos_title']))
    
    # For any titles only in df2, convert them to space format
    df2_only_titles = set(df2['standardized_title']) - set(df1['standardized_title'])
    for title in df2_only_titles:
        original_title = df2.loc[df2['standardized_title'] == title, 'nos_title'].iloc[0]
        title_mapping[title] = convert_to_space_format(original_title)
    
    # Merge dataframes on standardized title
    merged_df = pd.merge(
        df1, 
        df2,
        left_on='standardized_title',
        right_on='standardized_title',
        how='outer',
        suffixes=('_1', '_2')
    )
    
    # Create single nos_title column using the mapping
    merged_df['nos_title'] = merged_df['standardized_title'].map(title_mapping)
    
    # Drop temporary and duplicate columns
    columns_to_drop = ['standardized_title', 'nos_title_1', 'nos_title_2']
    merged_df = merged_df.drop([col for col in columns_to_drop if col in merged_df.columns], axis=1)
    
    # Save to new Excel file
    merged_df.to_excel(output_path, index=False)
    
    # Print merge statistics
    print(f"File 1 rows: {len(df1)}")
    print(f"File 2 rows: {len(df2)}")
    print(f"Merged rows: {len(merged_df)}")
    
    return merged_df

# Usage example:
file1_path = "C:\\Users\\amith\\Kenpath\\latest_nos_data.xlsx"  # File with "Abrasive wheel operations in the wood industry" format
file2_path = "C:\\Users\\amith\\Kenpath\\parsed_nos(pdf+web).xlsx" # File with "Abrasive_wheel_operations_in_the_wood_industry.pdf" format
output_path = "C:\\Users\\amith\\Kenpath\\complete_nos.xlsx"
merged_df = merge_excel_files(file1_path, file2_path, output_path)

File 1 rows: 23976
File 2 rows: 22638
Merged rows: 24660


In [24]:
merged_df

Unnamed: 0,urn,suites,occupations,soc,developed_by_1,approved_on,web_link,pdf_link,overview,performance_criteria,knowledge_and_understanding,additional_information,developed_by_2,key_words,nos_title
0,ESKIDPS1L,IT Users 6.2,"Application Support, ICT for Users, Informatio...",3132,e-skills,15/03/2015 00:00,,https://files.ukstandards.org.uk/pdfs/ESKIDPS1...,,,,,,,2D Drawing and planning software LEGACY
1,ESKIDPS2L,IT Users 6.2,"Application Support, ICT for Users, Informatio...",3132,e-skills,15/03/2015 00:00,,https://files.ukstandards.org.uk/pdfs/ESKIDPS2...,,,,,,,2D Drawing and planning software LEGACY
2,ESKIDPS3L,IT Users 6.2,"Application Support, ICT for Users, Informatio...",3132,e-skills,15/03/2015 00:00,,https://files.ukstandards.org.uk/pdfs/ESKIDPS3...,,,,,,,2D Drawing and planning software LEGACY
3,PROSM09,Sawmilling,Paper and Wood Machine Operatives,8131,NSAFD v2,01/04/2020 00:00,,https://files.ukstandards.org.uk/pdfs/PROSM09.pdf,This standard covers the skills and knowledge ...,You must be able to: 1. carry out risk assessm...,You need to know and 1. relevant health and sa...,,NSAFD Version Number 2 Date Approved March 201...,Timber; wood; abrasive; wheel; saw; cut; sawmi...,Abrasive wheel operations in the wood industry
4,PPLRETRS17,Rail Engineering,Transport Drivers and Operatives,8153,Enginuity,15/03/2015 00:00,,https://files.ukstandards.org.uk/pdfs/PPLRETRS...,This unit is about accepting and returning res...,You must be able to: P1 work safely at all tim...,You need to know and K1 the relevant health an...,"Glossary Health and safety legislation, regula...",People 1st Version number 1 Date approved Apri...,Responsibility; control; handover PPLRETRS17 A...,"Accept, and return, responsibility for the con..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24655,COSVR414,Accessing Operations and Rigging (Construction),"Scaffolders, Stagers and Riggers",8151,CITB,08/05/2022 00:00,https://www.ukstandards.org.uk/en/nos-finder/C...,,"""This standard is about preparing, installing,...",You must be able to: P1 interpret the given in...,You need to know and understand: Performance C...,performance criteria Performance Criteria 1 1 ...,CITB Version Number 4 Date Approved 08 May 202...,Chimneys; Ducting; Sheet-metal cladding; Rivet...,install sheet metal cladding to chimneys and d...
24656,IMILVBP18,"Accident Repair - Joining, Maintenance and Rep...",Vehicle Trades,5232,IMI,15/03/2015 00:00,,https://files.ukstandards.org.uk/pdfs/IMILVBP1...,This NOS is about the straightforward removal ...,You must be able to: P1 use the appropriate pe...,You need to know and Legislative and organisat...,Scope/range 1. Basic MET components includes: ...,IMI Ltd Version number 1 Date approved January...,Basic Motor Mechanical Electrical Trim MET Com...,"remove and fit basic motor Mechanical, Electri..."
24657,ASTBSC1,Specialist Cleaning,Specialist Cleaning Occupations,9223,Asset Skills,15/03/2015 00:00,,https://files.ukstandards.org.uk/pdfs/ASTBSC1.pdf,This standard is about cleaning biohazard scen...,You must be able to: P1 assess the risks assoc...,You need to know and K1 how to assess risks at...,,Asset Skills Version number 1.0 Date approved ...,"biohazard scene, crime scene, pathogens, body ...",work at and clean a biohazard scene
24658,ASTH413L,Housing,"Health and Social Care, Health, Public Service...",1251,Asset Skills,15/01/2017 00:00,,https://files.ukstandards.org.uk/pdfs/ASTH413.pdf,This standard is about enabling the availabili...,You must be able to: P1 develop and manage sys...,You need to know and K1 the reasons for securi...,,Asset Skills Version number 2 Date approved Ja...,Availability; additional; homes; landlords AST...,work with the private sector to increase the a...


In [25]:
merged_df.isnull().sum()


urn                                1
suites                             1
occupations                        1
soc                              575
developed_by_1                     1
approved_on                       13
web_link                       16539
pdf_link                        8008
overview                         661
performance_criteria            3362
knowledge_and_understanding     3337
additional_information         12483
developed_by_2                   656
key_words                        690
nos_title                          0
dtype: int64