In [1]:

import json
import re

import pandas as pd

# Load the project metadata table (CSV) and the companion JSON file.
csv_file_path = 'metadata_v2.csv'
json_file_path = 'uniformat_in_builder_reduced.json'
data = pd.read_csv(csv_file_path)
# Pin the encoding: without it open() falls back to the platform default
# (e.g. cp1252 on Windows), which would mis-decode non-ASCII text in the JSON.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    codes_data = json.load(json_file)



In [44]:
# Maps an upper-case category description to its element code.
# (Presumably UNIFORMAT-style codes, judging by the companion JSON file name — confirm.)
# The matcher functions in this notebook iterate this dict in insertion order.
#
# NOTE(review): "ELECTRIC HEATING" is listed twice (codes D305005 and D509005).
# A Python dict literal keeps only the LAST value for a repeated key, so the
# resulting mapping is "ELECTRIC HEATING" -> "D509005" and code D305005 is
# silently absent. The two entries need distinct keys if both codes are wanted.
reduced_categories_with_codes = {
    "DOMESTIC WATER EQUIPMENT": "D202003",
    "COMPRESSED AIR SYSTEM (NON-BREATHING)": "D209005",
    "OTHER SPECIAL PLUMBING SYSTEMS": "D209090",
    "OIL SUPPLY SYSTEM": "D301001",
    "GAS SUPPLY SYSTEM": "D301002",
    "STEAM SUPPLY SYSTEM (FROM CENTRAL PLANT)": "D301003",
    "SOLAR ENERGY SUPPLY SYSTEMS": "D301005",
    "WIND ENERGY SUPPLY SYSTEM": "D301006",
    "OTHER ENERGY SUPPLY": "D301090",
    "BOILERS": "D302001",
    "FURNACES": "D302002",
    "FUEL-FIRED UNIT HEATERS": "D302003",
    "AUXILIARY EQUIPMENT": "D302004",
    "OTHER HEAT GENERATING SYSTEMS": "D302090",
    "CHILLED WATER SYSTEMS": "D303001",
    "DIRECT EXPANSION SYSTEMS": "D303002",
    "AIR DISTRIBUTION, HEATING & COOLING": "D304001",
    "STEAM DISTRIBUTION SYSTEMS": "D304002",
    "HOT WATER DISTRIBUTION SYSTEMS": "D304003",
    "GLYCOL DISTRIBUTION SYSTEMS": "D304005",
    "CHILLED WATER DISTRIBUTION SYSTEMS": "D304006",
    "EXHAUST SYSTEMS": "D304007",
    "AIR HANDLING UNITS": "D304008",
    "OTHER DISTRIBUTION SYSTEMS": "D304090",
    "UNIT VENTILATORS": "D305001",
    "UNIT HEATERS": "D305002",
    "FAN COIL UNITS": "D305003",
    "FIN TUBE RADIATION": "D305004",
    "ELECTRIC HEATING": "D305005",  # NOTE(review): overwritten by the duplicate key further down
    "PACKAGE UNITS": "D305006",
    "OTHER TERMINAL & PACKAGE UNITS": "D305090",
    "HVAC CONTROLS": "D306001",
    "ELECTRONIC CONTROLS": "D306002",
    "PNEUMATIC CONTROLS": "D306003",
    "INSTRUMENT AIR COMPRESSORS": "D306004",
    "GAS PURGING SYSTEMS": "D306005",
    "OTHER CONTROLS INSTRUMENTATION": "D306090",
    "REFRIGERATION SYSTEMS": "D309002",
    "LIGHTING EQUIPMENT": "D502002",
    "EMERGENCY LIGHTING & POWER": "D509002",
    "LIGHTNING PROTECTION": "D509004",
    "ELECTRIC HEATING": "D509005",  # NOTE(review): duplicate key — this value wins over D305005
    "ENERGY MANAGEMENT CONTROL SYSTEM": "D509006"
}

In [41]:
def find_matching_categories(data, categories):
    """
    Group projects under every category that one of their keywords matches.

    A keyword matches a category when, after lower-casing and splitting on
    whitespace, the two word sets are equal or share at least two words.
    Words are whatever whitespace splitting yields, so a token such as "&"
    counts as a word.

    Parameters:
    - data: DataFrame with 'Title', 'Keywords' and 'Canonical Link' columns.
      'Keywords' is a single string with keywords separated by ', '; rows
      where it is missing are skipped.
    - categories: Dictionary mapping category descriptions to their codes.

    Returns:
    - List of dictionaries {"description", "code", "titles"}, one per matched
      code, in the order each code was first matched. "titles" holds
      {"Title", "URL"} dictionaries, one per matching row; a project can
      appear under several categories.
    """
    # Helper function to check if a keyword's word set matches a category's
    def keyword_matches_category(keyword_parts, category_parts):
        return keyword_parts == category_parts or len(keyword_parts & category_parts) >= 2

    # Tokenise every category once instead of once per (row, keyword) pair.
    tokenised_categories = [
        (category, code, set(category.lower().split()))
        for category, code in categories.items()
    ]

    output = []
    # code -> entry already placed in `output`; replaces a linear scan of
    # `output` on every match while keeping the same first-seen ordering.
    entries_by_code = {}

    # Iterate through each project in the dataframe
    for _, row in data.iterrows():
        if pd.isna(row['Keywords']):
            continue  # Skip rows without keywords
        keyword_sets = [set(keyword.lower().split()) for keyword in row['Keywords'].split(', ')]
        project = {"Title": row['Title'], "URL": row['Canonical Link']}

        # A project is added to a category at most once per row, no matter
        # how many of its keywords match that category.
        for category, code, category_parts in tokenised_categories:
            if not any(keyword_matches_category(parts, category_parts) for parts in keyword_sets):
                continue
            entry = entries_by_code.get(code)
            if entry is None:
                # First match for this code: its description is the first
                # category seen with that code.
                entry = {"description": category, "code": code, "titles": []}
                entries_by_code[code] = entry
                output.append(entry)
            entry["titles"].append(project)

    return output

# Run the keyword-based matcher over every project row, using the reduced category list
matched_categories = find_matching_categories(data, reduced_categories_with_codes)

# Preview of the first two matched categories (currently disabled)
#matched_categories[:2]  # Let's display a subset for brevity


In [46]:
# Alias, not a copy: both names refer to the same list object, so mutating
# one is visible through the other.
matched_categories_keyword = matched_categories
# Bare last expression so the notebook renders the complete result list.
matched_categories_keyword

[{'description': 'ENERGY MANAGEMENT CONTROL SYSTEM',
  'code': 'D509006',
  'titles': [{'Title': 'Wireless Platform for Energy-Efficient Building Control Retrofits',
    'URL': 'https://serdp-estcp.mil/projects/details/1130b09f-f864-4478-aa85-55dc0636f204'},
   {'Title': 'Scalable Deployment of Advanced Building Energy Management Systems',
    'URL': 'https://serdp-estcp.mil/projects/details/d8445d89-4c7e-4406-b50a-aac5df1e1a37'},
   {'Title': 'Energy Performance Monitoring and Optimization System for DoD Campuses',
    'URL': 'https://serdp-estcp.mil/projects/details/625a70d1-a751-4889-b41c-997191a79f41'},
   {'Title': 'Advanced Micro-Grid Energy Management Coupled with Integrated Volt/VAR Control for Improved Energy Efficiency, Energy Security, and Power Quality at DoD Installations',
    'URL': 'https://serdp-estcp.mil/projects/details/2d5092f7-9191-4503-affe-8f5c80d21cc6'},
   {'Title': 'Advanced Micro-Grid Energy Management Coupled with Integrated Volt/VAR Control for Improved Ene

In [52]:
def find_strict_consecutive_word_matches_description(data, categories):
    """
    Group each project under the first category whose name appears verbatim
    in the project description.

    A category name is lower-cased and split on ", " into phrases (so
    "AIR DISTRIBUTION, HEATING & COOLING" yields "air distribution" and
    "heating & cooling"). A description matches the category when any one of
    those phrases occurs in the lower-cased description as a whole phrase,
    i.e. not embedded in a longer word. Adjacent punctuation is allowed, so
    "boilers," and "system." match. Single-word categories match on that one
    word.

    Parameters:
    - data: DataFrame with 'Title', 'Description' and 'Canonical Link'
      columns. Rows with a missing 'Description' are skipped.
    - categories: Dictionary mapping category descriptions to their codes.

    Returns:
    - List of dictionaries {"description", "code", "titles"}, one per matched
      code, in the order each code was first matched. Each project is placed
      under at most one category: the first match in `categories` order.
    """
    # Compile the phrase patterns once per category rather than once per row.
    # The lookarounds demand a non-word character (or the string edge) on both
    # sides; a plain space-padded substring test would miss phrases that are
    # followed by punctuation.
    compiled_categories = []
    for category, code in categories.items():
        phrase_patterns = [
            re.compile(r"(?<!\w)" + re.escape(phrase) + r"(?!\w)")
            for phrase in category.lower().split(", ")
            if phrase.strip()  # ignore empty fragments left by a stray ", "
        ]
        compiled_categories.append((category, code, phrase_patterns))

    # Helper function: does any phrase of the category occur in the description?
    def has_exact_consecutive_match(description, phrase_patterns):
        description_lower = description.lower()
        return any(pattern.search(description_lower) for pattern in phrase_patterns)

    output = []
    # code -> entry already placed in `output` (avoids rescanning the list).
    entries_by_code = {}

    # Iterate through each project in the dataframe
    for _, row in data.iterrows():
        if pd.isna(row['Description']):
            continue  # Skip rows without descriptions
        project = {"Title": row['Title'], "URL": row['Canonical Link']}

        for category, code, phrase_patterns in compiled_categories:
            if not has_exact_consecutive_match(row['Description'], phrase_patterns):
                continue
            entry = entries_by_code.get(code)
            if entry is None:
                entry = {"description": category, "code": code, "titles": []}
                entries_by_code[code] = entry
                output.append(entry)
            entry["titles"].append(project)
            break  # Stop checking other categories once a match is found

    return output

# Run the strict description-based matcher over every project row, using the reduced category list
strict_matched_categories = find_strict_consecutive_word_matches_description(data, reduced_categories_with_codes)

# Display the first two matched categories for verification
strict_matched_categories[:2]  # slice keeps the rendered output short


[{'description': 'FURNACES',
  'code': 'D302002',
  'titles': [{'Title': 'A Quantum Chemical - Machine Learning Approach for the Prediction of Thermal PFAS Destruction',
    'URL': 'https://serdp-estcp.mil/projects/details/f7aa8a9a-e8e2-46cc-ae24-e7cef899ffd5'}]},
 {'description': 'ENERGY MANAGEMENT CONTROL SYSTEM',
  'code': 'D509006',
  'titles': [{'Title': 'Converged Energy Management Control System',
    'URL': 'https://serdp-estcp.mil/projects/details/8231ba53-0c6d-4a86-81ff-2f9fc1bb270b'}]}]

In [53]:
def find_flexible_consecutive_word_matches_description(data, categories):
    """
    Group each project under the first category whose leading two words
    appear consecutively in the project description.

    The category name is lower-cased and split on whitespace, and the
    characters , . ; : are stripped from the edges of each word. This keeps
    the comma in "AIR DISTRIBUTION, HEATING & COOLING" out of the search
    phrase "air distribution". The first two words (or the only word, for
    one-word categories) must then occur in the lower-cased description as
    whole words separated by whitespace. Substrings of longer words do not
    count, so "another heat" does not match "other heat".

    Because only the first two words are compared and a project stops at its
    first match, categories that share their first two words are
    indistinguishable: only the earliest one in `categories` order can ever
    receive projects.

    Parameters:
    - data: DataFrame with 'Title', 'Description' and 'Canonical Link'
      columns. Rows with a missing 'Description' are skipped.
    - categories: Dictionary mapping category descriptions to their codes.

    Returns:
    - List of dictionaries {"description", "code", "titles"}, one per matched
      code, in the order each code was first matched.
    """
    # Helper function: build the whole-word pattern for a category's leading words
    def leading_words_pattern(category):
        words = [word.strip(",.;:") for word in category.lower().split()]
        words = [word for word in words if word]
        if not words:
            return None  # nothing searchable; such a category never matches
        leading = r"\s+".join(re.escape(word) for word in words[:2])
        return re.compile(r"(?<!\w)" + leading + r"(?!\w)")

    # Build each category's pattern once instead of once per row.
    compiled_categories = [
        (category, code, leading_words_pattern(category))
        for category, code in categories.items()
    ]

    # Helper function to test a description against a prepared pattern
    def has_flexible_consecutive_match(description, pattern):
        return pattern is not None and pattern.search(description.lower()) is not None

    output = []
    # code -> entry already placed in `output` (avoids rescanning the list).
    entries_by_code = {}

    # Iterate through each project in the dataframe
    for _, row in data.iterrows():
        if pd.isna(row['Description']):
            continue  # Skip rows without descriptions
        project = {"Title": row['Title'], "URL": row['Canonical Link']}

        for category, code, pattern in compiled_categories:
            if not has_flexible_consecutive_match(row['Description'], pattern):
                continue
            entry = entries_by_code.get(code)
            if entry is None:
                entry = {"description": category, "code": code, "titles": []}
                entries_by_code[code] = entry
                output.append(entry)
            entry["titles"].append(project)
            break  # Stop checking other categories once a match is found

    return output

# Run the flexible (first-two-words) description matcher over every project row, using the reduced category list
flexible_matched_categories = find_flexible_consecutive_word_matches_description(data, reduced_categories_with_codes)

# Display the first two matched categories for verification
flexible_matched_categories[:2]  # slice keeps the rendered output short


[{'description': 'OTHER HEAT GENERATING SYSTEMS',
  'code': 'D302090',
  'titles': [{'Title': 'In Situ Thermal Remediation of DNAPL Source Zones',
    'URL': 'https://serdp-estcp.mil/projects/details/62d59c56-e1ca-4a69-a2a0-5cc85a662cea'},
   {'Title': 'In Situ Thermal Remediation of DNAPL Source Zones',
    'URL': 'https://serdp-estcp.mil/projects/details/62d59c56-e1ca-4a69-a2a0-5cc85a662cea'}]},
 {'description': 'SOLAR ENERGY SUPPLY SYSTEMS',
  'code': 'D301005',
  'titles': [{'Title': 'A Self-Sustaining Solar-Bio-Nano-Based Wastewater Treatment System for Forward Operating Bases',
    'URL': 'https://serdp-estcp.mil/projects/details/5b49b555-0a81-4033-a040-8de2d1caf42a'},
   {'Title': 'Low Energy Baffled Bioreactor-Based Water Re-use System for Energy and Water Security',
    'URL': 'https://serdp-estcp.mil/projects/details/67914324-01cd-4687-8399-e3f4e3987a79'},
   {'Title': 'A Liquid-Desiccant Outdoor Air Conditioner for Low-Electricity Humidity Control',
    'URL': 'https://serdp

In [54]:
# Bare expression: render the complete flexible-match result list in the notebook.
flexible_matched_categories

[{'description': 'OTHER HEAT GENERATING SYSTEMS',
  'code': 'D302090',
  'titles': [{'Title': 'In Situ Thermal Remediation of DNAPL Source Zones',
    'URL': 'https://serdp-estcp.mil/projects/details/62d59c56-e1ca-4a69-a2a0-5cc85a662cea'},
   {'Title': 'In Situ Thermal Remediation of DNAPL Source Zones',
    'URL': 'https://serdp-estcp.mil/projects/details/62d59c56-e1ca-4a69-a2a0-5cc85a662cea'}]},
 {'description': 'SOLAR ENERGY SUPPLY SYSTEMS',
  'code': 'D301005',
  'titles': [{'Title': 'A Self-Sustaining Solar-Bio-Nano-Based Wastewater Treatment System for Forward Operating Bases',
    'URL': 'https://serdp-estcp.mil/projects/details/5b49b555-0a81-4033-a040-8de2d1caf42a'},
   {'Title': 'Low Energy Baffled Bioreactor-Based Water Re-use System for Energy and Water Security',
    'URL': 'https://serdp-estcp.mil/projects/details/67914324-01cd-4687-8399-e3f4e3987a79'},
   {'Title': 'A Liquid-Desiccant Outdoor Air Conditioner for Low-Electricity Humidity Control',
    'URL': 'https://serdp