In [None]:
import json
import re
import pandas as pd

# Load the JSON data
with open("/home/ahmedabdullahi/NLP590/NLPJobsFinder/backend/data/companies.json", "r") as f:
    companies = json.load(f)

# Define remote keywords and a function to clean up location strings
remote_keywords = ["Remote", "Virtual", "Work from Home", "Telework", "Hybrid"]

def clean_location(location):
    """Cleans location strings for consistency in tagging."""
    if location:
        return location.replace(",", " , ").replace("/", " / ").strip()
    return ""

# Helper function to annotate entities

def annotate_entities(text, location):
    """
    Annotates text with BIO tags for CITY, COUNTRY, and REMOTE.
    Handles multi-word location names and excludes punctuation.
    """
    annotations = []
    location_parts = re.findall(r'\b\w+\b', location) if location else []  # Extract only words
    location_idx = 0  # Tracks position in the location_parts

    words = re.findall(r'\b\w+\b|[,./]', text)  # Split text into words and punctuation
    for word in words:
        if location_idx < len(location_parts) and word == location_parts[location_idx]:
            # Determine whether it's the start (B-) or continuation (I-)
            if location_idx == 0 or annotations[-1][1] == "O":
                # Start of a new entity
                if word.lower() in ["remote", "virtual", "telework", "work"]:
                    annotations.append((word, "B-REMOTE"))
                elif word.isupper():
                    annotations.append((word, "B-COUNTRY"))
                else:
                    annotations.append((word, "B-CITY"))
            else:
                # Continuation of the current entity
                if word.lower() in ["remote", "virtual", "telework", "work"]:
                    annotations.append((word, "I-REMOTE"))
                elif word.isupper():
                    annotations.append((word, "I-COUNTRY"))
                else:
                    annotations.append((word, "I-CITY"))
            location_idx += 1
        else:
            # Non-entity words or punctuation
            if re.match(r'\w+', word):  # Only tag non-punctuation as "O"
                annotations.append((word, "O"))
            else:
                annotations.append((word, "O"))  # Punctuation as "O"
            location_idx = 0  # Reset if there's a mismatch

    return annotations

# Prepare training data
training_data = []

for company in companies:
    location = clean_location(company.get("location", ""))
    description = company.get("description", "")
    combined_text = f"The job is located at {location}. {description}"
    
    # Annotate the combined text
    annotated_sentence = annotate_entities(combined_text, location)
    training_data.append(annotated_sentence)

# Convert to BIO format and save
bio_data = []
for sentence in training_data:
    for word, label in sentence:
        bio_data.append(f"{word}\t{label}")
    bio_data.append("")  # Blank line for sentence separation

# Save to a text file
with open("ner_training_data.txt", "w") as f:
    f.write("\n".join(bio_data))

print("NER training data has been saved to ner_training_data.txt")


In [1]:
import json
import re

def generate_ner_tags(entry):
    text = entry["location"] if "location" in entry else ""
    description = entry["description"] if "description" in entry else ""
    tokens, tags = [], []

    # Tokenize text
    words = re.split(r"(\W)", text)  # Split by non-word characters
    for word in words:
        if word.strip():
            tokens.append(word.strip())
            tags.append("O")  # Default to 'O'
    
    # Add tags for locations
    for loc in re.findall(r"[A-Za-z]+(?:[,\s/][A-Za-z]+)*", text):
        for part in loc.split():
            if part in tokens:
                tag = "B-CITY" if tags[tokens.index(part)] == "O" else "I-CITY"
                tags[tokens.index(part)] = tag
    
    # Add tags for 'Remote'
    if "Remote" in text or "Remote" in description:
        tokens.append("Remote")
        tags.append("B-REMOTE")
    
    # Tokenize and append description
    for word in re.split(r"(\W)", description):
        if word.strip():
            tokens.append(word.strip())
            tags.append("O")
    
    return tokens, tags

with open("/home/ahmedabdullahi/NLP590/NLPJobsFinder/Data/combined_location_description.json") as f:
    data = json.load(f)

# Generate training data
training_data = []
for entry in data:
    tokens, tags = generate_ner_tags(entry)
    training_data.append((tokens, tags))

# Save the training data
with open("ner_training_data1.txt", "w") as f:
    for tokens, tags in training_data:
        for token, tag in zip(tokens, tags):
            f.write(f"{token}\t{tag}\n")
        f.write("\n")


TypeError: string indices must be integers