In [30]:
import os
import pandas as pd
import datetime
import re
from dateutil import parser

In [35]:
# Load the JSON-lines export: one JSON record per line of "result.jsonl".
file = pd.read_json("result.jsonl", lines=True)

# Keep only the number of entries in each "candidates" value, not the entries.
file["candidates"] = file["candidates"].apply(len)

# Lowercase every free-text column so later substring matching is
# case-insensitive.
for column in ("job_title", "date_time", "location", "announcement"):
    file[column] = file[column].str.lower()

def normalize_job(title):
    """Reduce a raw job title to a short, comparable form.

    The title is lowercased, a trailing "- <location>" suffix is removed,
    the "et-sponsored" / "et" prefix, seniority words (trainee, jr, junior,
    assistant), stray single letters and the word "applicant(s)" are
    dropped, and punctuation and whitespace are collapsed.

    Args:
        title: The raw job title.

    Returns:
        The normalized title (possibly an empty string), or None when
        ``title`` is not a string.
    """
    if not isinstance(title, str):
        return None

    t = title.lower()

    # Remove a trailing location such as " - jigjiga" or "- hawassa"
    # (location already has its own column).  The hyphen must have
    # whitespace on at least one side; otherwise the hyphen inside a
    # compound such as "et-sponsored" or "co-pilot" would be treated as the
    # separator and the rest of the title would be thrown away.
    t = re.sub(r"(?:\s+-\s*|-\s+)[a-z ]+$", "", t)

    # Remove the sponsorship prefix ("et-sponsored", "et sponsored",
    # "etsponsored") and any standalone "et" that remains.
    t = re.sub(r"\bet[- ]?sponsored\b", "", t)
    t = re.sub(r"\bet\b", "", t)

    # Remove seniority qualifiers.
    t = re.sub(r"\b(?:trainee|jr|junior|assistant)\b", "", t)

    # Remove stray single letters (e.g. the "i" in "fire and rescue i" or
    # the "s" left over from a possessive), but keep "a" and "c".
    # NOTE(review): why "a" and "c" are kept is not visible here - confirm.
    t = re.sub(r"\b(?![ac]\b)[a-z]\b", "", t)

    # Remove applicant language.
    t = re.sub(r"\bapplicant[s]?\b", "", t)

    # Replace remaining punctuation with spaces, then collapse whitespace.
    t = re.sub(r"[^a-z0-9/& ]+", " ", t)
    t = re.sub(r"\s+", " ", t).strip()

    return t

# Replace every raw job title with its normalized form; normalize_job
# returns None for values that are not strings.
file["job_title"] = file["job_title"].apply(normalize_job)

def normalize_announcement(text):
    """Map an announcement text to a coarse category.

    Matching is a case-insensitive substring test and the first match wins,
    in this order: "interview", "written", "employment".

    Args:
        text: The announcement text.

    Returns:
        "interview", "written exam" or "employment process" when the
        corresponding keyword occurs in ``text``; otherwise ``text``
        unchanged.  Non-string values (e.g. None or NaN for a missing cell)
        are also returned unchanged instead of raising AttributeError.
    """
    # Missing cells arrive as None/NaN, which have no .lower().
    if not isinstance(text, str):
        return text

    text_lower = text.lower()
    if "interview" in text_lower:
        return "interview"
    if "written" in text_lower:
        return "written exam"
    if "employment" in text_lower:
        return "employment process"
    return text

# Collapse each announcement text into one of the categories recognized by
# normalize_announcement; texts with no recognized keyword are kept as-is.
file["announcement"]= file["announcement"].apply(normalize_announcement)

# Ordered (keyword, canonical label) pairs.  The first keyword found in the
# text wins, so the order of this table is significant.  Several keywords
# deliberately map to a label that differs from the keyword itself
# (e.g. "haramaya" -> "harar", "madda"/"goba" -> "robe").
# NOTE(review): "nekemete" may be a misspelling of "nekemte" - confirm
# against the spelling that actually appears in the data.
_LOCATION_KEYWORDS = (
    ("mekelle", "mekelle"),
    ("wollo", "wollo"),
    ("gondar", "gonder"),
    ("gonder", "gonder"),
    ("haramaya", "harar"),
    ("jigjiga", "jigjiga"),
    ("hawassa", "hawassa"),
    ("nekemete", "wollega"),
    ("madda", "robe"),
    ("semera", "semera"),
    ("arbaminch", "arbaminch"),
    ("bahir", "bahir dar"),
    ("dire", "dire dawa"),
    ("gambella", "gambella"),
    ("shashemene", "shashemene"),
    ("addis", "addis ababa"),
    ("adama", "adama"),
    ("wolkite", "wolkite"),
    ("ambo", "ambo"),
    ("ethiopian", "addis ababa"),
    ("gode", "gode"),
    ("goba", "robe"),
    ("jimma", "jimma"),
    ("assosa", "assosa"),
    ("kabri", "kabri dahar"),
)


def normalize_location(text):
    """Map a free-text location to a canonical location label.

    The text is lowercased and scanned for the keywords in
    ``_LOCATION_KEYWORDS``; the label of the first keyword found is
    returned.

    Args:
        text: The raw location text.

    Returns:
        The canonical label for the first matching keyword, or ``text``
        unchanged when no keyword matches.  Non-string values (e.g. None or
        NaN for a missing cell) are returned unchanged instead of raising
        AttributeError.
    """
    # Missing cells arrive as None/NaN, which have no .lower().
    if not isinstance(text, str):
        return text

    text_lower = text.lower()
    for keyword, canonical in _LOCATION_KEYWORDS:
        if keyword in text_lower:
            return canonical
    return text

# Replace each location text with the canonical label chosen by
# normalize_location; texts with no recognized keyword are kept as-is.
file["location"]=file["location"].apply(normalize_location)

def clean_date_time(text):
    """Parse a free-text date/time value on a best-effort basis.

    The value is converted with ``str()`` and handed to
    ``dateutil.parser.parse`` with ``fuzzy=True``, so date/time tokens
    embedded in surrounding words can still be recognized.

    Args:
        text: The raw date/time value; any object is accepted because it is
            stringified first.

    Returns:
        The value returned by ``parser.parse`` on success, or None when the
        text cannot be parsed.
    """
    try:
        return parser.parse(str(text), fuzzy=True)
    except (ValueError, OverflowError, TypeError):
        # Only parse failures are treated as "no date".  A bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit, making a
        # long-running apply() impossible to interrupt.
        return None

# Parse each date/time text; clean_date_time yields None for values it
# cannot parse.
file["date_time"]= file["date_time"].apply(clean_date_time)


# Write the cleaned table to 'result_ii.csv' without the DataFrame index.
file.to_csv('result_ii.csv', index = False)

                                            job_title  \
0                                   tour guide driver   
1   et-sponsored trainee cabin crew applicant’s- d...   
2   et-sponsored trainee cabin crew applicant’s- g...   
3   et-sponsored trainee cabin crew applicant’s- m...   
4   et-sponsored trainee cabin crew applicant’s- h...   
5                         fire and rescue i - jigjiga   
6                         fire and rescue i - hawassa   
7   et-sponsored trainee cabin crew applicant’s- j...   
8   et-sponsored trainee cabin crew applicant’s- j...   
9   et-sponsored trainee cabin crew applicant’s- n...   
10  et-sponsored trainee cabin crew applicant’s- robe   
11  et-sponsored trainee cabin crew applicant’s- s...   
12  et-sponsored trainee cabin crew applicant’s- a...   
13  et-sponsored trainee cabin crew applicant’s- a...   
14  et-sponsored trainee cabin crew applicant’s- b...   
15  et-sponsored trainee cabin crew applicant’s- d...   
16  et-sponsored trainee cabin 