In [2]:
import os
import pandas as pd
import datetime
import re
from dateutil import parser

In [4]:
# Load the JSON-lines result file (one record per line) into a DataFrame.
path_file = r"C:\Users\hp\Desktop\ET-Airlines\result_list\result.jsonl"
file = pd.read_json(path_file, lines=True)

# Replace every "candidates" entry with its length.
file["candidates"] = file["candidates"].apply(len)

# Lower-case the text columns so the keyword checks below are case-insensitive.
for text_column in ("job_title", "date_time", "location", "announcement"):
    file[text_column] = file[text_column].str.lower()

def trainee(title):
    """Label a job title as a trainee announcement or a regular job.

    Returns "trainee" when the word "trainee" occurs anywhere in the title
    (case-insensitive), "job" for any other string, and None when the title
    is not a string at all.
    """
    if isinstance(title, str):
        return "trainee" if "trainee" in title.lower() else "job"
    return None

# Add a "position" column holding the label trainee() returns for each title.
file["position"] = file["job_title"].apply(trainee)

def normalize_titles(title):
    """Reduce a job title to its core role name.

    The title is lower-cased, a trailing "- <location>" suffix is removed,
    sponsorship prefixes ("et-sponsored", "et"), seniority/trainee qualifiers,
    stray single letters and "applicant(s)" are dropped, and punctuation and
    whitespace are collapsed.

    Args:
        title: Raw job title.

    Returns:
        The cleaned title (possibly an empty string), or None when ``title``
        is not a string.
    """
    if not isinstance(title, str):
        return None

    t = title.lower()

    # Strip a trailing location suffix such as " - jigjiga" (location already
    # has its own column). The hyphen must have whitespace on at least one
    # side; otherwise hyphenated words like "et-sponsored" or "co-pilot" would
    # be treated as a location separator and everything after them discarded.
    t = re.sub(r"(?:\s+-\s*|-\s+)[a-z ]+$", "", t)

    # Drop sponsorship prefixes: "et-sponsored" / "et sponsored" first, then
    # any stand-alone "et" that is left.
    t = re.sub(r"\bet[- ]?sponsored\b", "", t)
    t = re.sub(r"\bet\b", "", t)

    # Drop trainee / seniority qualifiers.
    t = re.sub(r"\b(?:trainee|jr|junior|assistant)\b", "", t)

    # Drop stand-alone single letters; the negative lookahead keeps "a" and "c".
    t = re.sub(r"\b(?![ac]\b)[a-z]\b", "", t)

    # Drop applicant wording.
    t = re.sub(r"\bapplicants?\b", "", t)

    # Replace remaining punctuation with spaces, then collapse whitespace.
    t = re.sub(r"[^a-z0-9/& ]+", " ", t)
    return re.sub(r"\s+", " ", t).strip()

# Overwrite job_title with its normalized form. normalize_titles() strips the
# word "trainee", so this has to run after the "position" column is derived.
file["job_title"] = file["job_title"].apply(normalize_titles)

def normalize_announcement(text):
    """Collapse a free-text announcement into a short category label.

    Keywords are checked in a fixed priority order and the first one found
    (case-insensitive) decides the label.

    Args:
        text: Raw announcement text.

    Returns:
        "interview", "written exam", "employment process" or "practical exam"
        when the matching keyword is present; the original text unchanged when
        no keyword matches; None when ``text`` is not a string (e.g. a missing
        value), consistent with the other normalizers in this file.
    """
    if not isinstance(text, str):
        return None

    lowered = text.lower()
    # Order matters: an announcement mentioning several keywords gets the
    # label of the earliest entry in this table.
    keyword_labels = (
        ("interview", "interview"),
        ("written", "written exam"),
        ("employment", "employment process"),
        ("practical", "practical exam"),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return text

# Overwrite announcement with the label normalize_announcement() returns.
file["announcement"]= file["announcement"].apply(normalize_announcement)

def normalize_location(text):
    """Map a free-text location to a canonical city name.

    The text is lower-cased, every character other than a-z, 0-9 and
    whitespace is replaced by a space, and whitespace is collapsed. The
    result is then matched by substring, first against a table of
    aliases/misspellings and then against a list of known city names.

    Args:
        text: Raw location text.

    Returns:
        The canonical city name when an alias or known city is found, the
        cleaned text when nothing matches, or None when ``text`` is not a
        string.
    """
    if not isinstance(text, str):
        return None
    t = re.sub(r"[^a-z0-9\s]", " ", text.lower()).strip()
    t = re.sub(r"\s+", " ", t)

    # Canonicalize common spellings/aliases. Checked in insertion order; the
    # first alias found as a substring wins.
    aliases = {
        "gonder": "gondar",
        "bahir": "bahir dar",
        "haramaya": "harar",
        "addis": "addis ababa",
        "madda": "robe",
        "goba": "robe",
        "nekemte": "nekemte",
        "wollega": "nekemte",
        "kebridehar": "kebri dehar",
        "kabri": "kebri dehar",
        "dire": "dire dawa",
        "semera": "semera",
        "arbaminch": "arba minch",
        "arba minch": "arba minch",
        # Was misspelt "addia ababa", which map_region() could not match.
        "ethiopian": "addis ababa",
    }
    for alias, canon in aliases.items():
        if alias in t:
            return canon

    # Fallback: return the first known city name contained in the text.
    known_cities = [
        "mekelle", "dessie", "gondar", "harar", "jigjiga", "hawassa", "nekemte",
        "robe", "semera", "arba minch", "bahir dar", "dire dawa", "gambella",
        "shashemene", "addis ababa", "adama", "wolkite", "ambo", "gode", "jimma",
        "assosa", "kebri dehar",
    ]
    for city in known_cities:
        if city in t:
            return city

    return t

# Overwrite location with the canonical name normalize_location() returns.
file["location"]=file["location"].apply(normalize_location)

def clean_date_time(text):
    """Best-effort conversion of free text into a datetime.

    The value is converted to ``str`` and handed to dateutil's fuzzy parser,
    which skips tokens it does not recognise as part of a date.

    Args:
        text: Value holding (or containing) a date/time.

    Returns:
        The parsed ``datetime``, or None when no date can be extracted.
    """
    try:
        return parser.parse(str(text), fuzzy=True)
    # Only the failures dateutil documents for parse(); a bare ``except``
    # would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, OverflowError, TypeError):
        return None

# Overwrite date_time with clean_date_time()'s result (None where parsing fails).
file["date_time"]= file["date_time"].apply(clean_date_time)

def map_region(location):
    """Map a location name to its region label.

    Matching is by case-insensitive substring; table rows are checked in
    order and the first row with a matching key wins.

    Args:
        location: Location / city name.

    Returns:
        The region label, "other" when no key matches, or None when
        ``location`` is not a string.
    """
    if not isinstance(location, str):
        return None
    loc = location.lower()

    # Each key must appear in exactly one row: rows are scanned top to bottom,
    # so a key repeated in a later row can never be reached. ("kebri dehar" /
    # "kebridehar" used to sit in the oromia row as well, which shadowed the
    # somali row.)
    # NOTE(review): "gode" is listed under oromia; it may belong with the
    # somali row — confirm against the intended region list.
    region_map = [
        (["mekelle"], "tigray"),
        (["wollo", "dessie", "gondar", "bahir dar"], "amhara"),
        (["nekemte", "adama", "robe", "goba", "shashemene", "gode", "jimma", "ambo"], "oromia"),
        (["hawassa"], "sidama"),
        (["addis ababa", "ethiopian"], "addis ababa"),
        (["dire dawa", "dire"], "dire dawa"),
        (["jigjiga", "kebri dehar", "kebridehar"], "somali"),
        (["gambella"], "gambella"),
        (["assosa"], "benishangul-gumuz"),
        (["harar", "haramaya"], "harari"),
        (["semera"], "afar"),
        (["arba minch", "wolkite"], "snnpr"),
    ]

    for keys, region in region_map:
        if any(key in loc for key in keys):
            return region
    return "other"
# Add a "region" column derived from the normalized location.
file["region"] = file["location"].apply(map_region)

# Write the cleaned table to CSV without the DataFrame index column.
path_saving = r"C:\Users\hp\Desktop\practices\0_Tabelau_analysis_files\ET_Airlines\FINAL CSV\result.csv"
file.to_csv(path_saving, index=False)

  file = pd.read_json(path_file, lines = True)
  file = pd.read_json(path_file, lines = True)
  file = pd.read_json(path_file, lines = True)
