In [1]:
import json

# Read the whole input file as UTF-8 text and parse it into Python objects
with open('2016_2023_PLS.json', mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

### Rensar bort onödiga underetiketter

In [None]:
# Keys removed by default: the taxonomy identifiers and weights (found e.g. under
# must_have / nice_to_have) and the contact details (found e.g. under application_contact).
_DEFAULT_FIELDS_TO_REMOVE = frozenset(
    ["concept_id", "legacy_ams_taxonomy_id", "weight", "email", "telephone", "contact_type"]
)


def remove_fields(obj, fields=_DEFAULT_FIELDS_TO_REMOVE):
    """Recursively delete unwanted keys from a nested dict/list structure, in place.

    Args:
        obj: Any value. Dicts have the matching keys deleted and their remaining
            values visited; lists have every item visited; all other values are
            left untouched.
        fields: Collection of key names to delete at every nesting level.
            Defaults to the six keys this notebook strips from each record.

    Returns:
        None. The structure passed in is modified directly.
    """
    if isinstance(obj, dict):
        # Collect the doomed keys first: a dict must not change size while it
        # is being iterated.
        for key in [k for k in obj if k in fields]:
            del obj[key]
        # Removed keys are never descended into; only the survivors are visited.
        for value in obj.values():
            remove_fields(value, fields)
    elif isinstance(obj, list):
        for item in obj:
            remove_fields(item, fields)

# Strip the unwanted keys from the whole loaded dataset; `data` is modified in place
remove_fields(data)

### Plattar ut genom att "hämta ut" underetiketter

In [None]:
# Flatten the data where there are several layers (dictionary/array)
def _flatten_record(record):
    """Return a one-level version of `record` (the input dict is not modified).

    Rules applied to each top-level key, in this order:
      * empty list                        -> None
      * key "workplace_address.coordinates" -> value kept unchanged
      * non-empty list                    -> the 'label' of every dict item that has
                                             one, plus any None items; other items are
                                             dropped. Exactly one collected value is
                                             unwrapped from the list.
      * dict                              -> one "<key>_<sub_key>" entry per sub-key,
                                             holding the sub-value's 'label' when the
                                             sub-value is a dict with that key, else None
      * anything else                     -> copied as-is
    """
    flat = {}
    for key, value in record.items():
        if isinstance(value, list) and not value:
            flat[key] = None
        elif key == "workplace_address.coordinates":
            flat[key] = value
        elif isinstance(value, list):
            labels = []
            for item in value:
                if isinstance(item, dict) and 'label' in item:
                    labels.append(item['label'])
                elif item is None:
                    labels.append(None)
            flat[key] = labels[0] if len(labels) == 1 else labels
        elif isinstance(value, dict):
            for sub_key, sub_value in value.items():
                # Only a dict can carry a 'label' entry. Without the type check,
                # `'label' in sub_value` raises TypeError for numbers/booleans and
                # performs a substring search on strings.
                # NOTE(review): scalar sub-values (strings, numbers) therefore end
                # up as None, exactly as strings did before -- confirm that
                # dropping them is intended.
                has_label = isinstance(sub_value, dict) and 'label' in sub_value
                flat[f"{key}_{sub_key}"] = sub_value['label'] if has_label else None
        else:
            flat[key] = value
    return flat


for obj in data:
    new_obj = _flatten_record(obj)

    # Swap the contents inside the existing dict so `data` keeps the same objects
    obj.clear()
    obj.update(new_obj)

### Tar bort alla kolumner utom de som anges i `keys_to_keep`

In [5]:
# Drop every column except the ones listed in "keys_to_keep"
keys_to_keep = [
    "publication_date",
    "description_text",
    "occupation_label",
    "occupation_group_label",
    "Bad_words"
]

for obj in data:
    # Build the reduced record first, then swap it into the same dict object
    new_obj = {key: obj[key] for key in obj if key in keys_to_keep}

    obj.clear()
    obj.update(new_obj)

### Skriv kvarvarande till ny json-fil

In [6]:
# Write the remaining records to a new JSON file: 4-space indentation,
# non-ASCII characters written as-is instead of \u escapes
with open('Hela_datan.json', mode='w', encoding='utf-8') as f:
    json.dump(obj=data, fp=f, indent=4, ensure_ascii=False)

### OBS! Rensar description_text från ord som inte förekommer i ordlistan (bra och dåliga ord). Körs enbart efter att vi har gjort sentimentanalys och word2vec och fått ut cosine similarity.

In [None]:
import pandas as pd
import re

# Load the JSON file into a DataFrame
# NOTE(review): this reads the raw '2016_2023_PLS.json', not a file written by
# an earlier cell of this notebook -- confirm that this is the intended input.
df = pd.read_json('2016_2023_PLS.json', encoding='utf-8')

# List of words to count occurrences for: every whitespace-separated token in
# the word-list file, in file order
with open("ordlista.txt", "r", encoding='utf-8') as file:
    target_words = file.read().split()

# One case-insensitive whole-word pattern per target word, compiled once instead
# of once per word per row. re.escape keeps regex metacharacters inside a word
# (e.g. "c++", "a.b") literal rather than letting them change or break the pattern.
word_patterns = [
    (target_word, re.compile(r'\b{}\b'.format(re.escape(target_word)), flags=re.IGNORECASE))
    for target_word in target_words
]


def _reduce_to_target_words(ad_text):
    """Replace an ad text by the target words it contains.

    Each target word is emitted once per whole-word, case-insensitive occurrence,
    spelled as in the word list and ordered as in the word list. A text without
    any target word becomes 'Bra_annons'. Non-string values (e.g. a missing
    description) are returned unchanged instead of raising inside the regex.
    """
    if not isinstance(ad_text, str):
        return ad_text

    repeated_words = []
    for target_word, pattern in word_patterns:
        repeated_words.extend([target_word] * len(pattern.findall(ad_text)))

    return ' '.join(repeated_words) if repeated_words else 'Bra_annons'


# Process job ads
df['description_text'] = df['description_text'].map(_reduce_to_target_words)

# Save the modified DataFrame to a new JSON file
df.to_json('Full_dataset.json', orient='records', force_ascii=False, indent=4)


### Konvertera JSON till CSV

In [2]:
import pandas as pd
import json

# Read the records back from 'Full_dataset.json'
with open('Full_dataset.json', mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

# Turn the records into a table, one row per record
df = pd.DataFrame(data)

# Export as CSV without the index column; the "utf-8-sig" codec prefixes the
# file with a UTF-8 byte-order mark
df.to_csv('Full_dataset.csv', encoding='utf-8-sig', index=False)
