In [1]:
import json
import pandas as pd

In [2]:
# Input is read as JSON Lines: every line of the file is parsed as one JSON
# document. Forward slashes are used so the relative path resolves on
# Windows, Linux and macOS alike (a backslash-separated path only works on
# Windows) and so it matches the "../data/..." style used for the output file.
path = "../data/train_output.json"
json_data = []

with open(path, "r", encoding="utf-8") as file:
    for line in file:
        try:
            json_obj = json.loads(line)
            json_data.append(json_obj)
        except json.JSONDecodeError as e:
            # Best-effort load: report the unparsable line and keep going.
            print(f"JSON parsing error: {e}")
            continue

print(f"The number of data loaded: {len(json_data)}")

The number of data loaded: 11322


In [3]:
# Show a single loaded record (index 1) to see which fields an entry carries.
json_data[1]

{'Question': 'How do I get help finding a job?',
 'Answer': 'If you are a current job seeker or participant, this fact sheet provides important information about mutual obligation requirements, appointments with your provider, and what to do if you are self-isolating:  Information for job seekers and participants  If you are participating in the ParentsNext program, this fact sheet provides important information about your activities and appointments.   Information for ParentsNext participants   ParentsNext participants Frequently Asked Questions   If you are a New Business Assistance with NEIS participant, these Frequently Asked Questions (FAQ) provides information about accessing the Coronavirus Supplement and what support is available during this time:  New Business Assistance with NEIS participants - Frequently Asked Questions  If you are a New Business Assistance with NEIS provider, these Frequently Asked Questions (FAQ) provides information about supporting NEIS participants duri

In [4]:
def calculate_average_rating(json_data, rating_values=None):
    """Compute the mean numeric rating of every distinct answer text.

    Records that share the same "Answer" string are pooled, their rating
    labels are converted to numbers, and the arithmetic mean is taken.

    Parameters
    ----------
    json_data : iterable of dict
        Records that each provide an "Answer" key and a "Rating" key.
    rating_values : dict, optional
        Mapping from rating label to numeric score. When omitted, the
        four-level scale Excellent=4, Acceptable=3, Could be Improved=2,
        Bad=1 is used.

    Returns
    -------
    dict
        Maps each answer to the mean score of all records with that answer.
        An empty input yields an empty dict.

    Raises
    ------
    KeyError
        If a record lacks "Answer" or "Rating", or if its rating label is
        not a key of ``rating_values``.
    """
    if rating_values is None:
        rating_values = {
            "Excellent": 4,
            "Acceptable": 3,
            "Could be Improved": 2,
            "Bad": 1,
        }

    # Running sum of scores and number of ratings seen, keyed by answer text.
    answer_ratings = {}
    for entry in json_data:
        answer = entry["Answer"]
        rating = entry["Rating"]

        stats = answer_ratings.setdefault(answer, {"total": 0, "count": 0})
        stats["total"] += rating_values[rating]
        stats["count"] += 1

    # count is at least 1 for every stored answer, so the division is safe.
    return {
        answer: stats["total"] / stats["count"]
        for answer, stats in answer_ratings.items()
    }

In [5]:
# Compute the mean numeric rating per distinct answer text. The resulting
# dict (answer -> average) is looked up again when the per-record rows are
# built; the bare name on the last line displays the whole mapping.
average_ratings = calculate_average_rating(json_data)
average_ratings

{'If you are a current job seeker or participant, this fact sheet provides important information about mutual obligation requirements, appointments with your provider, and what to do if you are self-isolating:  Information for job seekers and participants  If you are participating in the ParentsNext program, this fact sheet provides important information about your activities and appointments.   Information for ParentsNext participants   ParentsNext participants Frequently Asked Questions   If you are a New Business Assistance with NEIS participant, these Frequently Asked Questions (FAQ) provides information about accessing the Coronavirus Supplement and what support is available during this time:  New Business Assistance with NEIS participants - Frequently Asked Questions  If you are a New Business Assistance with NEIS provider, these Frequently Asked Questions (FAQ) provides information about supporting NEIS participants during the Coronavirus situation.  New Business Assistance with

In [6]:
# Numeric score for each rating label; a label missing from this table is
# scored as 0 by the .get() lookup below.
rating_values = {"Excellent": 4, "Acceptable": 3, "Could be Improved": 2, "Bad": 1}

# One output row per input record: the original fields plus the record's own
# numeric score and the average score of its answer text.
formatted_output_with_scores = [
    {
        "Question": record["Question"],
        "Answer": record["Answer"],
        "Rating": record["Rating"],
        "Score": rating_values.get(record["Rating"], 0),
        "Average": average_ratings[record["Answer"]],
    }
    for record in json_data
]

In [7]:
# Display the enriched records (note: this renders the entire list).
formatted_output_with_scores

[{'Question': 'How do I get help finding a job?',
  'Answer': 'If you are a current job seeker or participant, this fact sheet provides important information about mutual obligation requirements, appointments with your provider, and what to do if you are self-isolating:  Information for job seekers and participants  If you are participating in the ParentsNext program, this fact sheet provides important information about your activities and appointments.   Information for ParentsNext participants   ParentsNext participants Frequently Asked Questions   If you are a New Business Assistance with NEIS participant, these Frequently Asked Questions (FAQ) provides information about accessing the Coronavirus Supplement and what support is available during this time:  New Business Assistance with NEIS participants - Frequently Asked Questions  If you are a New Business Assistance with NEIS provider, these Frequently Asked Questions (FAQ) provides information about supporting NEIS participants du

In [8]:
# Tabulate the enriched records: one row per record, one column per dict key.
df = pd.DataFrame(formatted_output_with_scores)
df

Unnamed: 0,Question,Answer,Rating,Score,Average
0,How do I get help finding a job?,If you are a current job seeker or participant...,Excellent,4,2.333333
1,How do I get help finding a job?,If you are a current job seeker or participant...,Could be Improved,2,2.333333
2,How do I get help finding a job?,In this rapidly changing jobs market the Austr...,Excellent,4,3.500000
3,How do I get help finding a job?,In this rapidly changing jobs market the Austr...,Excellent,4,3.500000
4,How do I get help finding a job?,To further assist job seekers to prepare for a...,Bad,1,2.700000
...,...,...,...,...,...
11317,Is it safe for me to manually replace my IUD a...,"If you do not want to become pregnant, you sho...",Bad,1,2.500000
11318,Is it safe for me to manually replace my IUD a...,No. Disposable medical face masks are intended...,Bad,1,1.666667
11319,Is it safe for me to manually replace my IUD a...,No. Disposable medical face masks are intended...,Bad,1,1.666667
11320,Is it safe for me to manually replace my IUD a...,Removal of long acting methods such as implant...,Excellent,4,4.000000


In [9]:
# Destination of the export -- change the output path for each file.
json_filename = "../data/train_formatted_output.json"

# Write the table as JSON Lines: one JSON object per row, one row per line.
df.to_json(json_filename, lines=True, orient="records")

json_filename

'../data/train_formatted_output.json'