In [1]:
import os
import json

In [2]:
def extract_relevant_data(input_folder: str, output_folder: str):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(input_folder):
        if file_name.endswith(".json"):
            input_file_path = os.path.join(input_folder, file_name)
            output_file_path = os.path.join(output_folder, file_name)

            try:
                # Read the input JSON file
                with open(input_file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                # Extract relevant fields
                relevant_data = {
                    "city": data.get("city", {}),
                    #"country_code": data.get("country_code", {}),
                    "position": data.get("position", {}),
                    #"current_company": data.get("current_company", {}),
                    "experience": data.get("experience", []),
                    "education": data.get("education", []),
                    "courses": data.get("courses", []),
                    "certifications": data.get("certifications", []),
                    #"current_company_id": data.get("current_company_id", []),
                    "current_company_name": data.get("current_company_name", []),
                    "publications": data.get("publications", []),
                    "patents": data.get("patents", []),
                    "projects": data.get("projects", []),
                    #"organizations": data.get("organizations", []),
                    #"location": data.get("location", []),
                    "honors_and_awards": data.get("honors_and_awards", []),
                }

                # Write the relevant data to a new JSON file
                with open(output_file_path, 'w', encoding='utf-8') as output_file:
                    json.dump(relevant_data, output_file, indent=4, ensure_ascii=False)

                print(f"Processed {file_name} successfully.")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

In [3]:
input_folder = "json_files"
output_folder = "processed_files"
extract_relevant_data(input_folder, output_folder)

Processed 123preet-gill.json successfully.
Processed 1996-salman-01-khan.json successfully.
Processed 1pranjal1-patil.json successfully.
Processed 2sharkk.json successfully.
Processed 42shiv.json successfully.
Processed 750730117.json successfully.
Processed 7fansari.json successfully.
Processed a4sh.json successfully.
Processed aa25desh.json successfully.
Processed aabhaysingh.json successfully.
Processed aadesh09.json successfully.
Processed aadeshk.json successfully.
Processed aadhar-kansal.json successfully.
Processed aadhar-kaul.json successfully.
Processed aadilshah.json successfully.
Processed aaditya-sanjay-b-a62630a0.json successfully.
Processed aaditya-shah-22a742205.json successfully.
Processed aamir-wahid-723ab8a5.json successfully.
Processed aamir-warsi-a4386396.json successfully.
Processed aanchal-dua-5b3197102.json successfully.
Processed aaradhya-srivastava-3b28b5259.json successfully.
Processed aarushi-jain96.json successfully.
Processed aarushi-kochhar-696404188.json 

In [2]:
def convert_field(input_folder, output_folder):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    for filename in os.listdir(input_folder):
        if filename.endswith(".json"):  # Process only JSON files
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)  # Save with the same filename
            
            try:
                # Read the input file
                with open(input_path, "r") as infile:
                    data = json.load(infile)
                
                # Process the "courses" field
                if "courses" in data and isinstance(data["courses"], str):
                    data["courses"] = json.loads(data["courses"])
                
                # Write the processed data to the output file
                with open(output_path, "w") as outfile:
                    json.dump(data, outfile, indent=4)
                
                print(f"Processed: {filename}")
            
            except Exception as e:
                print(f"Error processing file {filename}: {e}")
            try:
                # Read the input file
                with open(input_path, "r") as infile:
                    data = json.load(infile)
                
                # Process the "courses" field
                if "projects" in data and isinstance(data["projects"], str):
                    data["projects"] = json.loads(data["projects"])
                
                # Write the processed data to the output file
                with open(output_path, "w") as outfile:
                    json.dump(data, outfile, indent=4)
                
                print(f"Processed: {filename}")
            
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

In [3]:
input_folder = "recsys_llm/processed_data"  # Replace with your input folder path
output_folder = "final_user_profiles"  # Replace with your output folder path
convert_field(input_folder, output_folder)

Processed: 123preet-gill.json
Processed: 123preet-gill.json
Processed: 1996-salman-01-khan.json
Processed: 1996-salman-01-khan.json
Error processing file 1pranjal1-patil.json: 'charmap' codec can't decode byte 0x9d in position 2771: character maps to <undefined>
Error processing file 1pranjal1-patil.json: 'charmap' codec can't decode byte 0x9d in position 2771: character maps to <undefined>
Processed: 2sharkk.json
Processed: 2sharkk.json
Processed: 42shiv.json
Processed: 42shiv.json
Processed: 750730117.json
Processed: 750730117.json
Processed: 7fansari.json
Processed: 7fansari.json
Processed: a4sh.json
Processed: a4sh.json
Processed: aa25desh.json
Processed: aa25desh.json
Processed: aabhaysingh.json
Processed: aabhaysingh.json
Processed: aadesh09.json
Processed: aadesh09.json
Processed: aadeshk.json
Processed: aadeshk.json
Processed: aadhar-kansal.json
Processed: aadhar-kansal.json
Error processing file aadhar-kaul.json: 'charmap' codec can't decode byte 0x8f in position 11505: charac