In [4]:

import json
import csv
import ijson
import uuid
from pyspark.sql import SparkSession
import random

In [None]:
import ijson
import json
import uuid
import random
from decimal import Decimal

# Source dataset, and the destination for the expanded copy written from it
input_file: str = "after_merged.json"
output_file: str = "expanded.json"

def add_noise(value, scale=0.1, min_value=0, max_value=100):
    """Perturb a number with relative uniform noise and clamp the result.

    The perturbation is ``value * u`` with ``u`` drawn uniformly from
    ``[-scale, scale]``, so ``scale`` is a *relative* magnitude
    (``0.1`` means up to +/-10 % of ``value``), not an absolute offset.

    Args:
        value: Number to perturb. ``int``, ``float`` and ``decimal.Decimal``
            are accepted; ijson yields ``Decimal`` for non-integer JSON
            numbers, so those must be handled or they would pass through
            without any noise. Everything else (``bool``, ``None``, strings,
            ...) is returned unchanged.
        scale: Maximum relative deviation applied to ``value``.
        min_value: Lower clamp bound for the result.
        max_value: Upper clamp bound for the result.

    Returns:
        The noisy value rounded to 6 decimal places and clamped to
        ``[min_value, max_value]`` (the bound itself is returned when
        clamping applies), or ``value`` untouched when it is not numeric.
    """
    # bool is a subclass of int; a JSON true/false flag must not be "noised".
    is_number = isinstance(value, (int, float, Decimal)) and not isinstance(value, bool)
    if not is_number:
        return value

    # Work in float: Decimal cannot be multiplied by the float from random.uniform.
    base = float(value)
    noisy_value = base + random.uniform(-scale, scale) * base
    return max(min_value, min(max_value, round(noisy_value, 6)))

def generate_remix_record(record):
    """Build a synthetic "(Remix)" variant of a track record.

    The variant is a shallow copy of ``record`` with a fresh random
    ``Track ID``, ``" (Remix)"`` appended to ``Track Name``, and noise
    applied to ``Popularity``, ``Energy`` and ``Tempo`` via ``add_noise``.

    Args:
        record: Mapping with at least the keys ``Track Name``,
            ``Popularity``, ``Energy`` and ``Tempo``.

    Returns:
        A new dict; ``record`` itself is not modified.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # NOTE(review): add_noise treats ``scale`` as a fraction of the value, so
    # scale=10 / scale=2 mean +/-1000 % / +/-200 % — confirm these were not
    # meant as absolute offsets.
    new_record = record.copy()
    new_record["Track ID"] = str(uuid.uuid4())
    new_record["Track Name"] = f"{record['Track Name']} (Remix)"
    new_record["Popularity"] = add_noise(record["Popularity"], scale=10, min_value=0, max_value=100)
    new_record["Energy"] = add_noise(record["Energy"], scale=0.05)
    # add_noise clamps to [0, 100] by default, which would flatten every tempo
    # above 100 to exactly 100; tempo (presumably BPM) has no such ceiling, so
    # only the lower bound is kept.
    new_record["Tempo"] = add_noise(record["Tempo"], scale=2, max_value=float("inf"))
    return new_record

def generate_lofi_record(record):
    """Build a synthetic "(Lofi)" variant of a track record.

    The variant is a shallow copy of ``record`` with a fresh random
    ``Track ID``, ``" (Lofi)"`` appended to ``Track Name``, and noise
    applied to ``Popularity``, ``Energy`` and ``Tempo`` via ``add_noise``.

    Args:
        record: Mapping with at least the keys ``Track Name``,
            ``Popularity``, ``Energy`` and ``Tempo``.

    Returns:
        A new dict; ``record`` itself is not modified.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # NOTE(review): add_noise treats ``scale`` as a fraction of the value, so
    # scale=5 / scale=1.5 mean +/-500 % / +/-150 % — confirm these were not
    # meant as absolute offsets.
    new_record = record.copy()
    new_record["Track ID"] = str(uuid.uuid4())
    new_record["Track Name"] = f"{record['Track Name']} (Lofi)"
    new_record["Popularity"] = add_noise(record["Popularity"], scale=5, min_value=0, max_value=100)
    new_record["Energy"] = add_noise(record["Energy"], scale=0.03)
    # add_noise clamps to [0, 100] by default, which would flatten every tempo
    # above 100 to exactly 100; tempo (presumably BPM) has no such ceiling, so
    # only the lower bound is kept.
    new_record["Tempo"] = add_noise(record["Tempo"], scale=1.5, max_value=float("inf"))
    return new_record

class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``decimal.Decimal`` values as floats.

    Any other object the standard encoder cannot handle is passed on to
    ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serialisable replacement for ``obj``."""
        if not isinstance(obj, Decimal):
            # Not ours to convert: let the base class report the error.
            return super().default(obj)
        return float(obj)

# Stream the input array with ijson and write every original record followed
# by its synthetic variants into a new JSON array, one record at a time, so
# neither file has to fit in memory.
#
# The input is opened in binary mode: ijson parses bytes, and a text-mode
# file makes it emit a deprecation warning and re-encode what it reads.
with open(input_file, "rb") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    # Start the JSON array
    outfile.write("[\n")
    items = ijson.items(infile, "item")
    first = True  # True until the first record has been written

    for item in items:
        # Each source record is emitted as-is, followed by a (Remix) variant.
        remix_record = generate_remix_record(item)
        records = [item, remix_record]

        # Tracks that are already remixes do not get an additional (Lofi) variant.
        if "(Remix)" not in item["Track Name"]:
            lofi_record = generate_lofi_record(item)
            records.append(lofi_record)

        for record in records:
            # Separator goes *before* every record except the very first one,
            # so the array never ends with a trailing comma.
            if not first:
                outfile.write(",\n")
            json.dump(record, outfile, indent=4, ensure_ascii=False, cls=CustomJSONEncoder)
            first = False

    # End the JSON array
    outfile.write("\n]")

print(f"Expanded data saved to {output_file}")


In [None]:
# JSON array whose column names are inspected in this cell
file_path = "expanded.json"

def extract_keys(obj, prefix=""):
    """List the leaf keys of a (possibly nested) dict as dotted paths.

    Nested dicts are descended into and their keys are joined to the parent
    key with ``"."``; any other value, including lists, counts as a leaf.

    Args:
        obj: Dict to inspect.
        prefix: Dotted path prepended to every key; empty at the top level.

    Returns:
        List of key paths in the dict's iteration order, depth-first.
    """
    def walk(node, base):
        for name, child in node.items():
            path = name if not base else f"{base}.{name}"
            if isinstance(child, dict):
                yield from walk(child, path)
            else:
                yield path

    return list(walk(obj, prefix))

# Peek at the first record only: that is enough to list the columns without
# reading the rest of the file. Binary mode because ijson parses bytes and
# warns when it is handed a text-mode file.
with open(file_path, "rb") as file:
    items = ijson.items(file, "item")
    first_item = next(items, None)

    # Check for a dict explicitly. A plain truthiness test would report a
    # valid but empty object ({}) as "empty file", and would let a non-object
    # first element (e.g. a number or list) crash inside extract_keys.
    if isinstance(first_item, dict):
        columns = extract_keys(first_item)
        print("Columns:", columns)
    else:
        print("The JSON file is empty or not structured as an array of objects.")

In [None]:
import ijson
import csv

# JSON array to convert, and the CSV file it is exported to
input_file: str = "expanded.json"
output_file: str = "output.csv"

# Same helper as in the column-inspection cell above; it is redefined here, so
# this definition is the one in effect once this cell has run.
def extract_keys(obj, prefix=""):
    """List the leaf keys of a (possibly nested) dict as dotted paths.

    Nested dicts are descended into and their keys are joined to the parent
    key with ``"."``; any other value, including lists, counts as a leaf.

    Args:
        obj: Dict to inspect.
        prefix: Dotted path prepended to every key; empty at the top level.

    Returns:
        List of key paths in the dict's iteration order, depth-first.
    """
    def walk(node, base):
        for name, child in node.items():
            path = name if not base else f"{base}.{name}"
            if isinstance(child, dict):
                yield from walk(child, path)
            else:
                yield path

    return list(walk(obj, prefix))

# Convert the JSON array to CSV, streaming record by record.
def _flatten_record(obj, prefix=""):
    """Flatten nested dicts into one level keyed by dotted paths.

    Uses the same key scheme as ``extract_keys`` so that row keys line up
    with the CSV header.
    """
    flat = {}
    for key, value in obj.items():
        full_key = f"{prefix}.{key}" if prefix else key
        if isinstance(value, dict):
            flat.update(_flatten_record(value, prefix=full_key))
        else:
            flat[full_key] = value
    return flat


# The JSON input is opened in binary mode because ijson parses bytes and warns
# when it is handed a text-mode file. newline="" is what the csv module
# requires of files it writes to.
with open(input_file, "rb") as json_file, \
        open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    # Parse the first item to extract headers
    items = ijson.items(json_file, "item")
    first_item = next(items, None)

    if first_item:
        # Header row: dotted key paths of the first record
        headers = extract_keys(first_item)
        writer = csv.DictWriter(csv_file, fieldnames=headers)
        writer.writeheader()

        # Rows must be flattened the same way as the header. Passing the raw
        # nested dict would make DictWriter raise ValueError, because the
        # parent key (e.g. "a") is not among the dotted fieldnames ("a.b").
        # Keys missing from a later record are written as empty cells; keys
        # absent from the header still raise ValueError instead of being
        # silently dropped.
        writer.writerow(_flatten_record(first_item))
        for item in items:
            writer.writerow(_flatten_record(item))

print(f"JSON data has been written to {output_file}")


In [None]:
import pandas as pd
# Sanity check: reload the exported CSV with pandas.
df = pd.read_csv("output.csv")
# Bare trailing expression so the notebook displays the column labels.
df.columns

In [None]:
# Get (or create) a Spark session named "Music" and preview the exported CSV.
spark = (
    SparkSession.builder
    .appName("Music")
    .getOrCreate()
)
# header=True: take the column names from the first CSV row.
df = spark.read.csv("output.csv", header=True)
df.show()