# In [1]:
import pandas as pd
import re
import json

# Locations of the cleaned result and of the raw input
output_path = "cleaned_data.json"
file_path = "legalad_posts.json"

# Parse the raw JSON input into a Pandas DataFrame
df = pd.read_json(file_path)

# Function to clean text fields
def clean_text(text):
    """Normalise a free-text value.

    Non-string values (``None``, ``NaN``, numbers, ...) are returned
    unchanged so that later missing-value handling can deal with them.

    For strings, three steps are applied in this order:

    1. Remove every character that is not a word character, whitespace,
       or one of ``. , ! ? ' " -``.
    2. Collapse each run of whitespace (spaces, tabs, newlines) into a
       single space.
    3. Strip leading and trailing whitespace.

    Args:
        text: The value to clean; any type is accepted.

    Returns:
        The cleaned string, or ``text`` itself if it is not a ``str``.
    """
    if not isinstance(text, str):
        return text  # Leave non-strings untouched

    # Remove disallowed characters first: deleting a symbol that sits
    # between two spaces (or at either end) leaves extra whitespace behind,
    # so whitespace normalisation has to run afterwards.
    text = re.sub(r'[^\w\s.,!?\'"-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalise the free-text columns that are actually present in the data
text_columns = ["title", "text"]  # Assuming these are the text columns in the dataset
for name in text_columns:
    if name not in df.columns:
        continue
    df[name] = df[name].apply(clean_text)

# Coerce numeric fields to numeric dtypes; values that cannot be parsed become NaN
numeric_columns = ["score", "num_comments"]  # Modify based on actual numeric fields
for name in numeric_columns:
    if name not in df.columns:
        continue
    df[name] = pd.to_numeric(df[name], errors="coerce")

# Substitute placeholder strings for missing titles and text bodies
placeholders = {"title": "Untitled", "text": "No content available"}
df.fillna(value=placeholders, inplace=True)

# Write the cleaned rows to the output file as an indented JSON list of records
df.to_json(output_path, orient="records", indent=4)

# Echo the path of the file that was just written
output_path


# Output: 'cleaned_data.json'