# Category Analysis in JSON Files

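This notebook counts how often each category occurs in the JSON files under `data/` (only records that contain text are considered) and plots the counts as bar charts, one per file plus one for all files combined.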

In [None]:
# Install necessary libraries
%pip install matplotlib numpy

## Import Libraries

In [2]:
import json
import os
import numpy as np
from typing import Dict
import matplotlib.pyplot as plt
from collections import Counter

## Data Processing Functions

In [3]:
def process_file(file_path: str, max_records: int = 1000) -> Counter:
    """
    Counts the occurrences of categories in a JSON file.

    Only records with a non-empty "text" value are considered, and of those
    only the first ``max_records`` are counted. Records whose "category" is
    missing or empty are ignored. The file is assumed to contain a JSON array
    of objects.

    :param file_path: The path to the JSON file to be analyzed.
    :param max_records: The maximum number of records with text to analyze. Defaults to 1000.
    :returns: A Counter object with category counts in the file.
    :raises ValueError: If ``max_records`` is negative.
    """
    if max_records < 0:
        # A negative slice bound would silently drop records from the end instead
        raise ValueError("The maximum number of records must not be negative.")

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Filter records that contain text, then cap how many of them are analyzed
    records_with_text = [item for item in data if item.get("text")]
    limited_records = records_with_text[:max_records]

    # Count categories, ignoring empty or missing ones
    return Counter(
        item["category"] for item in limited_records if item.get("category")
    )

In [4]:
def plot_grid(file_stats: Dict[str, Counter], cols: int = 3) -> None:
    """
    Plots a grid of bar charts to display category statistics for each file.

    One subplot is drawn per entry of ``file_stats``, filled row by row. Grid
    cells left over in the last row are removed. If ``file_stats`` is empty,
    nothing is plotted.

    :param file_stats: A dictionary where keys are file names and values are category counters.
    :param cols: The number of columns in the grid. Defaults to 3.
    :raises ValueError: If ``cols`` is less than 1.
    """
    if cols < 1:
        raise ValueError("The number of columns must be at least 1.")

    num_files = len(file_stats)
    if num_files == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw
        return

    rows = (num_files + cols - 1) // cols  # Ceiling division: rows needed for all files

    # squeeze=False makes subplots always return a 2-D array of axes; without it
    # a 1x1 grid yields a single Axes object that has no flatten() method
    fig, axes = plt.subplots(rows, cols, figsize=(20, 5 * rows), squeeze=False)
    axes = axes.flatten()  # Flatten the axes array for easier iteration

    # Create a bar chart for each file
    for ax, (json_file, category_counts) in zip(axes, file_stats.items()):
        # One color per category, sampled evenly from the tab10 colormap
        colors = plt.cm.tab10(np.linspace(0, 1, len(category_counts)))
        ax.bar(
            list(category_counts.keys()),
            list(category_counts.values()),
            color=colors,
            zorder=2,  # Draw bars above the grid lines
        )
        ax.set_title(f"Categories in {json_file}")
        ax.set_xlabel("Category")
        ax.set_ylabel("Count")
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, which='both', linestyle='--', linewidth=0.5, zorder=1)

    # Remove the unused subplots at the end of the grid
    for unused_ax in axes[num_files:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()  # Adjust the layout to prevent overlap
    plt.show()

## Processing JSON Files and Plotting Graphs

In [None]:
# Directory that holds the JSON files to analyze
data_dir = "data/"

# Get a list of all JSON files in the specified directory. os.listdir returns
# entries in arbitrary order, so sort them to keep the subplot order stable.
json_files = sorted(file for file in os.listdir(data_dir) if file.endswith(".json"))

# Dictionary to store statistics for each file; "all" accumulates the totals
# across every file and is inserted first so it is plotted first
file_stats: Dict[str, Counter] = {"all": Counter()}

# Process each file and add the statistics
for json_file in json_files:
    category_counts = process_file(os.path.join(data_dir, json_file))
    file_stats[json_file] = category_counts
    file_stats["all"] += category_counts  # Add statistics for all files

# Plot graphs for all files
plot_grid(file_stats, cols=3)