# Data Processing and Analysis from JSON Files

This notebook reads the JSON files in the configured input directory, merges them into a single CSV file, optionally caps the number of records in each category, and plots the number of records per category.

In [None]:
# Install the third-party libraries used by this notebook.
# NOTE: `%pip` is an IPython/Jupyter magic command; it is not valid in a plain Python script.
%pip install pandas matplotlib

## Import Libraries

In [2]:
import os
import csv
import json
import numpy as np
import pandas as pd
from typing import List
import matplotlib.pyplot as plt

## Settings

In [3]:
# Input/output locations
input_directory = "../data/raw"  # directory that is scanned for *.json files
output_file = "merged_output_all.csv"  # merged CSV that every later step reads

# Column order of the merged CSV; each name is also the key looked up in every JSON record
header = [
    "message_id",
    "sender_id",
    "text",
    "date",
    "channel",
    "category",
]

# Per-category capping options
category_column = "category"  # column that holds the category label
max_count_per_category = 1000  # upper bound on rows kept for any one category
apply_limit = False  # set to True to run the capping step

## Converting Data from JSON to CSV

In [4]:
def merge_json_to_csv(input_dir: str, output_csv: str, headers: List) -> None:
    """
    Merges every JSON file of a directory into a single CSV file.

    Each ``*.json`` file is expected to contain a list of objects. One CSV row is written per
    object, holding the values of ``headers`` in order; a key that is missing from an object
    becomes an empty string. Files are processed in sorted name order so that the output is
    reproducible. Processing is best-effort: a file that cannot be read or parsed, a file whose
    top-level value is not a list, and list entries that are not objects are reported on stdout
    and skipped instead of aborting the merge.

    :param input_dir: Path to the directory containing JSON files.
    :param output_csv: Path to the output CSV file (overwritten if it exists).
    :param headers: List of headers for the final CSV file.
    """
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(headers)  # Header row comes first

        # os.listdir() returns names in arbitrary order; sort them for a stable row order.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(input_dir, filename)

            try:
                with open(file_path, 'r', encoding='utf-8') as json_file:
                    data = json.load(json_file)

                if not isinstance(data, list):
                    print(f"Skipping {filename}: top-level JSON value is not a list")
                    continue

                skipped_entries = 0
                for entry in data:
                    # Only objects have keys to look up; skipping a stray scalar/list here
                    # keeps one malformed entry from discarding the rest of the file.
                    if not isinstance(entry, dict):
                        skipped_entries += 1
                        continue
                    csv_writer.writerow([entry.get(key, "") for key in headers])

                if skipped_entries:
                    print(f"Skipped {skipped_entries} non-object entries in {filename}")
            except json.JSONDecodeError as e:
                print(f"Error reading {filename}: {e}")
            except Exception as e:
                # Deliberately broad: one bad file must not stop the remaining files.
                print(f"Unknown error processing {filename}: {e}")

# Build the merged CSV from every JSON file found in the input directory
merge_json_to_csv(
    input_dir=input_directory,
    output_csv=output_file,
    headers=header,
)

## Limiting the Number of Records per Category

In [5]:
def limit_records_by_category(input_csv: str, output_csv: str, category_col: str, max_count: int) -> None:
    """
    Limits the number of records in each category and saves the data to a new CSV file.

    Every category with more than ``max_count`` rows is reduced to a random sample of exactly
    ``max_count`` rows (fixed seed, so repeated runs pick the same rows); smaller categories are
    kept whole. Rows whose category is missing are treated as one category of their own rather
    than being dropped. The output is ordered by category. ``input_csv`` and ``output_csv`` may
    be the same path, because the input is read completely before anything is written.

    :param input_csv: Path to the input CSV file.
    :param output_csv: Path to the output CSV file.
    :param category_col: Name of the column with categories.
    :param max_count: Maximum number of records per category.
    :raises ValueError: If ``category_col`` is not a column of the input CSV.
    """
    df = pd.read_csv(input_csv)
    if category_col not in df.columns:
        raise ValueError(f"Column '{category_col}' not found in the dataset.")

    # Sample each group explicitly instead of using groupby(...).apply(...): newer pandas
    # versions deprecate passing the grouping column to apply(), which would eventually strip
    # the category column from the result. dropna=False keeps rows without a category.
    sampled_groups = [
        group.sample(n=min(len(group), max_count), random_state=42)
        for _, group in df.groupby(category_col, dropna=False)
    ]

    if sampled_groups:
        capped_df = pd.concat(sampled_groups, ignore_index=True)
    else:
        # No rows at all: pd.concat([]) would raise, so write a header-only file instead.
        capped_df = df.iloc[0:0]
    capped_df.to_csv(output_csv, index=False)

# Optional step: cap each category; the merged CSV is rewritten in place
if apply_limit:
    limit_records_by_category(
        input_csv=output_file,
        output_csv=output_file,
        category_col=category_column,
        max_count=max_count_per_category,
    )

## Creating Graphs

In [None]:
def plot_from_csv(csv_file: str, category_col: str = "category") -> None:
    """
    Creates a bar chart of the number of records per category in a CSV file.

    :param csv_file: Path to the CSV file.
    :param category_col: Name of the column holding the category labels.
    :raises ValueError: If ``category_col`` is not a column of the CSV file.
    """
    data = pd.read_csv(csv_file)

    if category_col not in data.columns:
        raise ValueError(f"Column '{category_col}' is missing in the provided CSV file.")

    # value_counts() orders categories by descending frequency and ignores missing values.
    category_counts = data[category_col].value_counts()
    # Cast labels to str so numeric category ids are drawn as discrete bars
    # instead of being placed on a continuous numeric axis.
    labels = category_counts.index.astype(str)
    colors = plt.cm.tab10(np.linspace(0, 1, len(category_counts)))

    plt.figure(figsize=(10, 6))
    # zorder keeps the bars (2) drawn on top of the grid lines (1).
    plt.bar(labels, category_counts.values, color=colors, zorder=2)
    plt.title("Category Statistics")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45)
    plt.grid(True, which='both', linestyle='--', linewidth=0.5, zorder=1)
    plt.tight_layout()
    plt.show()

# Draw the per-category bar chart for the merged CSV
plot_from_csv(csv_file=output_file)