In [34]:
import pandas as pd
import json
import random
from pathlib import Path
from collections import defaultdict

In [35]:
# Load the source CSV (path is relative to the notebook's working directory).
csv_path = './data/CAvideos_cc50_202101.csv'
df = pd.read_csv(csv_path)

In [36]:
# Number of rows per value of the `state` column, most frequent first.
state_counts = df['state'].value_counts()
state_counts

state
Quebec                       3247
Alberta                      3209
British Columbia             3196
Yukon                        3159
Ontario                      3156
Prince Edward Island         3141
Northwest Territories        3140
Newfoundland And Labrador    3136
New Brunswick                3131
Nunavut                      3123
Manitoba                     3123
Nova Scotia                  3113
Saskatchewan                 3007
Name: count, dtype: int64

In [37]:
# Number of rows per `category_id`, most frequent first.
category_counts = df['category_id'].value_counts()
category_counts

category_id
24    13451
25     4159
22     4105
23     3773
10     3731
17     2787
1      2060
26     2007
20     1344
28     1155
27      991
19      392
15      369
2       353
43      124
29       74
30        6
Name: count, dtype: int64

In [38]:
# Drop the three least frequent categories (30, 43 and 29 in the counts above),
# then re-display the per-category row counts to confirm they are gone.
excluded_categories = [30, 43, 29]
is_excluded = df['category_id'].isin(excluded_categories)
df = df[~is_excluded]
df["category_id"].value_counts()

category_id
24    13451
25     4159
22     4105
23     3773
10     3731
17     2787
1      2060
26     2007
20     1344
28     1155
27      991
19      392
15      369
2       353
Name: count, dtype: int64

In [39]:
# Bucket every (index, row) pair of the DataFrame under the row's category_id.
# Buckets are created in order of first appearance in `df`.
category_groups = defaultdict(list)
for row_index, record in df.iterrows():
    category_groups[record["category_id"]].append((row_index, record))

# Downsample: keep 1/scale of the items in each category (1/10 by default)
def reduce_group(group, scale=10, seed=None):
    """Return a random subset holding ``len(group) // scale`` items of ``group``.

    Groups with at most ``scale`` items are returned unchanged (the very same
    object, not a copy), so small groups are never reduced.

    Args:
        group: Sequence of items to sample from.
        scale: Reduction factor; must be a positive integer.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the result is reproducible and the
            module-level RNG state is left untouched. When ``None``, the
            module-level ``random.sample`` is used.

    Returns:
        ``group`` itself when ``len(group) <= scale``; otherwise a new list of
        ``len(group) // scale`` items drawn without replacement.

    Raises:
        ValueError: If ``scale`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (scale=0)
    # or an obscure random.sample error (negative scale).
    if scale < 1:
        raise ValueError(f"scale must be a positive integer, got {scale!r}")
    if len(group) <= scale:
        return group
    sampler = random if seed is None else random.Random(seed)
    return sampler.sample(group, len(group) // scale)

# Build one {"nodes": [...], "links": [...]} graph per category.
#
# reduce_group() samples with the `random` module, so seed the global RNG
# right before sampling; otherwise every run of this cell would produce
# a different graph.
SAMPLE_SEED = 42
random.seed(SAMPLE_SEED)

graph_by_category = {}

for category, group in category_groups.items():
    reduced_group = reduce_group(group)

    nodes = []
    id_map = {}  # original DataFrame index -> node id (not read again in this cell)

    # Node ids are "<category>_<n>", with n starting at 1 within each category.
    for position, (idx, row) in enumerate(reduced_group, start=1):
        node_id = f"{category}_{position}"
        id_map[idx] = node_id
        nodes.append({
            "id": node_id,
            "name": row["title"],
            "category_id": category
        })

    # Connect the first node to every other node. The slice is empty when
    # there are fewer than two nodes, so no links are produced in that case.
    links = [
        {"source": nodes[0]["id"], "target": node["id"]}
        for node in nodes[1:]
    ]

    # Keys are stringified category ids.
    graph_by_category[str(category)] = {
        "nodes": nodes,
        "links": links
    }

In [40]:
# Serialize the per-category graphs as indented UTF-8 JSON; non-ASCII
# characters are written as-is rather than as \u escapes.
output_path = Path("../web/public/categories_graph.json")
with output_path.open("w", encoding="utf-8") as out_file:
    json.dump(graph_by_category, out_file, indent=4, ensure_ascii=False)