In [2]:
import csv
import json
from datetime import datetime


def format_date(date_str):
    """Convert a date string into a MongoDB Extended JSON date.

    Accepts ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d"`` input and returns
    ``{"$date": "YYYY-MM-DDTHH:MM:SSZ"}``. Returns ``None`` when the value is
    not a string or matches neither format.

    NOTE(review): the parsed datetime is naive and a literal ``Z`` is appended,
    so the input is presumably already in UTC -- confirm against the data source.
    """
    # csv.DictReader fills missing columns of a short row with None; strptime
    # would raise TypeError on that, so treat any non-string as "no date".
    if not isinstance(date_str, str):
        return None

    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
        try:
            parsed = datetime.strptime(date_str, fmt)
        except ValueError:
            # A format mismatch is the only expected failure; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            continue
        return {"$date": parsed.strftime("%Y-%m-%dT%H:%M:%SZ")}
    return None

# 1. users.json
# Assemble one document per user (keyed by integer user_id while building),
# attach group memberships and shared post ids, then dump the documents.
users = {}

with open('../users.csv', 'r', encoding='utf-8') as users_file:
    for record in csv.DictReader(users_file):
        uid = int(record['user_id'])
        user_doc = {
            "_id": str(uid),
            "username": record['username'],
            "country": record['country'],
            "join_date": format_date(record['join_date']),
            "joined_groups": [],
            "created_posts": [],
        }
        users[uid] = user_doc

with open('../group_joins.csv', 'r', encoding='utf-8') as joins_file:
    for record in csv.DictReader(joins_file):
        member = users.get(int(record['user_id']))
        if member is None:
            # Join row refers to a user that is not in users.csv; skip it.
            continue
        member["joined_groups"].append({
            "group_id": int(record['group_id']),
            "joined_at": format_date(record['join_date']),
        })

with open('../user_shares_posts.csv', 'r', encoding='utf-8') as shares_file:
    for record in csv.DictReader(shares_file):
        author = users.get(int(record['user_id']))
        if author is not None:
            author["created_posts"].append(int(record['post_id']))

with open('users.json', 'w', encoding='utf-8') as out_file:
    user_docs = list(users.values())
    json.dump(user_docs, out_file, indent=4, ensure_ascii=False)


# 2. groups.json
# Build one document per group, attach the ids of the posts listed for it in
# post_in_group.csv, and derive "last_activity" from the latest join date.
groups = {}

with open('../groups.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        group_id = int(row['group_id'])
        groups[group_id] = {
            "_id": str(group_id),
            "name": row['group_name'],
            "posts": [],
            # Denormalized field; starts at the Unix epoch so that any real
            # join date compares greater and replaces it below.
            "last_activity": {"$date": "1970-01-01T00:00:00Z"}
        }

# Fallback: guarantee that a group with id 0 exists even when groups.csv
# does not define one.
if 0 not in groups:
    groups[0] = {
        "_id": "0",
        "name": "Main Page (Default)",
        "posts": [],
        "last_activity": {"$date": "2023-01-01T00:00:00Z"}
    }

# Post ids already attached to each group. A set makes the duplicate check
# O(1) instead of rescanning the group's "posts" list for every CSV row.
seen_post_ids = {gid: set() for gid in groups}

try:
    with open('../post_in_group.csv', 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Ids may be wrapped in square brackets in this file; drop them.
            clean_group_id = row['group_id'].strip("[]")
            clean_post_id = row['post_id'].strip("[]")

            if clean_group_id and clean_post_id:
                gid = int(clean_group_id)
                pid = int(clean_post_id)
                if gid in groups and pid not in seen_post_ids[gid]:
                    seen_post_ids[gid].add(pid)
                    groups[gid]["posts"].append(pid)
except FileNotFoundError:
    # Best effort: without post_in_group.csv every group simply keeps an
    # empty "posts" list.
    pass

with open('../group_joins.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        group_id = int(row['group_id'])
        if group_id in groups:
            join_date_iso = format_date(row['join_date'])
            # Both sides are fixed-width "YYYY-MM-DDTHH:MM:SSZ" strings, so
            # plain string comparison orders them chronologically.
            if join_date_iso and join_date_iso["$date"] > groups[group_id]["last_activity"]["$date"]:
                groups[group_id]["last_activity"] = join_date_iso

with open('groups.json', 'w', encoding='utf-8') as f:
    json.dump(list(groups.values()), f, indent=4, ensure_ascii=False)


# 3. posts.json
# Build one document per post, fill in its creation date from the share
# records, and export only the posts that ended up with a date.

posts = {}

with open('../posts.csv', 'r', encoding='utf-8') as posts_file:
    for record in csv.DictReader(posts_file):
        pid = int(record['post_id'])

        # activity_id may be empty; non-empty values go through float() first
        # so that values written like "12.0" still parse.
        raw_activity = record['activity_id']
        if raw_activity:
            activity = int(float(raw_activity))
        else:
            activity = None

        posts[pid] = {
            "_id": str(pid),
            "activity": activity,
            "created_at": None,
        }

with open('../user_shares_posts.csv', 'r', encoding='utf-8') as shares_file:
    for record in csv.DictReader(shares_file):
        target = posts.get(int(record['post_id']))
        if target is not None:
            target["created_at"] = format_date(record['created_at'])

# Keep only posts whose creation date was found and parsed.
valid_posts = []
for post_doc in posts.values():
    if post_doc["created_at"] is not None:
        valid_posts.append(post_doc)

with open('posts.json', 'w', encoding='utf-8') as out_file:
    json.dump(valid_posts, out_file, indent=4, ensure_ascii=False)


# 4. activities.json
# Export one document per activity row; known country names are replaced by
# the short codes in country_map, anything else is passed through unchanged.

country_map = {
    "Czech Republic": "cz", "Germany": "de",
    "Poland": "pl", "United States": "us"
}

activities = []

with open('../activities.csv', 'r', encoding='utf-8') as activities_file:
    for record in csv.DictReader(activities_file):
        country = record['country']
        activity_doc = {
            "_id": str(record['activity_id']),
            "region": record['region'],
            "distance_m": float(record['distance_m']),
            "country": country_map.get(country, country),
        }
        activities.append(activity_doc)

with open('activities.json', 'w', encoding='utf-8') as out_file:
    json.dump(activities, out_file, indent=4, ensure_ascii=False)

In [None]:
import csv
import json
from datetime import datetime


def get_oid_string(int_id):
    """Render an integer id as a 24-character hexadecimal ObjectId string.

    The id is formatted as lowercase hex and zero-padded on the left, e.g.
    ``1`` -> ``"000000000000000000000001"``.

    Raises:
        ValueError: if ``int_id`` is negative or needs more than 24 hex
            digits; either case would otherwise silently produce a string
            that is not exactly 24 hex characters.
    """
    oid = f"{int_id:024x}"
    # A negative id yields a leading "-" (the sign counts toward the width),
    # and an id >= 16**24 overflows the 24-character width.
    if len(oid) != 24 or oid.startswith("-"):
        raise ValueError(
            f"id {int_id!r} cannot be represented as a 24-hex-digit ObjectId"
        )
    return oid

def format_date(date_str):
    """
    Converts a date string into the MongoDB Extended JSON Date format: {"$date": "..."}

    Supported inputs are "%Y-%m-%d %H:%M:%S" and "%Y-%m-%d". Returns None when
    the value is not a string or matches neither format.

    NOTE(review): the parsed datetime is naive and a literal "Z" is appended,
    so the input is presumably already in UTC -- confirm against the data source.
    """
    # csv.DictReader fills missing columns of a short row with None; strptime
    # raises TypeError (not ValueError) on that, which would abort the export.
    if not isinstance(date_str, str):
        return None

    # Try the supported formats in order, most specific first
    date_formats = ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d")
    for fmt in date_formats:
        try:
            dt = datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        # Literal Z suffix marks the timestamp as UTC in the output
        return {"$date": dt.strftime("%Y-%m-%dT%H:%M:%SZ")}
    return None

# 1. users.json
# Assemble one document per user (keyed by integer user_id while building),
# attach group memberships and shared post references, then dump them.
# All ids are emitted in Extended JSON {"$oid": ...} form.
users = {}

with open('users.csv', 'r', encoding='utf-8') as users_file:
    for record in csv.DictReader(users_file):
        uid = int(record['user_id'])
        user_doc = {
            # Primary key as $oid
            "_id": {"$oid": get_oid_string(uid)},
            "username": record['username'],
            "country": record['country'],
            "join_date": format_date(record['join_date']),
            "joined_groups": [],
            "created_posts": [],
        }
        users[uid] = user_doc

with open('group_joins.csv', 'r', encoding='utf-8') as joins_file:
    for record in csv.DictReader(joins_file):
        uid = int(record['user_id'])
        gid = int(record['group_id'])
        member = users.get(uid)
        if member is None:
            # Join row refers to a user that is not in users.csv; skip it.
            continue
        member["joined_groups"].append({
            # Reference to the group document as $oid
            "group_id": {"$oid": get_oid_string(gid)},
            "joined_at": format_date(record['join_date']),
        })

with open('user_shares_posts.csv', 'r', encoding='utf-8') as shares_file:
    for record in csv.DictReader(shares_file):
        uid = int(record['user_id'])
        pid = int(record['post_id'])
        author = users.get(uid)
        if author is not None:
            # Reference to the post document as $oid
            author["created_posts"].append({"$oid": get_oid_string(pid)})

with open('users.json', 'w', encoding='utf-8') as out_file:
    user_docs = list(users.values())
    json.dump(user_docs, out_file, indent=4, ensure_ascii=False)

# ----------------------------------------------------------------------------------
# 2. groups.json
# ----------------------------------------------------------------------------------
# Build one document per group, attach $oid references to the posts listed for
# it in post_in_group.csv, and derive "last_activity" from the latest join date.
groups = {}

with open('groups.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        group_id = int(row['group_id'])
        groups[group_id] = {
            # Primary key as $oid
            "_id": {"$oid": get_oid_string(group_id)},
            "name": row['group_name'],
            "posts": [],
            # Starts at the Unix epoch so that any real join date compares
            # greater and replaces it below.
            "last_activity": {"$date": "1970-01-01T00:00:00Z"}
        }

# Fallback (default group): guarantee that a group with id 0 exists even when
# groups.csv does not define one.
if 0 not in groups:
    groups[0] = {
        # All-zero OID, the same string get_oid_string(0) produces
        "_id": {"$oid": "000000000000000000000000"},
        "name": "Main Page (Default)",
        "posts": [],
        "last_activity": {"$date": "2023-01-01T00:00:00Z"}
    }

# Integer post ids already attached to each group. A set makes the duplicate
# check O(1) instead of comparing a dict against every element of the group's
# "posts" list for every CSV row.
seen_post_ids = {gid: set() for gid in groups}

try:
    with open('post_in_group.csv', 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Ids may be wrapped in square brackets in this file; drop them.
            clean_group_id = row['group_id'].strip("[]")
            clean_post_id = row['post_id'].strip("[]")

            if clean_group_id and clean_post_id:
                gid = int(clean_group_id)
                pid = int(clean_post_id)
                if gid in groups and pid not in seen_post_ids[gid]:
                    seen_post_ids[gid].add(pid)
                    # Reference to the post document as $oid
                    groups[gid]["posts"].append({"$oid": get_oid_string(pid)})
except FileNotFoundError:
    # Best effort: without post_in_group.csv every group simply keeps an
    # empty "posts" list.
    pass

# Update last_activity based on group joins
with open('group_joins.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        group_id = int(row['group_id'])
        if group_id in groups:
            join_date_iso = format_date(row['join_date'])
            # Both sides are fixed-width "YYYY-MM-DDTHH:MM:SSZ" strings, so
            # plain string comparison orders them chronologically.
            if join_date_iso and join_date_iso["$date"] > groups[group_id]["last_activity"]["$date"]:
                groups[group_id]["last_activity"] = join_date_iso

with open('groups.json', 'w', encoding='utf-8') as f:
    json.dump(list(groups.values()), f, indent=4, ensure_ascii=False)

# ----------------------------------------------------------------------------------
# 3. posts.json
# ----------------------------------------------------------------------------------
# Build one document per post with $oid keys, fill in its creation date from
# the share records, and export only the posts that ended up with a date.
posts = {}

with open('posts.csv', 'r', encoding='utf-8') as posts_file:
    for record in csv.DictReader(posts_file):
        pid = int(record['post_id'])

        # activity_id may be empty; non-empty values go through float() first
        # so that values written like "12.0" still parse before OID conversion.
        raw_activity = record['activity_id']
        if raw_activity:
            activity_ref = {"$oid": get_oid_string(int(float(raw_activity)))}
        else:
            activity_ref = None

        posts[pid] = {
            # Primary key as $oid
            "_id": {"$oid": get_oid_string(pid)},
            # Reference to the activity document as $oid, or None
            "activity": activity_ref,
            "created_at": None,
        }

with open('user_shares_posts.csv', 'r', encoding='utf-8') as shares_file:
    for record in csv.DictReader(shares_file):
        target = posts.get(int(record['post_id']))
        if target is not None:
            # format_date already returns the {"$date": ...} structure (or None)
            target["created_at"] = format_date(record['created_at'])

# Keep only posts whose creation date was found and parsed.
valid_posts = []
for post_doc in posts.values():
    if post_doc["created_at"] is not None:
        valid_posts.append(post_doc)

with open('posts.json', 'w', encoding='utf-8') as out_file:
    json.dump(valid_posts, out_file, indent=4, ensure_ascii=False)

# ----------------------------------------------------------------------------------
# 4. activities.json
# ----------------------------------------------------------------------------------
# Export one document per activity row with an $oid key; known country names
# are replaced by the short codes in country_map, anything else passes through.
country_map = {
    "Czech Republic": "cz", "Germany": "de",
    "Poland": "pl", "United States": "us"
}

activities = []

with open('activities.csv', 'r', encoding='utf-8') as activities_file:
    for record in csv.DictReader(activities_file):
        aid = int(record['activity_id'])
        country = record['country']
        activity_doc = {
            # Primary key as $oid
            "_id": {"$oid": get_oid_string(aid)},
            "region": record['region'],
            "distance_m": float(record['distance_m']),
            "country": country_map.get(country, country),
        }
        activities.append(activity_doc)

with open('activities.json', 'w', encoding='utf-8') as out_file:
    json.dump(activities, out_file, indent=4, ensure_ascii=False)

FileNotFoundError: [Errno 2] No such file or directory: '../users.csv'