In [1]:
import pandas as pd
import json
from pathlib import Path

In [2]:
# Load the cleaned dataset; every later cell reads from `df`.
df = pd.read_csv(filepath_or_buffer="data/CAvideos_cc50_202101_cleaned.csv")

In [3]:
# Full category name -> three-letter code used in the exported records.
# "Others" is the bucket name produced when small categories are merged.
category_abbr = {
    "Entertainment": "ETM",
    "Music": "MSC",
    "Sports": "SPT",
    "Comedy": "CMD",
    "News & Politics": "NWP",
    "Film & Animation": "FLM",
    "Howto & Style": "HTS",
    "People & Blogs": "PBG",
    "Gaming": "GMG",
    "Education": "EDU",
    "Science & Technology": "SCT",
    "Travel & Events": "TVE",
    "Pets & Animals": "PTA",
    "Autos & Vehicles": "AUV",
    "Shows": "SHW",
    "Nonprofits & Activism": "NPA",
    "Movies": "MOV",
    "Others": "OTH",
}

# Accumulator filled by the following cells and finally merged into
# requirements.json; one key per exported requirement.
requirements_data = dict()

In [None]:
# Percentage of rows per category, rounded to two decimals.
category_counts = (df["category"].value_counts(normalize=True) * 100).round(2)

# Categories below 2% are merged into a single "Others" entry
# (presumably to keep the resulting chart readable).
others_mask = category_counts.lt(2)
if others_mask.any():
    others_sum = category_counts.loc[others_mask].sum()
    category_counts = category_counts.loc[~others_mask]
    category_counts["Others"] = round(others_sum, 2)

# Reshape into {"name", "value"} rows; names without a known abbreviation
# are labelled "UNK".
pie_chart_data = category_counts.rename_axis("name").reset_index(name="value")
pie_chart_data = pie_chart_data.assign(
    name=pie_chart_data["name"].map(category_abbr).fillna("UNK")
)

requirements_data["topCategoriesByTrend"] = pie_chart_data.to_dict(orient="records")


In [None]:
# Palette assigned to ranking entries in order, wrapping around if needed.
colors = ["#B89DFB", "#758bcf", "#33C2EA", "#FFC182", "#87db72"]

# Total likes accumulated by each category.
likes_by_category = df.groupby("category")["likes"].sum().round(2)


def _likes_ranking(ascending):
    """Return the first five categories of ``likes_by_category`` in sorted order.

    Parameters
    ----------
    ascending : bool
        ``False`` ranks the most liked categories first, ``True`` the least
        liked ones.

    Returns
    -------
    pandas.DataFrame
        Columns ``key`` (category abbreviation, ``"UNK"`` when unmapped),
        ``value`` (summed likes) and ``color`` (palette entry by position).
    """
    ranking = (
        likes_by_category.sort_values(ascending=ascending)
        .iloc[:5]
        .rename_axis("key")
        .reset_index(name="value")
    )
    ranking["key"] = ranking["key"].map(category_abbr).fillna("UNK")
    ranking["color"] = [colors[pos % len(colors)] for pos in range(len(ranking))]
    return ranking


most_liked = _likes_ranking(ascending=False)
least_liked = _likes_ranking(ascending=True)

requirements_data["mostLikedCategories"] = most_liked.to_dict(orient="records")
requirements_data["leastLikedCategories"] = least_liked.to_dict(orient="records")

In [None]:
# Only rows with at least one dislike take part in this ranking.
df_ratio = df.loc[df["dislikes"].gt(0)].copy()

# Mean of the `like_ratio` column per category, rounded to two decimals
# (the column is expected to be present in the loaded CSV).
ratio_by_category = df_ratio.groupby("category")["like_ratio"].mean().round(2)

# Five categories with the highest mean ratio, as {"key", "value"} rows.
best_ratio = (
    ratio_by_category.sort_values(ascending=False)
    .iloc[:5]
    .rename_axis("key")
    .reset_index(name="value")
)
best_ratio["key"] = best_ratio["key"].map(category_abbr).fillna("UNK")
# Every entry of this ranking shares the same colour.
best_ratio["color"] = ["#33C2EA"] * len(best_ratio)

requirements_data["bestLikeDislikeRatio"] = best_ratio.to_dict(orient="records")


In [None]:
# Rows without comments are excluded so the division below never hits zero.
df_views = df.loc[df["comment_count"].gt(0)].copy()
df_views["views_comments_ratio"] = df_views["views"].div(df_views["comment_count"])

# Mean views-per-comment for each category, rounded to two decimals.
ratio_views_comments = (
    df_views.groupby("category")["views_comments_ratio"].mean().round(2)
)

# Five categories with the highest mean ratio, as {"key", "value"} rows.
best_views_comments = (
    ratio_views_comments.sort_values(ascending=False)
    .iloc[:5]
    .rename_axis("key")
    .reset_index(name="value")
)
best_views_comments["key"] = (
    best_views_comments["key"].map(category_abbr).fillna("UNK")
)
# Every entry of this ranking shares the same colour.
best_views_comments["color"] = ["#33C2EA"] * len(best_views_comments)

requirements_data["bestViewsCommentsRatio"] = best_views_comments.to_dict(orient="records")

In [None]:
# Number of rows per 5-day window of `trending_date`.
#
# The column is parsed to datetimes (unparseable values become NaT because of
# errors="coerce") and promoted to the index of `df`. The guard makes the cell
# safe to re-run: once "trending_date" is the index it is no longer a column,
# so repeating the conversion unconditionally would raise KeyError.
if df.index.name != "trending_date":
    df = df.assign(
        trending_date=pd.to_datetime(df["trending_date"], errors="coerce")
    ).set_index("trending_date")

# Grouper without `key` bins on the (datetime) index.
grouped = df.groupby(pd.Grouper(freq="5D")).size().reset_index(name="value")
grouped["date"] = grouped["trending_date"].dt.strftime("%Y-%m-%d")

# Plain str/float values so the records serialize to JSON without numpy types.
trend_volume = [
    {"date": str(window_start), "value": float(row_count)}
    for window_start, row_count in zip(grouped["date"], grouped["value"])
]

requirements_data["trendVolumeOverTime"] = trend_volume


In [None]:
# Rows per channel; value_counts returns them sorted from most to least frequent.
channel_counts = df["channel_title"].value_counts()

# First five entries of the sorted counts: the most frequent channels.
most_common_channels = (
    channel_counts.iloc[:5].rename_axis("key").reset_index(name="value")
)
most_common_channels["color"] = [
    colors[pos % len(colors)] for pos in range(len(most_common_channels))
]

# Last five entries with a non-zero count: the least frequent channels.
least_common_channels = (
    channel_counts.loc[channel_counts.gt(0)]
    .iloc[-5:]
    .rename_axis("key")
    .reset_index(name="value")
)
least_common_channels["color"] = [
    colors[pos % len(colors)] for pos in range(len(least_common_channels))
]

requirements_data["frequentChannelsInTrend"] = most_common_channels.to_dict(orient="records")
requirements_data["rareChannelsInTrend"] = least_common_channels.to_dict(orient="records")

In [10]:
# Merge this notebook's results into ../web/public/requirements.json.
output_dir = Path("../web/public")
output_dir.mkdir(parents=True, exist_ok=True)
json_path = output_dir / "requirements.json"

# Start from the current file content so keys already stored there that this
# notebook does not produce are preserved.
if json_path.exists():
    with open(json_path, "r", encoding="utf-8") as f:
        existing_data = json.load(f)
else:
    existing_data = {}

existing_data.update(requirements_data)

# Serialize BEFORE opening the file: mode "w" truncates immediately, so a
# serialization error raised while the file is open would wipe the previously
# stored keys. If json.dumps raises, the file on disk is left untouched.
serialized = json.dumps(existing_data, indent=2)
with open(json_path, "w", encoding="utf-8") as f:
    f.write(serialized)

print("Archivo requirements.json actualizado con todos los requerimientos.")


Archivo requirements.json actualizado con todos los requerimientos.


In [11]:
# 7: geographic summary, one record per `state`.
# Counters are summed; for lat/lon the group's first value is taken
# (pandas' "first" aggregation skips nulls).
geo_aggregations = {
    'views': 'sum',
    'likes': 'sum',
    'dislikes': 'sum',
    'lat': 'first',
    'lon': 'first',
}
geo_df = df.groupby('state').agg(geo_aggregations).reset_index()

geo_data = geo_df.to_dict(orient='records')

# Written to its own file, separate from requirements.json.
output_path = Path("../web/public/geo_data.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as f:
    json.dump(geo_data, f, indent=2)