Intent, Division and Relevance Categorization

In [None]:
import pandas as pd
import re
from tqdm import tqdm

# =========================
# CONFIG
# =========================
# Raw strings keep the Windows backslashes literal.
INPUT_CSV = r"C:\Users\User\Desktop\Datathon\spam_final_scaled.csv"
OUTPUT_ALL = r"C:\Users\User\Desktop\Datathon\master_enriched.csv"
# Non-spam subset.
OUTPUT_CLEAN = r"C:\Users\User\Desktop\Datathon\analysis_ready.csv"

# Registers Series.progress_apply, which the tagging steps below rely on.
tqdm.pandas()

# =========================
# LOAD
# =========================
df = pd.read_csv(INPUT_CSV)

# Safety: stop early if a column the tagging steps read is absent.
# (spam_prob_A is expected to be present already; it is not recomputed here.)
_REQUIRED_COLUMN_ERRORS = (
    ("textOriginal", "Missing column: textOriginal"),
    ("final_is_spam", "Missing column: final_is_spam (expected 0/1)"),
)
for _required, _message in _REQUIRED_COLUMN_ERRORS:
    if _required not in df.columns:
        raise ValueError(_message)

# =========================
# KEYWORDS
# =========================
# Brands (L'Oréal + popular beauty)
# All entries are lower-case ASCII. compute_is_relevant tests each one as a
# plain substring of the lower-cased comment, which has two consequences:
#   * short names also fire inside unrelated words, e.g. "nars" in "seminars"
#     or "tarte" in "started";
#   * spellings with punctuation or accents ("l'oreal", "kiehl's", "lancôme")
#     do not contain the ASCII form and will not match.
# NOTE(review): presumably textOriginal was already normalised upstream so the
# second point does not bite - confirm against the cleaning step.
BRAND_TERMS = {
    # L'Oréal Group
    "loreal", "maybelline", "nyx", "lancome", "garnier", "kiehls", "shu uemura",
    "yves saint laurent", "ysl beauty", "giorgio armani beauty", "valentino beauty",
    "biotherm", "vichy", "cerave", "la roche posay", "it cosmetics", "urban decay",
    # Others
    "estee lauder", "clinique", "mac cosmetics", "m.a.c", "bobbi brown",
    "nars", "fenty", "rare beauty", "huda beauty", "too faced", "tarte",
    "glossier", "charlotte tilbury", "pat mcgrath", "hourglass",
    "dior", "chanel", "gucci beauty", "givenchy beauty", "shiseido"
}

# Topics: skincare, makeup, fragrance, campaign
# Together with BRAND_TERMS this is the vocabulary that makes a comment
# "relevant" in compute_is_relevant (substring match on lower-cased text).
# NOTE(review): there are no haircare terms here although DIVISION_TERMS has a
# haircare division, so a comment can be tagged with that division and still
# have is_relevant == 0 - confirm this is intended.
TOPIC_TERMS = {
    # Skincare
    "serum", "cleanser", "moisturizer", "sunscreen", "spf", "mask", "toner",
    "essence", "eye cream", "lotion", "hydrating", "anti-aging", "retinol",
    "hyaluronic", "collagen", "brightening", "exfoliant", "scrub", "acne",
    "blemish", "whitening", "skin barrier", "pore", "oil control",
    "facial wash", "night cream", "day cream",
    # Makeup
    "foundation", "concealer", "primer", "powder", "blush", "bronzer",
    "highlighter", "eyeliner", "eyeshadow", "palette", "lipstick", "lip gloss",
    "lip liner", "lip balm", "mascara", "brow pencil", "brow gel",
    "makeup remover", "setting spray", "compact", "cushion foundation",
    "bb cream", "cc cream", "contour",
    # Fragrance
    "perfume", "fragrance", "cologne", "eau de parfum", "eau de toilette",
    "parfum", "body mist", "scent",
    # Campaign/launch
    "new launch", "limited edition", "collection", "promo", "campaign"
}

# Exclusion: generic compliments (to reduce noise)
# compute_is_relevant returns 0 as soon as any of these substrings is found,
# before it looks for brand/topic keywords, so a comment such as
# "this serum is amazing" is marked not relevant.
# Several entries ("amazing", "awesome", "so good") also appear in the
# positive-feedback list of INTENT_TERMS.
EXCLUDE_TERMS = {
    "nice", "wow", "cool", "beautiful", "love it", "awesome", "great", "amazing",
    "so good", "so nice", "so cool", "so beautiful"
}

# Divisions → keyword lists
# assign_division counts, per division, how many of its keywords occur as
# substrings of the lower-cased comment and picks the highest count; on a tie
# the division listed first here wins, and zero hits everywhere gives "Other".
# Because matching is by substring, some phrases score in two divisions:
# "hair mask" also contains "mask", "hair serum" contains "serum", and
# "highlighter" contains "highlight". Within one division, "cushion foundation"
# also counts "foundation", so that phrase scores two hits.
DIVISION_TERMS = {
    "Skincare and skin health": [
        "serum", "cleanser", "moisturizer", "sunscreen", "spf", "mask",
        "toner", "essence", "eye cream", "lotion", "hydrating",
        "anti-aging", "retinol", "hyaluronic", "collagen", "brightening",
        "acne", "blemish", "whitening", "skin barrier", "pore", "oil control",
        "exfoliant", "scrub", "facial wash", "night cream", "day cream"
    ],
    "Makeup and cosmetics": [
        "foundation", "concealer", "primer", "powder", "blush", "bronzer",
        "highlighter", "eyeliner", "eyeshadow", "palette", "lipstick",
        "lip gloss", "lip liner", "lip balm", "mascara", "brow pencil",
        "brow gel", "makeup remover", "setting spray", "compact",
        "cushion foundation", "bb cream", "cc cream", "contour"
    ],
    "Haircare, hair styling, and hair color": [
        "shampoo", "conditioner", "hair mask", "hair serum", "hair oil",
        "hair spray", "hair gel", "pomade", "hair wax", "leave-in",
        "hair treatment", "anti-frizz", "hair loss", "dandruff",
        "scalp care", "split ends", "hair growth", "hair dye", "hair color",
        "bleach", "highlight", "balayage", "ombre", "keratin"
    ],
    "Fragrance and perfumes": [
        "perfume", "fragrance", "cologne", "eau de parfum", "eau de toilette",
        "parfum", "body mist", "scent", "smell", "aroma",
        "fruity", "floral", "woody", "musk", "amber", "citrus",
        "fresh scent", "long lasting", "signature scent", "unisex fragrance"
    ],
}

# Customer intents
# Maps each intent label to the phrases that signal it. The labels are used
# verbatim as output values of assign_intent, which matches phrases as
# substrings of the lower-cased comment and checks Question first, then
# Negative, then Positive.
# Short entries such as "buy", "cost", "store", "return" or "fake" can
# therefore also fire inside longer words.
INTENT_TERMS = {
    "Question about a product or where to buy": [
        "where can i buy", "is it available", "how much", "price", "cost",
        "available in", "shipping", "delivery", "buy", "purchase", "restock",
        "out of stock", "when will it be", "launch date", "release date",
        "availability", "store", "link please", "any recommendation", "how to use",
        "can i use", "is this good for", "what shade", "which color", "which shade"
    ],
    "Positive feedback about the brand or a product": [
        "love this", "amazing", "great product", "works well", "highly recommend",
        "best ever", "so good", "awesome", "excellent", "worth it",
        "my favorite", "game changer", "must have", "holy grail", "good quality",
        "long lasting", "smells great", "looks good", "perfect match"
    ],
    # "doesn't last" is listed with both the typographic and the ASCII
    # apostrophe so either spelling matches. "not worth it" contains the
    # positive phrase "worth it"; the Negative-before-Positive order in
    # assign_intent resolves that.
    "Negative feedback or complaint": [
        "waste of money", "didn't work", "did not work", "bad quality",
        "disappointed", "too expensive", "allergic reaction", "not worth it",
        "worst product", "doesn’t last", "doesn't last", "poor service",
        "shipping issue", "broke me out", "damaged", "smells bad", "fake",
        "hate this", "never again", "return", "refund", "irritation"
    ]
}

# =========================
# HELPERS
# =========================
def alpha_token_count(s: str) -> int:
    """Return how many maximal runs of ASCII letters ``s`` contains.

    Anything that is not a ``str`` (for example a NaN cell) counts as zero.
    Digits, punctuation and non-ASCII letters act as separators, so
    ``"spf50"`` contributes one token and ``"oréal"`` contributes two.
    """
    if isinstance(s, str):
        return sum(1 for _ in re.finditer(r"[a-zA-Z]+", s))
    return 0

def contains_any(text_lower: str, terms) -> bool:
    """Tell whether at least one of ``terms`` occurs inside ``text_lower``.

    This is a plain substring test with no word boundaries, so ``"pore"`` is
    found in ``"singapore"``. No case folding happens here: the match is
    case-insensitive only if the caller passes lower-cased text and
    lower-case terms.
    """
    for candidate in terms:
        if candidate in text_lower:
            return True
    return False

def keyword_hits(text_lower: str, terms_list) -> int:
    """Return the number of entries of ``terms_list`` found in ``text_lower``.

    Each entry is tested once as a substring; repeated occurrences of the
    same term in the text still count as a single hit, while overlapping
    entries (``"mask"`` and ``"hair mask"``) each count.
    """
    matched_terms = [term for term in terms_list if term in text_lower]
    return len(matched_terms)

# =========================
# 1) Relevance (binary)
# =========================
def compute_is_relevant(t: str, min_alpha_tokens: int = 3) -> int:
    """Flag a comment as relevant (1) or not relevant (0).

    A comment is relevant when all of the following hold:

    * it is a string with at least ``min_alpha_tokens`` alphabetic tokens,
    * it contains none of the generic compliments in ``EXCLUDE_TERMS``,
    * it contains at least one entry of ``BRAND_TERMS`` or ``TOPIC_TERMS``.

    All keyword tests are substring matches on the lower-cased text.

    Args:
        t: Raw comment text; non-strings (e.g. NaN) yield 0.
        min_alpha_tokens: Minimum number of alphabetic tokens required.

    Returns:
        1 if the comment is relevant, otherwise 0.
    """
    if not isinstance(t, str):
        return 0
    tl = t.lower()

    # Require enough alphabetic tokens
    if alpha_token_count(tl) < min_alpha_tokens:
        return 0

    # Exclude generic compliments (reduce false positives).
    # NOTE(review): this runs before the keyword test, so its only effect is
    # to reject comments that DO mention a brand/topic alongside a compliment
    # ("this serum is amazing"); confirm that is the intended behaviour.
    if contains_any(tl, EXCLUDE_TERMS):
        return 0

    # At least one brand/topic keyword. The two sets are probed separately so
    # that no union set has to be rebuilt for every row; the result is the
    # same as testing against BRAND_TERMS | TOPIC_TERMS.
    return int(contains_any(tl, BRAND_TERMS) or contains_any(tl, TOPIC_TERMS))

# Score every comment; progress_apply shows a tqdm progress bar while it runs.
relevance_flags = df["textOriginal"].progress_apply(compute_is_relevant)
df["is_relevant"] = relevance_flags

# =========================
# 2) Division tagging
#    Pick the division with the most hits; fallback "Other"
# =========================
def assign_division(t: str) -> str:
    """Return the division whose keyword list matches the comment most often.

    Every division in ``DIVISION_TERMS`` is scored with ``keyword_hits`` on
    the lower-cased text. The highest score wins; when several divisions tie,
    the one defined first in ``DIVISION_TERMS`` is kept. ``"Other"`` is
    returned for non-string input and when no keyword matches at all.
    """
    if not isinstance(t, str):
        return "Other"

    lowered = t.lower()
    scores = {
        name: keyword_hits(lowered, keywords)
        for name, keywords in DIVISION_TERMS.items()
    }
    # max() keeps the first maximal key, which preserves definition order on ties.
    leader = max(scores, key=scores.get, default="Other")
    return leader if scores.get(leader, 0) > 0 else "Other"

division_labels = df["textOriginal"].progress_apply(assign_division)
df["division"] = division_labels

# =========================
# 3) Intent tagging
#    Priority: Question > Negative > Positive > Other
# =========================
# Phrase lists are converted to sets once, up front, for the intent checks.
QUESTION_TERMS, POS_TERMS, NEG_TERMS = (
    set(INTENT_TERMS[intent_label])
    for intent_label in (
        "Question about a product or where to buy",
        "Positive feedback about the brand or a product",
        "Negative feedback or complaint",
    )
)

def assign_intent(t: str) -> str:
    """Classify a comment's intent with first-match-wins keyword rules.

    Order of precedence: a question (any ``?`` character or a phrase from
    ``QUESTION_TERMS``), then a complaint (``NEG_TERMS``), then praise
    (``POS_TERMS``). Non-string input and comments matching nothing are
    labelled ``"Other"``. Matching is by substring on the lower-cased text.
    """
    if not isinstance(t, str):
        return "Other"

    lowered = t.lower()

    # A literal question mark is treated as a question signal on its own.
    if "?" in lowered:
        return "Question about a product or where to buy"

    rules = (
        ("Question about a product or where to buy", QUESTION_TERMS),
        ("Negative feedback or complaint", NEG_TERMS),
        ("Positive feedback about the brand or a product", POS_TERMS),
    )
    for label, vocabulary in rules:
        if contains_any(lowered, vocabulary):
            return label
    return "Other"

intent_labels = df["textOriginal"].progress_apply(assign_intent)
df["intent"] = intent_labels

# =========================
# Save outputs
# =========================
# Master file: every row, spam included, for reference.
df.to_csv(OUTPUT_ALL, index=False, encoding="utf-8-sig")

# Analysis-ready file: only the rows flagged as non-spam.
not_spam = df["final_is_spam"].eq(0)
df_clean = df.loc[not_spam].copy()
df_clean.to_csv(OUTPUT_CLEAN, index=False, encoding="utf-8-sig")

print("✅ All done!")
print(f"Saved master: {OUTPUT_ALL}")
print(f"Saved analysis-ready (non-spam): {OUTPUT_CLEAN}")

# Quick sanity prints
tag_preview = df[["is_relevant", "division", "intent"]].head()
print("\nRelevance head:")
print(tag_preview)

relevance_rate = df["is_relevant"].mean().round(3)
division_counts = df["division"].value_counts()
intent_counts = df["intent"].value_counts()

print("\nCounts:")
print("Relevance rate:", relevance_rate)
print("Division breakdown (top 5):")
print(division_counts.head(5))
print("Intent breakdown:")
print(intent_counts)


100%|███████████████████████████████████████████████████████████████████████| 623223/623223 [00:19<00:00, 32757.60it/s]
100%|███████████████████████████████████████████████████████████████████████| 623223/623223 [00:12<00:00, 48329.14it/s]
100%|███████████████████████████████████████████████████████████████████████| 623223/623223 [00:12<00:00, 51238.36it/s]


✅ All done!
Saved master: C:\Users\User\Desktop\Datathon\master_enriched.csv
Saved analysis-ready (non-spam): C:\Users\User\Desktop\Datathon\analysis_ready.csv

Relevance head:
   is_relevant division intent
0            0    Other  Other
1            0    Other  Other
2            0    Other  Other
3            0    Other  Other
4            0    Other  Other

Counts:
Relevance rate: 0.035
Division breakdown (top 5):
division
Other                                     595297
Makeup and cosmetics                       15507
Skincare and skin health                    6216
Haircare, hair styling, and hair color      4041
Fragrance and perfumes                      2162
Name: count, dtype: int64
Intent breakdown:
intent
Other                                             597968
Positive feedback about the brand or a product     15121
Question about a product or where to buy            5321
Negative feedback or complaint                      4813
Name: count, dtype: int64


Sentiment Analysis

In [None]:
# =========================
# Sentiment for ALL rows (spam + non-spam)
# =========================
import pandas as pd
from tqdm import tqdm
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# ---- CONFIG ----
INPUT_CSV = r"C:\Users\User\Desktop\Datathon\master_enriched.csv"
# Keeps spam rows, now WITH sentiment.
OUTPUT_ALL = r"C:\Users\User\Desktop\Datathon\master_enriched_sent.csv"
# Non-spam only.
OUTPUT_CLEAN = r"C:\Users\User\Desktop\Datathon\analysis_ready_sent.csv"

# Registers Series.progress_apply.
tqdm.pandas()

# ---- Load ----
df = pd.read_csv(INPUT_CSV)

# Safety checks: report every absent required column in one error.
req_cols = ["textOriginal", "final_is_spam"]
missing = list(filter(lambda name: name not in df.columns, req_cols))
if missing:
    raise ValueError(f"Missing required columns: {missing}")


# ---- Init VADER ----
def _build_analyzer():
    """Create the VADER analyzer, downloading its lexicon on first use."""
    try:
        return SentimentIntensityAnalyzer()
    except LookupError:
        # The lexicon is not installed locally yet; fetch it and retry once.
        nltk.download("vader_lexicon")
        return SentimentIntensityAnalyzer()


sia = _build_analyzer()

# ---- Sentiment function ----
def sentiment_label_and_score(text):
    """Score ``text`` with the module-level VADER analyzer ``sia``.

    Returns a ``(label, compound)`` tuple. ``compound`` is the analyzer's
    compound score; ``label`` is ``'positive'`` for compound >= 0.05,
    ``'negative'`` for compound <= -0.05 and ``'neutral'`` in between.
    Non-string or blank input yields ``(None, None)``.
    """
    if not (isinstance(text, str) and text.strip()):
        return (None, None)

    compound = sia.polarity_scores(text)["compound"]  # -1..1
    if compound >= 0.05:
        label = "positive"
    elif compound <= -0.05:
        label = "negative"
    else:
        label = "neutral"
    return (label, compound)

# ---- Compute for ALL rows (including spam) ----
# Run the scorer once per row to get a Series of (label, score) tuples, then
# expand all tuples in a single DataFrame construction. Wrapping each result
# in its own pd.Series (the previous approach) allocates one Series object per
# row, which adds avoidable per-row overhead on a large frame.
_sentiment_pairs = df["textOriginal"].progress_apply(sentiment_label_and_score)
df[["sentiment", "sentiment_score"]] = pd.DataFrame(
    _sentiment_pairs.tolist(),
    index=df.index,  # keep row alignment with df
    columns=["sentiment", "sentiment_score"],
)

# ---- Save outputs ----
# 1) Full master with sentiment (spam rows are kept).
df.to_csv(OUTPUT_ALL, index=False, encoding="utf-8-sig")

# 2) Analysis-ready: rows flagged as non-spam only.
non_spam_rows = df["final_is_spam"].eq(0)
df_clean = df.loc[non_spam_rows].copy()
df_clean.to_csv(OUTPUT_CLEAN, index=False, encoding="utf-8-sig")

# ---- Quick summary ----
print("✅ Sentiment computed for ALL rows (spam + non-spam).")
print(f"Saved master (all rows): {OUTPUT_ALL}")
print(f"Saved analysis-ready (non-spam only): {OUTPUT_CLEAN}")

# dropna=False so rows without a sentiment label show up in the counts too.
all_counts = df["sentiment"].value_counts(dropna=False)
print("\nCounts (ALL rows):")
print(all_counts)

clean_counts = df_clean["sentiment"].value_counts(dropna=False)
print("\nCounts (non-spam only):")
print(clean_counts.rename("non_spam_counts"))

sample_rows = df_clean[["textOriginal", "sentiment", "sentiment_score"]].head(10)
print("\nSample (non-spam):")
print(sample_rows)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
100%|████████████████████████████████████████████████████████████████████████| 623223/623223 [03:30<00:00, 2962.28it/s]


✅ Sentiment computed for ALL rows (spam + non-spam).
Saved master (all rows): C:\Users\User\Desktop\Datathon\master_enriched_sent.csv
Saved analysis-ready (non-spam only): C:\Users\User\Desktop\Datathon\analysis_ready_sent.csv

Counts (ALL rows):
sentiment
positive    366790
neutral     176241
negative     80192
Name: count, dtype: int64

Counts (non-spam only):
sentiment
positive    361098
neutral     174326
negative     79479
Name: non_spam_counts, dtype: int64

Sample (non-spam):
                                        textOriginal sentiment  \
0                 please lesbian flag beg would rock  positive   
1                            missed calls mars alien  negative   
2                                               baaa   neutral   
3                look like raven phenomena raven cap  positive   
4                                           american   neutral   
5  red heart red heart red heart red heart red he...   neutral   
6    love videos thank red heart red heart red hea

Video Length, Relevance & Quality

In [None]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

# =========================
# CONFIG
# =========================
INPUT_CSV = r"C:\Users\User\Desktop\Datathon\master_enriched_sent.csv"
OUTPUT_CSV = r"C:\Users\User\Desktop\Datathon\master_enriched_scored.csv"

# norm_len = min(text_len / LENGTH_CAP, 1)
LENGTH_CAP = 120
# Final cutoff for is_quality (0..1 scale).
QUALITY_THRESHOLD = 0.5

tqdm.pandas()

# =========================
# LOAD
# =========================
df = pd.read_csv(INPUT_CSV)

# Safety checks: these columns are mandatory, each with its own message.
_MANDATORY_COLUMN_ERRORS = (
    ("textOriginal", "Missing column 'textOriginal'."),
    ("sentiment_score", "Missing column 'sentiment_score' (run sentiment step first)."),
    ("final_is_spam", "Missing column 'final_is_spam'."),
)
for _column, _error_text in _MANDATORY_COLUMN_ERRORS:
    if _column not in df.columns:
        raise ValueError(_error_text)

# Minimal fallback: a missing relevance flag is tolerated and treated as 0.
if "is_relevant" not in df.columns:
    print("⚠️  'is_relevant' missing; defaulting to 0 (not relevant).")
    df["is_relevant"] = 0

# Prefer a cleaned text column for the length measure when one exists.
if "text_final" in df.columns:
    TEXT_COL_FOR_LEN = "text_final"
else:
    TEXT_COL_FOR_LEN = "textOriginal"

# =========================
# LENGTH & NORM_LEN
# =========================
# Missing text becomes "" so it measures as length 0.
_length_source = df[TEXT_COL_FOR_LEN].fillna("").astype(str)
df["text_len"] = _length_source.str.len()

# Character count scaled by LENGTH_CAP and capped to the 0..1 range.
df["norm_len"] = df["text_len"].div(LENGTH_CAP).clip(lower=0, upper=1)

# =========================
# SENTIMENT STRENGTH (0..1)
# =========================
# Magnitude of the sentiment score; rows without a score count as 0.
df["sentiment_strength"] = df["sentiment_score"].abs().fillna(0)

# Ensure is_relevant is numeric 0/1
df["relevance_weight"] = df["is_relevant"].fillna(0).astype(int)

# =========================
# QUALITY SCORE & FLAG
# =========================
# Weighted sum: 50% length, 30% relevance, 20% sentiment strength.
_length_part = df["norm_len"].mul(0.5)
_relevance_part = df["relevance_weight"].mul(0.3)
_sentiment_part = df["sentiment_strength"].mul(0.2)
df["quality_score"] = (
    _length_part.add(_relevance_part).add(_sentiment_part).clip(lower=0, upper=1)
)

# Spam can never be quality; non-spam rows must also reach the threshold.
_is_not_spam = df["final_is_spam"].eq(0)
_meets_threshold = df["quality_score"].ge(QUALITY_THRESHOLD)
df["is_quality"] = (_is_not_spam & _meets_threshold).astype(int)

# =========================
# VIDEO TYPE from ISO 8601 duration (e.g., 'PT29S', 'PT1M05S')
# =========================
# ISO 8601 duration with an optional day part before 'T' and optional
# hour / minute / second parts after it (e.g. 'PT29S', 'PT1H2M3S', 'P1DT4H').
_ISO8601_DURATION_RE = re.compile(
    r"P(?:(\d+)D)?T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)(?:\.\d+)?S)?"
)


def get_video_type(duration_str: str) -> str:
    """Classify an ISO 8601 duration as ``'Short'`` (<= 60 s) or ``'Video'``.

    Days, hours, minutes and seconds are all taken into account. Previously
    only minutes and seconds were read, so hour-long durations such as
    ``'PT1H'`` evaluated to 0 seconds and were labelled ``'Short'``.

    Args:
        duration_str: Duration such as ``'PT29S'`` or ``'PT1M05S'``.

    Returns:
        ``'Short'`` when the total is at most 60 seconds, ``'Video'`` when it
        is longer, and ``'Unknown'`` for non-string input or strings without
        a time ('T') section, e.g. ``'P0D'``.
    """
    if not isinstance(duration_str, str):
        return "Unknown"

    match = _ISO8601_DURATION_RE.search(duration_str)
    if match is None:
        return "Unknown"

    # Absent components come back as None and count as zero.
    days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    total_seconds = ((days * 24 + hours) * 60 + minutes) * 60 + seconds
    return "Short" if total_seconds <= 60 else "Video"

# Name of the duration column; adjust here if your schema differs.
DUR_COL = "contentDuration"
if DUR_COL not in df.columns:
    print(f"⚠️  '{DUR_COL}' column not found; setting video_type='Unknown'.")
    df["video_type"] = "Unknown"
else:
    df["video_type"] = df[DUR_COL].progress_apply(get_video_type)

# =========================
# SAVE
# =========================
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")

print("✅ Done. Scored + video_type added.")
print(f"Saved → {OUTPUT_CSV}")

# Quick peek at the columns produced by the scoring step.
_quality_columns = [
    "text_len",
    "norm_len",
    "relevance_weight",
    "sentiment_strength",
    "quality_score",
    "is_quality",
    "video_type",
]
print("\nHead (quality columns):")
print(df[_quality_columns].head())

_quality_rate = df["is_quality"].mean().round(3)
_video_type_counts = df["video_type"].value_counts(dropna=False)
print("\nCounts:")
print("Quality rate:", _quality_rate)
print("Video type breakdown:\n", _video_type_counts)


100%|██████████████████████████████████████████████████████████████████████| 623223/623223 [00:02<00:00, 250275.55it/s]


✅ Done. Scored + video_type added.
Saved → C:\Users\User\Desktop\Datathon\master_enriched_scored.csv

Head (quality columns):
   text_len  norm_len  relevance_weight  sentiment_strength  quality_score  \
0        34  0.283333                 0              0.3182       0.205307   
1        23  0.191667                 0              0.2960       0.155033   
2         4  0.033333                 0              0.0000       0.016667   
3        35  0.291667                 0              0.3612       0.218073   
4         8  0.066667                 0              0.0000       0.033333   

   is_quality video_type  
0           0      Short  
1           0      Short  
2           0      Short  
3           0      Video  
4           0      Short  

Counts:
Quality rate: 0.181
Video type breakdown:
 video_type
Short      483927
Video      134245
Unknown      5051
Name: count, dtype: int64


Metrics

In [None]:
import pandas as pd
import numpy as np

# ===== CONFIG =====
INPUT_CSV = r"C:\Users\User\Desktop\Datathon\master_enriched_scored.csv"
OUTPUT_CSV = r"C:\Users\User\Desktop\Datathon\post_effectiveness_kpis.csv"

# ===== Load =====
df = pd.read_csv(INPUT_CSV)

# --- Required flags for counts ---
# Both flags are summed per video later on, so fail fast if one is absent.
for col in ("is_quality", "final_is_spam"):
    if col in df.columns:
        continue
    raise ValueError(f"Missing required column: {col}")

# --- Helper to pick a column name from several variants ---
def pick_col(cands):
    """Return the first name in ``cands`` that is a column of the global ``df``.

    Candidates are tried in the order given; ``None`` is returned when none
    of them exists.
    """
    return next((name for name in cands if name in df.columns), None)

# Keys / Post stats in your schema
VIDEO_COL = pick_col(["videoId","videoID","video_id","VideoID"])
if VIDEO_COL is None:
    raise ValueError("No video id column found. Expected one of: videoId/videoID/video_id/VideoID")

VIEWS_COL = pick_col(["viewCount","view_count","views","viewCount.value","statistics.viewCount"])
LIKES_COL = pick_col(["video_likeCount","like_count","likes","statistics.likeCount"])
COMMENTS_COL = pick_col(["commentCount","comment_count","comments","statistics.commentCount"])
# May not exist in the data.
SHARES_COL = pick_col(["share_count","shares","shareCount"])


def _numeric_or(column_name, fallback):
    """Return ``df[column_name]`` coerced to numbers, or ``fallback`` if no column was found."""
    if column_name:
        # Unparseable cells become NaN instead of raising.
        return pd.to_numeric(df[column_name], errors="coerce")
    return fallback


# Cast post-level stats to numeric; the per-post maximum is used later.
# Missing views/comments columns fall back to NaN, likes/shares to 0.
df["_views"] = _numeric_or(VIEWS_COL, np.nan)
df["_likes"] = _numeric_or(LIKES_COL, 0)
df["_comments"] = _numeric_or(COMMENTS_COL, np.nan)
df["_shares"] = _numeric_or(SHARES_COL, 0)

# --- Simple helpers ---
def safe_mode(s: pd.Series, default="Unknown"):
    """Return the most frequent non-null value of ``s``.

    ``default`` is returned when ``s`` has no non-null values. When several
    values share the top count, the one that ``value_counts`` lists first is
    returned.
    """
    observed = s.dropna()
    if len(observed) == 0:
        return default
    frequency = observed.value_counts()  # sorted by count, largest first
    return frequency.index[0]

def mean_or_nan(s: pd.Series):
    """Return the mean of ``s`` as a float after coercing it to numbers.

    Values that cannot be parsed as numbers become NaN and are skipped by
    the mean. An empty series yields ``np.nan``.
    """
    numeric = pd.to_numeric(s, errors="coerce")
    if numeric.empty:
        return np.nan
    return float(numeric.mean())

# ===== Per-post aggregation (one row per video) =====
def _group_max(g: pd.DataFrame, col: str, default):
    """Return the NaN-skipping maximum of ``g[col]``, or ``default`` if the column is absent."""
    if col not in g:
        return default
    # Series.max skips NaN and returns NaN for an all-NaN group without the
    # "All-NaN slice encountered" RuntimeWarning that np.nanmax raises.
    return g[col].max()


def agg_post(g: pd.DataFrame) -> pd.Series:
    """Collapse all comment rows of one video into a single row of KPIs.

    Post-level stats (``_views``, ``_likes``, ``_shares``, ``_comments``) are
    taken as the per-group maximum. Categorical columns are summarised by
    their mode and numeric columns by their mean over all rows of the group.

    Args:
        g: The rows belonging to one video. Must contain ``is_quality`` and
            ``final_is_spam``; every other column is optional.

    Returns:
        A Series with counts, the KPIs (``QCR``, ``spam_rate``,
        ``quality_comments_per_1k_views``, ``engagement_rate_pct``,
        ``QCR_x_EngagementRate``) and the post-level summaries. KPIs that
        need a view count are NaN when views are missing or not positive.
    """
    total_rows = len(g)  # number of comment rows for this video
    quality_cnt = int(g["is_quality"].sum())
    spam_cnt = int(g["final_is_spam"].sum())

    views = _group_max(g, "_views", np.nan)
    likes = _group_max(g, "_likes", 0)
    shares = _group_max(g, "_shares", 0)
    cm_meta = _group_max(g, "_comments", np.nan)

    # Comments used for engagement: prefer post-level metadata, else fall
    # back to the number of comment rows present for the video.
    comments_for_eng = cm_meta if pd.notna(cm_meta) else total_rows

    # KPIs
    qcr = quality_cnt / total_rows if total_rows > 0 else np.nan
    spam_rate = spam_cnt / total_rows if total_rows > 0 else np.nan

    has_views = pd.notna(views) and views > 0
    q_per_1k = (quality_cnt / views) * 1000 if has_views else np.nan

    if has_views:
        # A missing like or share count is treated as 0, the same way a
        # missing comment count is handled, so one absent stat does not turn
        # the whole engagement rate into NaN.
        likes_for_eng = likes if pd.notna(likes) else 0
        shares_for_eng = shares if pd.notna(shares) else 0
        interactions = likes_for_eng + comments_for_eng + shares_for_eng
        eng_rate_pct = interactions / views * 100
    else:
        eng_rate_pct = np.nan

    qcr_x_er = qcr * eng_rate_pct if (pd.notna(qcr) and pd.notna(eng_rate_pct)) else np.nan

    # Post-level summaries.
    # Categorical -> mode (an absent column falls back to the default label).
    division_mode = safe_mode(g.get("division", pd.Series(dtype=object)), default="Other")
    intent_mode = safe_mode(g.get("intent", pd.Series(dtype=object)), default="Other")
    sentiment_mode = safe_mode(g.get("sentiment", pd.Series(dtype=object)), default="neutral")
    video_type = safe_mode(g.get("video_type", pd.Series(dtype=object)), default="Unknown")

    # Numeric -> mean over all rows for the post (NaN if the column is absent).
    sentiment_score_mean = mean_or_nan(g.get("sentiment_score", pd.Series(dtype=float)))
    text_len_mean = mean_or_nan(g.get("text_len", pd.Series(dtype=float)))
    norm_len_mean = mean_or_nan(g.get("norm_len", pd.Series(dtype=float)))
    sentiment_strength_mean = mean_or_nan(g.get("sentiment_strength", pd.Series(dtype=float)))
    relevance_weight_mean = mean_or_nan(g.get("relevance_weight", pd.Series(dtype=float)))
    quality_score_mean = mean_or_nan(g.get("quality_score", pd.Series(dtype=float)))
    # Average of the 0/1 flag.
    is_quality_rate = mean_or_nan(g.get("is_quality", pd.Series(dtype=float)))

    return pd.Series({
        "views": views,
        "likes": likes,
        "comments_meta": cm_meta,
        "shares": shares,
        "total_comments": total_rows,
        "quality_count": quality_cnt,
        "spam_count": spam_cnt,

        "QCR": qcr,
        "spam_rate": spam_rate,
        "quality_comments_per_1k_views": q_per_1k,
        "engagement_rate_pct": eng_rate_pct,
        "QCR_x_EngagementRate": qcr_x_er,

        # post-level summaries
        "division_mode": division_mode,
        "intent_mode": intent_mode,
        "sentiment_mode": sentiment_mode,
        "video_type": video_type,

        "sentiment_score_mean": sentiment_score_mean,
        "text_len_mean": text_len_mean,
        "norm_len_mean": norm_len_mean,
        "sentiment_strength_mean": sentiment_strength_mean,
        "relevance_weight_mean": relevance_weight_mean,
        "quality_score_mean": quality_score_mean,
        "is_quality_rate": is_quality_rate
    })

# One output row per video: aggregate each group of comment rows, then turn
# the video id from the index back into a regular column.
_rows_by_video = df.groupby(VIDEO_COL, as_index=True)
per_video = _rows_by_video.apply(agg_post).reset_index()

# Optional tidy column order
_count_cols = ["views", "likes", "comments_meta", "shares", "total_comments",
               "quality_count", "spam_count"]
_kpi_cols = ["QCR", "spam_rate", "quality_comments_per_1k_views",
             "engagement_rate_pct", "QCR_x_EngagementRate"]
_mode_cols = ["division_mode", "intent_mode", "sentiment_mode", "video_type"]
_mean_cols = ["sentiment_score_mean", "text_len_mean", "norm_len_mean",
              "sentiment_strength_mean", "relevance_weight_mean",
              "quality_score_mean", "is_quality_rate"]
cols = [VIDEO_COL, *_count_cols, *_kpi_cols, *_mode_cols, *_mean_cols]
per_video = per_video[cols]

# Save
per_video.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print("✅ Saved per-video KPIs:", OUTPUT_CSV)
print(per_video.head())


  likes   = np.nanmax(g["_likes"].values)    if "_likes"    in g else 0
  views   = np.nanmax(g["_views"].values)    if "_views"    in g else np.nan
  likes   = np.nanmax(g["_likes"].values)    if "_likes"    in g else 0
  cm_meta = np.nanmax(g["_comments"].values) if "_comments" in g else np.nan
  likes   = np.nanmax(g["_likes"].values)    if "_likes"    in g else 0
  likes   = np.nanmax(g["_likes"].values)    if "_likes"    in g else 0
  likes   = np.nanmax(g["_likes"].values)    if "_likes"    in g else 0
  likes   = np.nanmax(g["_likes"].values)    if "_likes"    in g else 0
  views   = np.nanmax(g["_views"].values)    if "_views"    in g else np.nan
  likes   = np.nanmax(g["_likes"].values)    if "_likes"    in g else 0
  cm_meta = np.nanmax(g["_comments"].values) if "_comments" in g else np.nan
  likes   = np.nanmax(g["_likes"].values)    if "_likes"    in g else 0
  likes   = np.nanmax(g["_likes"].values)    if "_likes"    in g else 0
  likes   = np.nanmax(g["_likes"].values)   

✅ Saved per-video KPIs: C:\Users\User\Desktop\Datathon\post_effectiveness_kpis.csv
   videoId      views    likes  comments_meta  shares  total_comments  \
0        6     4359.0    139.0            5.0       0               1   
1        8    27827.0   2087.0           26.0       0               3   
2       11  2146535.0  78667.0          481.0       0              54   
3       22     1512.0     20.0            2.0       0               1   
4       25     1473.0     53.0            6.0       0               1   

   quality_count  spam_count       QCR  spam_rate  ...  intent_mode  \
0              0           0  0.000000        0.0  ...        Other   
1              2           0  0.666667        0.0  ...        Other   
2              5           0  0.092593        0.0  ...        Other   
3              1           0  1.000000        0.0  ...        Other   
4              0           0  0.000000        0.0  ...        Other   

   sentiment_mode  video_type sentiment_score_mean 