In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import warnings
warnings.filterwarnings("ignore")

In [2]:
file_path = "../data/electrify__applied_ai_engineer__training_data.csv"
df = pd.read_csv(file_path) # 

In [3]:
def tokenize_no_stop(text):
    tokens = re.findall(r"\b\w+\b", str(text).lower())
    return [w for w in tokens if (w not in ENGLISH_STOP_WORDS) & (len(w) >= 2) & (not(w.isdigit()))]

In [4]:
df['tokens'] = df['title'].apply(tokenize_no_stop)
rez = {}
for channel_id, group in df.groupby('channel_id'):
    threshold = group['views_in_period'].quantile(0.75)
    high_group = group[group['views_in_period'] >= threshold]
    low_group = group[group['views_in_period'] < threshold]
    high_counts = Counter()
    for tokens in high_group['tokens']:
        high_counts.update(set(tokens))
    low_counts = Counter()
    for tokens in low_group['tokens']:
        low_counts.update(set(tokens))
    n_high = len(high_group)
    n_low = len(low_group)
    keyword_scores = {}
    for token in set(high_counts.keys()).union(low_counts.keys()):
        high_freq = high_counts[token] / n_high if n_high else 0
        low_freq = low_counts[token] / n_low if n_low else 0
        score = (high_freq + 1e-5) / (low_freq + 1e-5)
        keyword_scores[token] = score
    sorted_tokens = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)
    top_keywords = [t for t,_ in sorted_tokens[:20]]
    low_keywords = [t for t,_ in sorted_tokens[-20:]]
    rez[channel_id] = {'top_keywords': top_keywords, 'low_keywords': low_keywords}

In [6]:
rez

{'UC-9b7aDP6ZN0coj9-xFnrtw': {'top_keywords': ['voyager',
   'outside',
   'experiments',
   'eyes',
   'journey',
   'reach',
   'shift',
   'impact',
   'detected',
   'deepest',
   'wrong',
   'asteroid',
   'exoplanet',
   'probes',
   'seen',
   'facts',
   'backwards',
   'dinosaurs',
   'discover',
   'experiment'],
  'low_keywords': ['designed',
   'universe',
   'satellites',
   'crash',
   'scientists',
   'theory',
   'star',
   'black',
   'model',
   'big',
   'weird',
   'space',
   'physics',
   'explain',
   'mercury',
   'shouldn',
   'nasa',
   'images',
   'years',
   'moon']},
 'UC510QYlOlKNyhy_zdQxnGYw': {'top_keywords': ['notable',
   'stood',
   'screwups',
   'horrors',
   'man',
   'invade',
   'castle',
   'overpowered',
   'fearsome',
   'panther',
   'strv',
   'lifestyle',
   'stop',
   'saddam',
   'evil',
   'luxury',
   'oversized',
   'forget',
   'lai',
   'flattest'],
  'low_keywords': ['weird',
   'items',
   'shoot',
   'know',
   'don',
   'terrify

In [11]:
tfidf_rez = {}
for channel_id, group in df.groupby("channel_id"):
    threshold = group["views_in_period"].quantile(0.75)
    high_group = group[group["views_in_period"] >= threshold]
    low_group  = group[group["views_in_period"] <  threshold]

    vectoriser = TfidfVectorizer(
        stop_words="english",
        token_pattern=r"\b[a-zA-Z]{2,}\b",
    )
    titles_all = pd.concat([high_group["title"], low_group["title"]])
    vectoriser.fit(titles_all)

    tfidf_high = vectoriser.transform(high_group["title"])
    tfidf_low  = vectoriser.transform(low_group["title"])
    vocab = vectoriser.get_feature_names_out()

    high_avg = tfidf_high.sum(axis=0).A1 / max(len(high_group), 1)
    low_avg  = tfidf_low.sum(axis=0).A1  / max(len(low_group), 1)

    tfidf_scores = {
        token: (high_avg[idx] + 1e-6) / (low_avg[idx] + 1e-6)
        for idx, token in enumerate(vocab)
    }

    sorted_tfidf = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)
    tfidf_top = [t for t, _ in sorted_tfidf[:10]]
    tfidf_low = [t for t, _ in sorted_tfidf[-10:]]

    tfidf_rez[channel_id] = {
        "tfidf_top_keywords": tfidf_top,
        "tfidf_low_keywords": tfidf_low,
    }


In [12]:
tfidf_rez

{'UC-9b7aDP6ZN0coj9-xFnrtw': {'tfidf_top_keywords': ['voyager',
   'aliens',
   'discover',
   'detected',
   'edge',
   'deepest',
   'sdo',
   'seen',
   'jupiter',
   'neptune'],
  'tfidf_low_keywords': ['black',
   'big',
   'years',
   'weird',
   'mercury',
   'images',
   'space',
   'nasa',
   'satellites',
   'moon']},
 'UC510QYlOlKNyhy_zdQxnGYw': {'tfidf_top_keywords': ['oversized',
   'best',
   'disappointments',
   'betrayals',
   'unfair',
   'today',
   'assassinations',
   'fearsome',
   'notable',
   'warriors'],
  'tfidf_low_keywords': ['vs',
   'story',
   'terrifying',
   'axis',
   'wwii',
   'north',
   'korean',
   'tanks',
   'things',
   'soldier']},
 'UC_WXkNIJ2ncLIsAk_ltbvjA': {'tfidf_top_keywords': ['apart',
   'remember',
   'extreme',
   'fakes',
   'gets',
   'lazy',
   'destroyed',
   'wants',
   'minifigures',
   'features'],
  'tfidf_low_keywords': ['obsession',
   'talks',
   'set',
   'vs',
   'building',
   'techniques',
   'obsessions',
   'weird',