In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import warnings
warnings.filterwarnings("ignore")

In [2]:
file_path = "../data/electrify__applied_ai_engineer__training_data.csv"
df = pd.read_csv(file_path) # 

In [3]:
def tokenize_no_stop(text):
    tokens = re.findall(r"\b\w+\b", str(text).lower())
    return [w for w in tokens if (w not in ENGLISH_STOP_WORDS) & (len(w) >= 2) & (not(w.isdigit()))]

In [4]:
df['tokens'] = df['title'].apply(tokenize_no_stop)
rez = {}
for channel_id, group in df.groupby('channel_id'):
    threshold = group['views_in_period'].quantile(0.75)
    high_group = group[group['views_in_period'] >= threshold]
    low_group = group[group['views_in_period'] < threshold]
    high_counts = Counter()
    for tokens in high_group['tokens']:
        high_counts.update(set(tokens))
    low_counts = Counter()
    for tokens in low_group['tokens']:
        low_counts.update(set(tokens))
    n_high = len(high_group)
    n_low = len(low_group)
    keyword_scores = {}
    for token in set(high_counts.keys()).union(low_counts.keys()):
        high_freq = high_counts[token] / n_high if n_high else 0
        low_freq = low_counts[token] / n_low if n_low else 0
        score = (high_freq + 1e-5) / (low_freq + 1e-5)
        keyword_scores[token] = score
    sorted_tokens = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)
    top_keywords = [t for t,_ in sorted_tokens[:20]]
    low_keywords = [t for t,_ in sorted_tokens[-20:]]
    rez[channel_id] = {'top_keywords': top_keywords, 'low_keywords': low_keywords}

In [5]:
rez

{'UC-9b7aDP6ZN0coj9-xFnrtw': {'top_keywords': ['voyager',
   'problems',
   'pluto',
   'wrong',
   'outside',
   'asteroid',
   'backwards',
   'exoplanet',
   'detected',
   'affect',
   'coming',
   'seen',
   'experiment',
   'best',
   'deepest',
   'nearly',
   'closest',
   'sdo',
   'proxima',
   'jupiter'],
  'low_keywords': ['model',
   'scientists',
   'designed',
   'looks',
   'supercut',
   'shouldn',
   've',
   'black',
   'satellites',
   'mercury',
   'big',
   'theory',
   'space',
   'weird',
   'crash',
   'physics',
   'nasa',
   'years',
   'images',
   'moon']},
 'UC510QYlOlKNyhy_zdQxnGYw': {'top_keywords': ['communist',
   'best',
   'screwups',
   'sturmtiger',
   'haiti',
   'panther',
   'hussein',
   'assassinations',
   'lai',
   'stop',
   'oversized',
   'horrors',
   'flattest',
   'warriors',
   'lifestyle',
   'disappointments',
   'cursed',
   'hell',
   'castle',
   'evil'],
  'low_keywords': ['crazy',
   'russia',
   'shoot',
   'vietnam',
   'figh

In [6]:
tfidf_rez = {}
for channel_id, group in df.groupby("channel_id"):
    threshold = group["views_in_period"].quantile(0.75)
    high_group = group[group["views_in_period"] >= threshold]
    low_group  = group[group["views_in_period"] <  threshold]

    vectoriser = TfidfVectorizer(
        stop_words="english",
        token_pattern=r"\b[a-zA-Z]{2,}\b",
    )
    titles_all = pd.concat([high_group["title"], low_group["title"]])
    vectoriser.fit(titles_all)

    tfidf_high = vectoriser.transform(high_group["title"])
    tfidf_low  = vectoriser.transform(low_group["title"])
    vocab = vectoriser.get_feature_names_out()

    high_avg = tfidf_high.sum(axis=0).A1 / max(len(high_group), 1)
    low_avg  = tfidf_low.sum(axis=0).A1  / max(len(low_group), 1)

    tfidf_scores = {
        token: (high_avg[idx] + 1e-6) / (low_avg[idx] + 1e-6)
        for idx, token in enumerate(vocab)
    }

    sorted_tfidf = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)
    tfidf_top = [t for t, _ in sorted_tfidf[:10]]
    tfidf_low = [t for t, _ in sorted_tfidf[-10:]]

    tfidf_rez[channel_id] = {
        "tfidf_top_keywords": tfidf_top,
        "tfidf_low_keywords": tfidf_low,
    }


In [7]:
tfidf_rez

{'UC-9b7aDP6ZN0coj9-xFnrtw': {'tfidf_top_keywords': ['voyager',
   'aliens',
   'discover',
   'detected',
   'edge',
   'deepest',
   'sdo',
   'seen',
   'jupiter',
   'neptune'],
  'tfidf_low_keywords': ['black',
   'big',
   'years',
   'weird',
   'mercury',
   'images',
   'space',
   'nasa',
   'satellites',
   'moon']},
 'UC510QYlOlKNyhy_zdQxnGYw': {'tfidf_top_keywords': ['oversized',
   'best',
   'disappointments',
   'betrayals',
   'unfair',
   'today',
   'assassinations',
   'fearsome',
   'notable',
   'warriors'],
  'tfidf_low_keywords': ['vs',
   'story',
   'terrifying',
   'axis',
   'wwii',
   'north',
   'korean',
   'tanks',
   'things',
   'soldier']},
 'UC_WXkNIJ2ncLIsAk_ltbvjA': {'tfidf_top_keywords': ['apart',
   'remember',
   'extreme',
   'fakes',
   'gets',
   'lazy',
   'destroyed',
   'wants',
   'minifigures',
   'features'],
  'tfidf_low_keywords': ['obsession',
   'talks',
   'set',
   'vs',
   'building',
   'techniques',
   'obsessions',
   'weird',

In [8]:
clean_titles_temp = df['title'].astype(str).str.strip()

In [9]:
clean_titles_temp

0      Why Does Russia insist on still using Armored ...
1                Absurd Stories from the Afghanistan War
2       Bro does Beer Run for his Buddies in Vietnam War
3      How a Lone US Sub Destroyed Japan's Most Impor...
4                Waffen SS vs. Vietcong - Who Would Win?
                             ...                        
206    The Star Explosion So Powerful, It Compressed ...
207    What Voyager Detected at the Edge of the Solar...
208    What They Didn't Teach You in School about Nep...
209    How Are Ghost Particles Affecting Earth? | Ice...
210    NASA Tried to Screw Astronauts Over Money - an...
Name: title, Length: 211, dtype: object

In [10]:
clean_titles_temp = clean_titles_temp.str.split().str[0]
clean_titles_temp

0         Why
1      Absurd
2         Bro
3         How
4      Waffen
        ...  
206       The
207      What
208      What
209       How
210      NASA
Name: title, Length: 211, dtype: object

In [11]:
clean_titles_temp = clean_titles_temp.dropna().str.lower()
clean_titles_temp

0         why
1      absurd
2         bro
3         how
4      waffen
        ...  
206       the
207      what
208      what
209       how
210      nasa
Name: title, Length: 211, dtype: object

In [12]:
initial_words = clean_titles_temp.copy()
counts = initial_words.value_counts()
total = int(counts.sum())
top = counts.head()
print([{"word": w, "rate": float(c) / total}
       for w, c in top.items()])

[{'word': 'the', 'rate': 0.2132701421800948}, {'word': 'lego', 'rate': 0.061611374407582936}, {'word': 'what', 'rate': 0.05687203791469194}, {'word': 'why', 'rate': 0.037914691943127965}, {'word': 'life', 'rate': 0.03317535545023697}]


In [13]:
for channel_id, group in df.groupby("channel_id"):
    group["colon_rate"] = group["title"].astype(str).str.contains(r":").mean()

In [16]:
high_group

Unnamed: 0,channel_id,video_id,title,summary,views_in_period,tokens
127,UC_WXkNIJ2ncLIsAk_ltbvjA,7kQuVgRR-Qk,When LEGO Gets Lazy...,"This video reviews various LEGO sets, contrast...",3745,"[lego, gets, lazy]"
128,UC_WXkNIJ2ncLIsAk_ltbvjA,XDfDYCi4xBQ,MANY LEGO SETS THAT BREAK THE RULES!,The video highlights a variety of mistakes and...,3370,"[lego, sets, break, rules]"
142,UC_WXkNIJ2ncLIsAk_ltbvjA,gWUgLcA-8ps,200 IQ LEGO MINIFIGURES!,This video explores the most creative and uniq...,18562,"[iq, lego, minifigures]"
143,UC_WXkNIJ2ncLIsAk_ltbvjA,Sxo2j9EugnU,75 LEGO SETS THAT BREAK THE RULES!,The video showcases a wide variety of accident...,15446,"[lego, sets, break, rules]"
145,UC_WXkNIJ2ncLIsAk_ltbvjA,wMm11lzsXM0,LEGO Wants These Bricks Destroyed,The video explores the fascinating and scandal...,7761,"[lego, wants, bricks, destroyed]"
146,UC_WXkNIJ2ncLIsAk_ltbvjA,0-MQEvomCYU,LEGO Sets With Too Many Minifigures,The video counts down the LEGO sets with the h...,67124,"[lego, sets, minifigures]"
148,UC_WXkNIJ2ncLIsAk_ltbvjA,fAnRIzHpxuE,300 IQ LEGO Play Features!,This video explores the most creative and clev...,3601,"[iq, lego, play, features]"
149,UC_WXkNIJ2ncLIsAk_ltbvjA,mBIXXFPgTKI,EXTREME LEGO FAKES!,This video humorously explores the world of fa...,3504,"[extreme, lego, fakes]"
150,UC_WXkNIJ2ncLIsAk_ltbvjA,DW8e0cKUr5U,800 IQ LEGO SETS!,The video explores the most ingenious and unco...,9707,"[iq, lego, sets]"
151,UC_WXkNIJ2ncLIsAk_ltbvjA,Xe21Yh-Gi9Y,THE WORST LEGO SETS,The video reviews some of LEGO's worst sets ev...,42705,"[worst, lego, sets]"


In [17]:
low_group

Unnamed: 0,channel_id,video_id,title,summary,views_in_period,tokens
107,UC_WXkNIJ2ncLIsAk_ltbvjA,h989zfYFk2o,400 IQ LEGO SETS!,The video showcases a variety of Lego sets and...,193,"[iq, lego, sets]"
108,UC_WXkNIJ2ncLIsAk_ltbvjA,omPcR-Rum9U,LEGO Has Gone Too Far...,The video counts down the ten Lego sets with t...,89,"[lego, gone, far]"
109,UC_WXkNIJ2ncLIsAk_ltbvjA,2Y1ZHV3Nfig,Misleading LEGO Bricks...,The video explains that while LEGO produces br...,164,"[misleading, lego, bricks]"
110,UC_WXkNIJ2ncLIsAk_ltbvjA,l8ByqywIy3s,Weird vs Epic LEGO Set Stickers...,"The video explores the history, controversy, c...",158,"[weird, vs, epic, lego, set, stickers]"
111,UC_WXkNIJ2ncLIsAk_ltbvjA,YCDqMf9cMnQ,LOTS OF LEGO SETS THAT BREAK THE RULES!,The video highlights various rare and amusing ...,1446,"[lots, lego, sets, break, rules]"
112,UC_WXkNIJ2ncLIsAk_ltbvjA,LVfbHzKyvdE,0 IQ LEGO SETS...,This video explores some of LEGO’s most unusua...,1012,"[iq, lego, sets]"
113,UC_WXkNIJ2ncLIsAk_ltbvjA,4cjdNZxlOrE,LEGO will NEVER be the same...,This video explores the evolution of LEGO mini...,2002,[lego]
114,UC_WXkNIJ2ncLIsAk_ltbvjA,Q23s9_oKbaU,The Curse of Brittle LEGO Bricks,The video explores three major Lego conspiracy...,538,"[curse, brittle, lego, bricks]"
115,UC_WXkNIJ2ncLIsAk_ltbvjA,mesRxwW_FfA,LEGO MINIFIGURES THAT BREAK THE RULES!,The video highlights various amusing and unusu...,2055,"[lego, minifigures, break, rules]"
116,UC_WXkNIJ2ncLIsAk_ltbvjA,0Oirz0WCXJs,1000 IQ LEGO BUILDING TECHNIQUES!,The video showcases a variety of ingenious and...,1802,"[iq, lego, building, techniques]"
