In [None]:
!pip install sentence-transformers umap-learn hdbscan scikit-learn regex numpy pandas tqdm



In [None]:
import re
import json
from pathlib import Path
from typing import List, Dict, Any
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

  $\max \{ \mathrm{core}_k(a),\ \mathrm{core}_k(b),\ \frac{1}{\alpha}\, d(a,b) \}$.


In [None]:
# Mount Google Drive at /content/drive (the import is only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### setting up **lexicons**

In [None]:
# Keyword lexicons used for per-scene content features.
# Every list except GORE_WORDS holds regular-expression source strings; GORE_WORDS holds
# plain phrases that are looked up as substrings.

# Entries without \b (e.g. 'blood', 'dead') match anywhere inside a word.
VIOLENCE_WORDS = [r'\bkill(s|ed|ing)?\b', r'\bshoot(s|ed|ing)?\b', r'\bshot\b', r'\bstab(s|bed|bing)?\b',
                   r'\bknife\b', r'\bexplod(e|es|ed|ing)?\b', r'\bbang\b', r'\bgun(s)?\b', r'\bpistol\b',
                   # was r'\bassassin\a': \a is the BEL character, so the pattern could never match text
                   r'\brifle\b', r'\bassassin\b', r'\bslaughter\b','blood','dead','corpse','torture']
GORE_WORDS = ['blood drenched','skin off','skinned','guts','entrails','brain','mangled','third degree burns']
SEX_WORDS = [r'\brape\b', r'\bsexual\b', r'\bnaked\b', r'\bbreast(s)?\b', r'\bintercourse\b']
PROFANITY = [r'\bfuck\b', r'\bshit\b', r'\bmotherfucker\b', r'\bbitch\b', r'\bсука\b', r'\bбляд\b']
DRUG_WORDS = [r'\bdrug(s)?\b', r'\bheroin\b', r'\bcocaine\b', r'\bmarijuana\b', r'\bpill(s)?\b', r'\bweed\b']
# The last pattern matches any 1-2 digit number, optionally wrapped in square brackets.
# NOTE(review): presumably meant as an age marker; it also fires on unrelated numbers - confirm.
CHILD_PATTERN = [r'\bchild\b', r'\bboy\b', r'\bgirl\b', r'\bdaughter\b', r'\bson\b', r'\bteen\b', r'\b\[?[0-9]{1,2}\]?\b']

In [None]:
# Compile each regex lexicon once, case-insensitively.
# GORE_WORDS is not compiled here.
VIOLENCE_RE, SEX_RE, PROF_RE, DRUG_RE, CHILD_RE = (
    [re.compile(pattern, flags=re.I) for pattern in lexicon]
    for lexicon in (VIOLENCE_WORDS, SEX_WORDS, PROFANITY, DRUG_WORDS, CHILD_PATTERN)
)

### scene **parser**

In [None]:
def parse_script_to_scenes(txt: str) -> List[Dict[str,Any]]:
    """Split a screenplay into scenes at its scene headings.

    A new scene starts wherever ``INT.``, ``EXT.``, ``SCENE HEADING:`` or
    ``scene_heading:`` begins (case-insensitive). An ``INT.``/``EXT.`` marker
    only counts when it is not preceded by a word character or ``/``, so that
    ordinary words ending in "int."/"ext." ("point.", "next.", "text.") and the
    second half of a combined ``INT./EXT.`` heading do not open a new scene.

    Args:
        txt: Full script text. Empty or ``None`` yields an empty list.

    Returns:
        One dict per non-empty chunk, in document order, with keys
        ``scene_id`` (0-based running index), ``heading`` (the leading
        INT./EXT. marker plus up to 120 following characters on the same
        line, or ``sc_<scene_id>`` when the chunk does not start with such a
        marker) and ``text`` (the stripped chunk).
    """
    if not txt:
        return []

    # Zero-width split: the heading stays at the start of the chunk it opens.
    boundary = r'(?=(?:(?<![\w/])(?:INT\.|EXT\.)|SCENE HEADING:|scene_heading:))'
    scenes = []
    for chunk in re.split(boundary, txt, flags=re.I):
        text = chunk.strip()
        if not text:
            continue
        idx = len(scenes)
        # '.' does not cross newlines, so the heading is at most the first line.
        heading_match = re.match(r'((?:INT\.|EXT\.).{0,120})', text, flags=re.I)
        heading = heading_match.group(1).strip() if heading_match else f"sc_{idx}"
        scenes.append({'scene_id': idx, 'heading': heading, 'text': text})
    return scenes

### feature **extraction** per **scene**

In [None]:
def count_matches(regex_list, text):
    """Return the total number of matches that the compiled patterns find in ``text``.

    Each pattern in ``regex_list`` contributes the number of its
    non-overlapping matches; an empty list gives 0.
    """
    return sum(len(pattern.findall(text)) for pattern in regex_list)

def scene_feature_vector(scene_text: str) -> Dict[str, float]:
    """Compute lexicon-based counts for one scene.

    The text is lower-cased once and then scanned with each lexicon.

    Args:
        scene_text: Raw text of a single scene.

    Returns:
        Dict with the keys ``violence_count``, ``gore_count``, ``sex_count``,
        ``profanity_count``, ``drug_count``, ``child_mentions`` (numbers of
        lexicon hits) and ``length`` (number of whitespace-separated tokens).
    """
    txt = scene_text.lower()
    return {
        'violence_count': count_matches(VIOLENCE_RE, txt),
        # Count every occurrence of each phrase, like the regex-based counts do,
        # instead of only noting whether the phrase appears at all.
        'gore_count': sum(txt.count(phrase) for phrase in GORE_WORDS),
        'sex_count': count_matches(SEX_RE, txt),
        'profanity_count': count_matches(PROF_RE, txt),
        'drug_count': count_matches(DRUG_RE, txt),
        'child_mentions': count_matches(CHILD_RE, txt),
        'length': len(txt.split())
    }

### **Embeddings**

In [None]:
MODEL_NAME = "all-MiniLM-L6-v2" # SentenceTransformer model (use a ready-made one for now)
embedder = SentenceTransformer(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def compute_scene_embeddings(scenes: List[Dict[str,Any]]):
    """Encode the ``'text'`` field of every scene with the module-level ``embedder``.

    Args:
        scenes: Scene dicts, each carrying a ``'text'`` entry.

    Returns:
        Whatever ``embedder.encode`` returns for the list of scene texts when
        called with ``show_progress_bar=True`` and ``convert_to_numpy=True``.
    """
    return embedder.encode(
        [scene['text'] for scene in scenes],
        show_progress_bar=True,
        convert_to_numpy=True,
    )