In [5]:


import glob
import json
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer
from collections import Counter
from nltk.corpus import stopwords
import networkx as nx
import os
import string

In [None]:

# Setup
# Mention graph read from GML; its nodes are iterated further down to find
# the chat export belonging to each node.
G = nx.read_gml('../Proj3/mention_network.gml')
analyzer = SentimentIntensityAnalyzer()
tokenizer = TweetTokenizer()
# The analyzer's lexicon is used directly for token membership tests.
vader_lexicon = analyzer.lexicon

# Filters
# 1. Stop words
stop_words = set(stopwords.words('english'))
# 2. Punctuation: ASCII punctuation plus ellipsis tokens and typographic
#    quotes. The curly quotes (U+2019, U+201C, U+201D) are written as escapes
#    so the set stays correct even if this file is re-saved or read with the
#    wrong encoding; mis-decoded literals would never match a real token.
punctuation = set(string.punctuation)
punctuation.update(['...', '..', '\u2019', '\u201c', '\u201d'])

# Global counters
global_total_tokens = 0           # tokens that survive the filters above
global_known_tokens = 0           # of those, tokens present in the VADER lexicon
global_unknown_words = Counter()  # frequency of tokens missing from the lexicon

def extract_body(msg):
    """Return the text of one chat message record, or "" if it has none.

    The text is read from ``msg["message"]``, which may be either a plain
    string or a dict that holds the text under the ``"body"`` key.

    Args:
        msg: One decoded message record, normally a dict.

    Returns:
        The message text as ``str``. The empty string is returned when
        ``msg`` is not a dict, when the field is missing, or when the value
        found is not a string.
    """
    # Decoded JSON lists may contain null or scalar entries; treat those as
    # text-less records instead of raising AttributeError on ``.get``.
    if not isinstance(msg, dict):
        return ""
    content = msg.get("message", "")
    if isinstance(content, dict):
        content = content.get("body", "")
    return content if isinstance(content, str) else ""

files_dir = "../Proj3/mention_network_chats/"

# Walk every node of the mention graph, load its chat export (if one exists)
# and count how many of its filtered tokens appear in the VADER lexicon.
for node in G.nodes():
    # Chat exports are named "<name>_*.json". Try the capitalized spelling
    # first, then the node name exactly as stored in the graph.
    matches = []
    for file_stem in (node.capitalize(), node):
        # glob.escape keeps characters such as '[' or '*' in a name from
        # being interpreted as wildcards.
        pattern = os.path.join(files_dir, f"{glob.escape(file_stem)}_*.json")
        matches = glob.glob(pattern)
        if matches:
            break

    # No export for this node: nothing to count.
    if not matches:
        continue

    # NOTE(review): only the first match is read; if a node can have several
    # exports the others are ignored - confirm this is intended.
    try:
        with open(matches[0], 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. One bad file must not
        # abort the scan of the remaining nodes.
        print(f"Error reading {node}: {e}")
        continue

    # A missing "comments" key means "no messages"; any other shape (top-level
    # list, null comments, ...) is reported and skipped instead of crashing.
    messages = data.get("comments", []) if isinstance(data, dict) else None
    if not isinstance(messages, list):
        print(f"Error reading {node}: unexpected JSON structure")
        continue

    # Process each message
    for msg in messages:
        text = extract_body(msg)
        if not text:
            continue

        # Tokenize and lowercase
        tokens = tokenizer.tokenize(text.lower())

        # Filter: coverage is measured only over tokens that are neither
        # stop words nor punctuation.
        tokens = [t for t in tokens if t not in stop_words and t not in punctuation]

        for token in tokens:
            global_total_tokens += 1
            if token in vader_lexicon:
                global_known_tokens += 1
            else:
                # Store unknown words so we can see what is missing
                global_unknown_words[token] += 1

# --- Result calculation ---

if global_total_tokens > 0:
    # Share of filtered tokens that VADER has a lexicon entry for.
    coverage_pct = (global_known_tokens / global_total_tokens) * 100
    print("\n--- Result ---")
    print(f"Total VADER coverage: {coverage_pct:.2f}%")

    print("\nTop 20 words VADER don't understands:")
    for word, count in global_unknown_words.most_common(20):
        print(f"{word}: {count}")
else:
    print("No tokens fund.")

Starter analyse af 497 noder...
Error reading jasontheween: Expecting value: line 1 column 1 (char 0)

--- Result ---
Total VADER coverage: 12.71%

Top 20 words VADER don't understands:
1: 196712
w: 133856
lul: 129763
om: 120985
ww: 112988
2: 112374
😂: 109552
get: 107075
u: 102992
Õè: 99354
emirulove: 95829
😭: 84132
itskay: 83558
bro: 76999
go: 76480
game: 74752
wendol: 70126
chat: 69829
Ô∏è: 68614
subscribed: 67967


### Analysis of Lexicon Coverage on Twitch Data
Applying the VADER sentiment analysis tool to the dataset yielded a lexical coverage of only 12.71%, measured over the tokens that remain after stop words and punctuation are removed. This critically low coverage means that roughly 87% of those tokens are not in VADER's lexicon and therefore contribute nothing to the sentiment score; they are effectively treated as neutral.

Analyzing the 20 most frequent unrecognized tokens, we can see:

**Domain-Specific Slang and Emotes:** High-frequency tokens such as lul (laughter), w (win/success), and channel-specific emotes like emirulove and itskay are semantic pillars of Twitch communication but are absent from standard lexicons.

**Colloquialisms and Abbreviations:** The prevalence of shorthand such as u (you), bro, and om illustrates a highly informal, conversational register that standard models often fail to parse correctly without normalization.

**Platform Noise:** A significant portion of the "language" consists of non-conversational artifacts, including system messages (subscribed), spam (1, 2), and invisible Unicode characters (e.g., U+034F, the combining grapheme joiner, and U+FE0F, the emoji variation selector), which dilute the sentiment signal.

These findings demonstrate that Twitch chat operates with a highly specialized, internal sociolect that is significantly distinct from the general social media language (e.g., Twitter) that VADER's lexicon was built and validated on. Consequently, performing sentiment analysis using off-the-shelf VADER without substantial domain adaptation (lexicon injection) lacks validity, as the majority of sentiment-bearing tokens are being systematically ignored.