In [73]:
import networkx as nx
import matplotlib.pyplot as plt
import re
import os
import glob
import requests
import json
from collections import defaultdict

##Part 1




In [74]:
import urllib.request
response = urllib.request.urlopen('http://www.example.com/')
html = response.read()

print(html)

b'<!doctype html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href="https://iana.org/domains/example">Learn more</a></div></body></html>\n'


In [75]:
import pickle
import csv
from statistics import mean


with open('Cleaned_Rock_Musicians.txt', 'r', encoding='utf-8') as g:
    allArtists_text = g.read()


# Get all txt files in the wiki_pages directory
wiki_files = glob.glob('wiki_pages/*.txt')
print(f"Found {len(wiki_files)} txt files in wiki_pages directory")


def load_labmt_lexicon(filename='Data_Set_S1.txt'):
    """
    Loads the labMT (S1) dataset (tab-separated) and returns {word: happiness_average(float)}.
    Lines with '--' are skipped. Header must include 'word' and 'happiness_average'.
    """
    lex = {}
    with open(filename, encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            w = (row.get('word') or '').strip().lower()
            v = (row.get('happiness_average') or '').strip()
            if not w or v in ('', '--'):
                continue
            try:
                lex[w] = float(v)
            except ValueError:
                pass
    return lex

# precompile a plain-word tokenizer (letters only, case-insensitive)
_TOKEN_RE = re.compile(r"[A-Za-z]+")

# --- 2) Compute average happiness for a page's text ---
def page_happiness_average(text, lexicon):
    """
    Returns (avg_happiness, matched_count, token_count).
    Only words found in lexicon contribute to the average.
    """
    if not text:
        return (None, 0, 0)
    tokens = _TOKEN_RE.findall(text.lower())
    token_count = len(tokens)
    if token_count == 0:
        return (None, 0, 0)
    vals = [lexicon[t] for t in tokens if t in lexicon]
    if not vals:
        return (None, 0, token_count)
    return (mean(vals), len(vals), token_count)


artist_set = set()
for line in allArtists_text.strip().split('\n'):
    artist_name = line.strip()
    if artist_name:  # Skip empty lines
        artist_set.add(artist_name)

# Initialize lists to store all results
lexicon = load_labmt_lexicon("Data_Set_S1.txt")
file_results = {}

G = nx.DiGraph()

# Regular expression to match Wikipedia links
wiki_link_pattern = r'\[\[([^\[\]|]+)(?:\|[^\[\]]+)?\]\]'

url2 = 'https://raw.githubusercontent.com/LucasJuel/socialgraphs2025/main/assignments/Assignment%202/artist_genres_without_rock.json'
response2 = requests.get(url2)
artist_genres = response2.json()


# Process each file
for file_path in wiki_files:
    try:
        #print(f"\nProcessing file: {file_path}")

        artist_name = os.path.basename(file_path).replace('.txt', '')
        artist_name = artist_name.replace('_', ' ').strip()

        if artist_name not in artist_set:
            print(f"Skipping {artist_name} - not in original artist list")
            continue

        print(f"Artist name from filename: {artist_name}")
        # Read the file
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Extract all matches
        matches = re.findall(wiki_link_pattern, text)
        
        # Get unique links for this file
        unique_links = list(set(matches))
        
        # Check which links are found in the allArtists_text list
        found_in_artists = []

        # First remove common wiki markup
        clean_text = text
        clean_text = re.sub(r'\[\[.*?\]\]', ' ', clean_text)  # Remove links
        clean_text = re.sub(r'\{\{.*?\}\}', ' ', clean_text)  # Remove templates
        clean_text = re.sub(r'<.*?>', ' ', clean_text)  # Remove HTML tags
        clean_text = re.sub(r'==.*?==', ' ', clean_text)  # Remove headers
        clean_text = re.sub(r'\|.*?\|', ' ', clean_text)  # Remove table markup

        for word in clean_text.split():
            if word.startswith('File:') or word.startswith('Image:'):
                clean_text = clean_text.replace(word, ' ')

        # Count how many times each genre appears in this artist's page (case-insensitive)
        clean_text_lower = text.lower()

        genre_counts = {}
        for genre in artist_genres.get(artist_name.replace('(band)', '').strip(), []):
            # Use regex with word boundaries to match complete genre phrases
            pattern = r'\b' + re.escape(genre.lower()) + r'\b'
            matches = re.findall(pattern, clean_text_lower)
            genre_counts[genre] = len(matches)

        print("Lookey here", genre_counts)
        # Now count actual words (sequences of word characters)
        words = re.findall(r'\b\w+\b', clean_text.lower())

        word_count = len(words)
        avg, total, matched = page_happiness_average(clean_text, lexicon)

        if( artist_name not in G.nodes()):
            G.add_node(artist_name, word_count=word_count, happiness_average=avg, genres=genre_counts)
            #print(f" Is it this one Added node: {artist_name} with word count {word_count} and happiness average {avg} the genre counts are {genre_counts}  ")
        else:
            G.nodes[artist_name]['word_count'] = word_count
            G.nodes[artist_name]['happiness_average'] = avg
            G.nodes[artist_name]['genres'] = genre_counts

        
        for link in unique_links:
            if link not in artist_set:
                continue  # Skip links not containing artists
            link_normalised = link.replace('_', ' ').strip()

            if link_normalised == artist_name:
                continue  # Skip self-loops

            if link_normalised not in G.nodes():
                G.add_node(link_normalised, word_count=0, happiness_average=avg, genres=genre_counts)  # Add node with zero word count if not present
                #print(f"  Added node: {link_normalised} with word count 0 and happiness average {avg}")

            if link_normalised in artist_set:
                G.add_edge(artist_name, link_normalised)
                #print(f"  Found link to artist: {link_normalised}")
                found_in_artists.append(link_normalised)

        
        # Print results for this file
        #print(f"\n{artist_name}:")
        #print(f"  Found edges: {len(found_in_artists)}")


        #with open('rock_network.pkl', 'wb') as f:
         #   pickle.dump(G, f)

    
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

   
    





Found 489 txt files in wiki_pages directory
Artist name from filename: 10cc
Lookey here {'art rock': 3, 'art pop': 3, 'progressive pop': 2, 'soft rock': 2, 'pop rock': 1}
Artist name from filename: 10 Years (band)
Lookey here {'alternative metal': 5, 'progressive metal': 3, 'post-grunge': 3, 'nu&nbsp': 1, 'metal': 14}
Artist name from filename: 311 (band)
Lookey here {'alternative rock': 3, 'rap rock': 3, 'reggae rock': 3, 'funk rock': 3, 'funk metal': 3}
Artist name from filename: 38 Special (band)
Lookey here {'hard rock': 3, 'southern rock': 5, 'boogie rock': 1, 'blues rock': 2}
Artist name from filename: 3 Doors Down
Lookey here {'post-grunge': 3, 'alternative rock': 3}
Artist name from filename: ABBA
Lookey here {'pop music': 3, 'pop': 35, 'disco': 3, 'pop rock': 2, 'europop': 2}
Artist name from filename: Accept (band)
Lookey here {'heavy metal music': 2, 'heavy metal': 9}
Skipping AC DC - not in original artist list
Artist name from filename: Adam Ant
Lookey here {'new wave musi

In [76]:
# Basic network statistics
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
print(f"Network density: {nx.density(G):.4f}")
# print some names of nodes
print("Some node names:")
print(list(G.nodes())[:10])  # Print the first 10 node names

Number of nodes: 488
Number of edges: 7197
Network density: 0.0303
Some node names:
['10cc', 'The Clash', 'George Harrison', 'Duran Duran', 'Toto (band)', 'Ringo Starr', 'Steely Dan', 'Roxy Music', 'The Yardbirds', 'Frank Zappa']


In [77]:
degrees = [G.degree(n) for n in G.nodes()]
print(f"Average degree: {sum(degrees)/len(degrees):.2f}")
print(f"Max degree: {max(degrees)}")
print(f"Min degree: {min(degrees)}")

Average degree: 29.50
Max degree: 147
Min degree: 0


In [78]:
# Remove nodes with degree 0 (isolated nodes)
print(f"Original graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# Find nodes with degree 0
isolated_nodes = [node for node in G.nodes() if G.degree(node) == 0]
print(f"Found {len(isolated_nodes)} isolated nodes (degree 0)")

if isolated_nodes:
    print(f"Example isolated nodes: {isolated_nodes[:10]}")

# Remove isolated nodes
G.remove_nodes_from(isolated_nodes)

print(f"After removing isolated nodes: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

#Alternative: Create a new graph without isolated nodes
G_connected = G.copy()
G_connected.remove_nodes_from([node for node in G_connected.nodes() if G_connected.degree(node) == 0])

# Update the saved network
with open('rock_artists_network.pkl', 'wb') as f:
    pickle.dump(G_connected, f)
    
print("Updated network saved (without isolated nodes)")

Original graph: 488 nodes, 7197 edges
Found 4 isolated nodes (degree 0)
Example isolated nodes: ['Dr. Hook & the Medicine Show', 'Jet (Australian band)', "The B-52's", 'Van Zant (band)']
After removing isolated nodes: 484 nodes, 7197 edges
Updated network saved (without isolated nodes)


In [79]:

# Extract the largest weakly connected component
weakly_connected_components = list(nx.weakly_connected_components(G))
print(f"Number of weakly connected components: {len(weakly_connected_components)}")

# Find the largest component (by number of nodes)
component_sizes = [len(component) for component in weakly_connected_components]
print(f"Component sizes: {sorted(component_sizes, reverse=True)}")

# Get the largest component
largest_component_nodes = max(weakly_connected_components, key=len)
print(f"Largest component has {len(largest_component_nodes)} nodes")

# Create a subgraph containing only the largest component
largest_component = G.subgraph(largest_component_nodes).copy()

print(f"\nLargest component statistics:")
print(f"Number of nodes: {largest_component.number_of_nodes()}")
print(f"Number of edges: {largest_component.number_of_edges()}")
print(f"Network density: {nx.density(largest_component):.4f}")

# Show some example nodes from the largest component
print(f"\nSample nodes from largest component: {list(largest_component_nodes)[:10]}")

# Save the largest component as a separate graph
with open('largest_component.pkl', 'wb') as f:
    pickle.dump(largest_component, f)
    
print("\nLargest component saved as 'largest_component.pkl'")

Number of weakly connected components: 1
Component sizes: [484]
Largest component has 484 nodes

Largest component statistics:
Number of nodes: 484
Number of edges: 7197
Network density: 0.0308

Sample nodes from largest component: ['10cc', 'Journey (band)', 'New York Dolls', 'George Harrison', 'AC/DC', 'Duran Duran', 'Toto (band)', 'Third Eye Blind', 'Alice Cooper (band)', 'The Used']

Largest component saved as 'largest_component.pkl'
