In [8]:
# Loading necessary libraries: 

import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import time
import json
import re
from collections import Counter
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter
from PIL import Image
from io import BytesIO



In [9]:
# Function to get the title and first paragraph: 
def get_title_and_first_paragraph(page_url): 
    try:
        response = requests.get(page_url)
        response.raise_for_status()  
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('h1', {'id': 'firstHeading'}).text.strip()

        content_div = soup.find('div', {'class': 'mw-parser-output'})
        if content_div:
            paragraphs = content_div.find_all('p')
            if paragraphs:
                first_paragraph = paragraphs[0].text.strip()
            else:
                first_paragraph = None
        else:
            first_paragraph = None

        return title, first_paragraph
    except Exception as e:
        print(f"Error processing {page_url}: {e}")
        return None, None

# Function to scrape articles from a category:
def scrape_category(category_name, category_url, num_articles=50):
    articles = []
    subcategories = set()
    
    try:
        response = requests.get(category_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        links = soup.find_all('a', href=True)
        
        article_links = [
            link['href'] for link in links
            if link['href'].startswith('/wiki/')
            and not any(link['href'].startswith(prefix) for prefix in [
                '/wiki/Category:', '/wiki/Special:', '/wiki/Help:', '/wiki/Talk:', '/wiki/File:'
            ])
        ]
        
        # Filtering subcategories:
        subcategory_links = [link['href'] for link in links if link['href'].startswith('/wiki/Category:')]
        subcategories.update(subcategory_links)
        
        for link in article_links:
            if len(articles) >= num_articles:
                break
            page_url = f'https://en.wikipedia.org{link}'
            title, first_paragraph = get_title_and_first_paragraph(page_url)
            
            if title and first_paragraph:
                articles.append({
                    'Topic': category_name,
                    'Page_Title': title,
                    'First_Paragraph': first_paragraph
                })
            
            time.sleep(1)

        # Trying subcategories: 
        if len(articles) < num_articles:
            print(f"Fetching articles from subcategories for {category_name}...")
            for subcategory in subcategories:
                if len(articles) >= num_articles:
                    break
                subcategory_url = f'https://en.wikipedia.org{subcategory}'
                articles.extend(scrape_category_from_subcategory(category_name, subcategory_url, num_articles - len(articles)))
        
        print(f"Scraped {len(articles)} articles for {category_name}")
        return articles
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching category page {category_url}: {e}")
        return []

# Function to scrape articles from subcategories: 
def scrape_category_from_subcategory(category_name, subcategory_url, num_articles):
    articles = []
    try:
        response = requests.get(subcategory_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        links = soup.find_all('a', href=True)
        article_links = [
            link['href'] for link in links
            if link['href'].startswith('/wiki/')
            and not any(link['href'].startswith(prefix) for prefix in [
                '/wiki/Category:', '/wiki/Special:', '/wiki/Help:', '/wiki/Talk:', '/wiki/File:'
            ])
        ]
        
        for link in article_links:
            if len(articles) >= num_articles:
                break
            page_url = f'https://en.wikipedia.org{link}'
            title, first_paragraph = get_title_and_first_paragraph(page_url)
            
            if title and first_paragraph:
                articles.append({
                    'Topic': category_name,
                    'Page_Title': title,
                    'First_Paragraph': first_paragraph
                })
            
            time.sleep(1)

        return articles
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching subcategory page {subcategory_url}: {e}")
        return []

# Main scraping function: 
def scrape_wikipedia():
    category_urls = {
        'History': 'https://en.wikipedia.org/wiki/Category:History',
        'Sports': 'https://en.wikipedia.org/wiki/Category:Sports',
        'Technology': 'https://en.wikipedia.org/wiki/Category:Technology',
        'Politics': 'https://en.wikipedia.org/wiki/Category:Politics'
    }

    all_data = []
    
    for topic, url in category_urls.items():
        print(f"Scraping articles for {topic}...")
        articles = scrape_category(topic, url)
        all_data.extend(articles)
    
    df = pd.DataFrame(all_data)
    print(df.head())
    
    return df

df = scrape_wikipedia()


Scraping articles for History...
Fetching articles from subcategories for History...
Scraped 50 articles for History
Scraping articles for Sports...
Fetching articles from subcategories for Sports...
Scraped 50 articles for Sports
Scraping articles for Technology...
Fetching articles from subcategories for Technology...
Scraped 50 articles for Technology
Scraping articles for Politics...
Scraped 50 articles for Politics
     Topic                          Page_Title  \
0  History                           Main Page   
1  History        Wikipedia:File upload wizard   
2  History                           Main Page   
3  History               Category talk:History   
4  History  Library of Congress Classification   

                                     First_Paragraph  
0  Margaret Sibella Brown (1866–1961) was a Canad...  
1  Thank you for offering to contribute an image ...  
2  Margaret Sibella Brown (1866–1961) was a Canad...  
3  A quick way to see the subcategories and artic...  


In [10]:
df

Unnamed: 0,Topic,Page_Title,First_Paragraph
0,History,Main Page,Margaret Sibella Brown (1866–1961) was a Canad...
1,History,Wikipedia:File upload wizard,Thank you for offering to contribute an image ...
2,History,Main Page,Margaret Sibella Brown (1866–1961) was a Canad...
3,History,Category talk:History,A quick way to see the subcategories and artic...
4,History,Library of Congress Classification,The Library of Congress Classification (LCC) i...
...,...,...,...
195,Politics,Political posturing,"Political posturing, also known as political g..."
196,Politics,Outline of political science,The following outline is provided as an overvi...
197,Politics,Politically exposed person,"In financial regulation, a politically exposed..."
198,Politics,Politicisation,Politicisation (also politicization; see Engli...


In [11]:
df.to_json('/Users/samreade/Desktop/COMM 188C/wikipedia_data.json', orient='records', lines=True)

In [26]:

def simple_tokenizer(text):
    tokens = re.findall(r'\b\w+\b', text.lower()) 
    return tokens

df['Title_Tokens'] = df['Page_Title'].apply(simple_tokenizer)
df['Paragraph_Tokens'] = df['First_Paragraph'].apply(simple_tokenizer)

print(df[['Topic', 'Page_Title', 'Title_Tokens', 'Paragraph_Tokens']].head())


     Topic                          Page_Title  \
0  History                           Main Page   
1  History        Wikipedia:File upload wizard   
2  History                           Main Page   
3  History               Category talk:History   
4  History  Library of Congress Classification   

                              Title_Tokens  \
0                             [main, page]   
1        [wikipedia, file, upload, wizard]   
2                             [main, page]   
3                [category, talk, history]   
4  [library, of, congress, classification]   

                                    Paragraph_Tokens  
0  [margaret, sibella, brown, 1866, 1961, was, a,...  
1  [thank, you, for, offering, to, contribute, an...  
2  [margaret, sibella, brown, 1866, 1961, was, a,...  
3  [a, quick, way, to, see, the, subcategories, a...  
4  [the, library, of, congress, classification, l...  


In [27]:

# Defining stopwords: 
stopwords = set([
    'the', 'and', 'for', 'with', 'a', 'an', 'in', 'of', 'on', 'to', 'by', 'is', 'are',
    'was', 'were', 'be', 'this', 'that', 'it', 'as', 'at', 'from', 'or', 'but', 'if',
    'then', 'so', 'than', 'such', 'its', 'into', 'out', 'up', 'down', 'about', 'after',
    'before', 'over', 'under', 'again', 'against', 'between', 'during', 'without', 'within',
    'has', 'have', 'had', 'do', 'does', 'did', 'can', 'could', 'would', 'should', 'may', 'might'
])

# Cleaning and tokenizing:
def clean_tokenizer(text):
    tokens = re.findall(r'\b\w+\b', text.lower())  
    cleaned = [word for word in tokens if word not in stopwords and len(word) > 2]
    return cleaned

df['Title_Tokens'] = df['Page_Title'].apply(clean_tokenizer)
df['Paragraph_Tokens'] = df['First_Paragraph'].apply(clean_tokenizer)

print(df[['Page_Title', 'Title_Tokens', 'Paragraph_Tokens']].head())


                           Page_Title                         Title_Tokens  \
0                           Main Page                         [main, page]   
1        Wikipedia:File upload wizard    [wikipedia, file, upload, wizard]   
2                           Main Page                         [main, page]   
3               Category talk:History            [category, talk, history]   
4  Library of Congress Classification  [library, congress, classification]   

                                    Paragraph_Tokens  
0  [margaret, sibella, brown, 1866, 1961, canadia...  
1  [thank, you, offering, contribute, image, othe...  
2  [margaret, sibella, brown, 1866, 1961, canadia...  
3  [quick, way, see, subcategories, articles, try...  
4  [library, congress, classification, lcc, syste...  


In [28]:

# Collecting unique terms and building per-document term frequency:
term_counts = []
vocab = set()

for tokens in df['Paragraph_Tokens']:
    counter = Counter(tokens)
    term_counts.append(counter)
    vocab.update(counter.keys())

vocab = sorted(vocab)
dtm_data = []

for counter in term_counts:
    row = [counter.get(term, 0) for term in vocab]
    dtm_data.append(row)

# Creating DTM:
dtm_df = pd.DataFrame(dtm_data, columns=vocab)
dtm_df['Topic'] = df['Topic'].values
dtm_df['Title'] = df['Page_Title'].values

print(dtm_df.head())


   000  10th  17th  1800s  1817  1820s  1825  1826  1832  1833  ...  yum  \
0    0     0     0      0     0      0     0     0     0     0  ...    0   
1    0     0     0      0     0      0     0     0     0     0  ...    0   
2    0     0     0      0     0      0     0     0     0     0  ...    0   
3    0     0     0      0     0      0     0     0     0     0  ...    0   
4    0     0     0      0     0      0     0     0     0     0  ...    0   

   zealand  zipper  ледокол  скопје  上杉謙信女性説  政治制度化  政治現代化    Topic  \
0        0       0        0       0        0      0      0  History   
1        0       0        0       0        0      0      0  History   
2        0       0        0       0        0      0      0  History   
3        0       0        0       0        0      0      0  History   
4        0       0        0       0        0      0      0  History   

                                Title  
0                           Main Page  
1        Wikipedia:File upload wizar

In [29]:

def cosine_similarity_manual(matrix):
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norm_matrix = matrix / norms
    similarity = np.dot(norm_matrix, norm_matrix.T)
    return similarity


In [30]:
tfidf_values = dtm_df.drop(columns=['Topic', 'Title']).values

similarity_matrix = cosine_similarity_manual(tfidf_values)


In [31]:
similarity_matrix

array([[1.        , 0.        , 1.        , ..., 0.03162278, 0.04364358,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 1.        , ..., 0.03162278, 0.04364358,
        0.        ],
       ...,
       [0.03162278, 0.        , 0.03162278, ..., 1.        , 0.11041049,
        0.03922323],
       [0.04364358, 0.        , 0.04364358, ..., 0.11041049, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.03922323, 0.        ,
        1.        ]])

In [32]:
thresholds = [0.2, 0.4, 0.6, 0.8]

for threshold in thresholds:
    
    binary_matrix = (similarity_matrix >= threshold).astype(int)

    fig = go.Figure(data=go.Heatmap(
        z=binary_matrix,  
        colorscale='Blues',  
        colorbar=dict(title="Similarity"),  
        showscale=True,  
    ))

    fig.update_layout(
        title=f'Similarity Matrix at Threshold {threshold}',
        xaxis_title="Document Index",
        yaxis_title="Document Index",
        height=600,  
    )

    fig.show()

In [None]:
# The similarity matrices show the documents compared to each other by copsine similarity. There are 200 wikipedia pages that are being compared to each other. In the matrices the 
# first 50 represent history pages, the next 50 are sports pages, then technology pages, and the last 50 are politics pages. We can see in the 0.2 threshold matrix that many more
# pages are similar. Mainly for the political pages there are many that are similar to each other. However, as the threshold raises we can see that the political pages are not very
# similar. There also seems to be strong groups that have simirities to each other mainly around the document indexes 25, 50, 100, amnd 150. 

In [33]:
true_labels = df['Topic'].tolist()  

In [34]:
predicted_labels = []

for i in range(len(df)):  
   
    similar_doc_index = similarity_matrix[i].argsort()[-2]  
    predicted_labels.append(true_labels[similar_doc_index])  



In [35]:

categories = ['History', 'Politics', 'Technology', 'Sports']

cm = np.zeros((len(categories), len(categories)), dtype=int)

for t, p in zip(true_labels, predicted_labels):
    t_idx = categories.index(t)  
    p_idx = categories.index(p)  
    cm[t_idx, p_idx] += 1  

cm_df = pd.DataFrame(cm, index=categories, columns=categories)
cm_df


Unnamed: 0,History,Politics,Technology,Sports
History,30,5,4,11
Politics,6,36,5,3
Technology,1,8,34,7
Sports,5,2,6,37


In [36]:


row_normalized_cm = cm_df.div(cm_df.sum(axis=1), axis=0)

column_normalized_cm = cm_df.div(cm_df.sum(axis=0), axis=1)

categories = cm_df.columns

fig_cm = go.Figure(data=go.Heatmap(
    z=cm_df.values,
    x=cm_df.columns,
    y=cm_df.index,
    colorscale='Viridis',
    zmin=0, zmax=cm_df.values.max(),
    colorbar=dict(title="Count")
))

fig_cm.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Labels",
    yaxis_title="True Labels",
    xaxis=dict(tickmode='array', tickvals=categories),
    yaxis=dict(tickmode='array', tickvals=categories)
)

fig_row_norm = go.Figure(data=go.Heatmap(
    z=row_normalized_cm.values,
    x=row_normalized_cm.columns,
    y=row_normalized_cm.index,
    colorscale='Viridis',
    zmin=0, zmax=1,
    colorbar=dict(title="Proportion")
))

fig_row_norm.update_layout(
    title="Row-normalized Confusion Matrix",
    xaxis_title="Predicted Labels",
    yaxis_title="True Labels",
    xaxis=dict(tickmode='array', tickvals=categories),
    yaxis=dict(tickmode='array', tickvals=categories)
)

fig_col_norm = go.Figure(data=go.Heatmap(
    z=column_normalized_cm.values,
    x=column_normalized_cm.columns,
    y=column_normalized_cm.index,
    colorscale='Viridis',
    zmin=0, zmax=1,
    colorbar=dict(title="Proportion")
))

fig_col_norm.update_layout(
    title="Column-normalized Confusion Matrix",
    xaxis_title="Predicted Labels",
    yaxis_title="True Labels",
    xaxis=dict(tickmode='array', tickvals=categories),
    yaxis=dict(tickmode='array', tickvals=categories)
)

fig_cm.show()
fig_row_norm.show()
fig_col_norm.show()


In [38]:

def create_clusters(similarity_matrix, threshold):
    n = len(similarity_matrix)
    visited = [False] * n
    clusters = []

    for i in range(n):
        if visited[i]:
            continue
        cluster = [i]
        visited[i] = True
        for j in range(n):
            if not visited[j] and similarity_matrix[i][j] > threshold:
                cluster.append(j)
                visited[j] = True
        clusters.append(cluster)
    return clusters

def generate_plotly_word_bar(cluster_indices, df, cluster_num):
    word_counts = Counter()
    for idx in cluster_indices:
        tokens = df.loc[idx, 'Paragraph_Tokens']
        word_counts.update(tokens)

    most_common = word_counts.most_common(20)
    words, freqs = zip(*most_common) if most_common else ([], [])

    fig = px.bar(
        x=words,
        y=freqs,
        title=f"Top Words in Cluster {cluster_num + 1}",
        labels={"x": "Word", "y": "Frequency"}
    )
    fig.update_layout(xaxis_tickangle=-45)
    fig.show()

threshold = 0.05

# Creating the clusters
clusters = create_clusters(similarity_matrix, threshold)

# Visualizing each cluster:
for i, cluster_indices in enumerate(clusters):
    print(f"Cluster {i + 1} with {len(cluster_indices)} articles")
    generate_plotly_word_bar(cluster_indices, df, i)


Cluster 1 with 28 articles


Cluster 2 with 26 articles


Cluster 3 with 4 articles


Cluster 4 with 21 articles


Cluster 5 with 12 articles


Cluster 6 with 6 articles


Cluster 7 with 5 articles


Cluster 8 with 1 articles


Cluster 9 with 16 articles


Cluster 10 with 5 articles


Cluster 11 with 4 articles


Cluster 12 with 3 articles


Cluster 13 with 1 articles


Cluster 14 with 4 articles


Cluster 15 with 8 articles


Cluster 16 with 2 articles


Cluster 17 with 1 articles


Cluster 18 with 2 articles


Cluster 19 with 3 articles


Cluster 20 with 7 articles


Cluster 21 with 1 articles


Cluster 22 with 2 articles


Cluster 23 with 2 articles


Cluster 24 with 2 articles


Cluster 25 with 2 articles


Cluster 26 with 3 articles


Cluster 27 with 5 articles


Cluster 28 with 1 articles


Cluster 29 with 5 articles


Cluster 30 with 3 articles


Cluster 31 with 1 articles


Cluster 32 with 2 articles


Cluster 33 with 1 articles


Cluster 34 with 1 articles


Cluster 35 with 3 articles


Cluster 36 with 1 articles


Cluster 37 with 1 articles


Cluster 38 with 1 articles


Cluster 39 with 1 articles


Cluster 40 with 1 articles


Cluster 41 with 1 articles


Cluster 42 with 1 articles


In [None]:

def generate_wordcloud_plotly(cluster_indices, df, cluster_num):
    # Collect all tokens from the specified cluster indices
    all_tokens = []
    for idx in cluster_indices:
        all_tokens.extend(df.loc[idx, 'Paragraph_Tokens'])
    
    # Count word frequencies
    word_counts = Counter(all_tokens)
    most_common = word_counts.most_common(100)  # Get the 100 most common words
    words, freqs = zip(*most_common) if most_common else ([], [])

    # Generate random positions for words
    positions = []
    for _ in range(len(words)):
        x = random.uniform(-1, 1)  # Random x position
        y = random.uniform(-1, 1)  # Random y position
        positions.append((x, y))

    # Scale the font size based on frequency (larger frequency -> larger font)
    max_freq = max(freqs) if freqs else 1
    font_sizes = [20 + (freq / max_freq) * 80 for freq in freqs]

    # Create a scatter plot for the word cloud
    fig = go.Figure()

    for word, (x, y), font_size in zip(words, positions, font_sizes):
        fig.add_trace(go.Scatter(
            x=[x], y=[y],
            text=word,
            mode='text',
            textfont=dict(size=font_size, family="Arial", color="black"),
            showlegend=False
        ))

    # Update layout
    fig.update_layout(
        title=f"Word Cloud for Cluster {cluster_num + 1}",
        xaxis=dict(showgrid=False, zeroline=False, range=[-1, 1]),
        yaxis=dict(showgrid=False, zeroline=False, range=[-1, 1]),
        showlegend=False,
        plot_bgcolor='white',
        margin=dict(l=0, r=0, t=40, b=0)
    )

    fig.show()


In [46]:
for i, cluster_indices in enumerate(clusters):
    print(f"Cluster {i + 1} with {len(cluster_indices)} articles")
    generate_wordcloud_plotly(cluster_indices, df, i) 


Cluster 1 with 28 articles


Cluster 2 with 26 articles


Cluster 3 with 4 articles


Cluster 4 with 21 articles


Cluster 5 with 12 articles


Cluster 6 with 6 articles


Cluster 7 with 5 articles


Cluster 8 with 1 articles


Cluster 9 with 16 articles


Cluster 10 with 5 articles


Cluster 11 with 4 articles


Cluster 12 with 3 articles


Cluster 13 with 1 articles


Cluster 14 with 4 articles


Cluster 15 with 8 articles


Cluster 16 with 2 articles


Cluster 17 with 1 articles


Cluster 18 with 2 articles


Cluster 19 with 3 articles


Cluster 20 with 7 articles


Cluster 21 with 1 articles


Cluster 22 with 2 articles


Cluster 23 with 2 articles


Cluster 24 with 2 articles


Cluster 25 with 2 articles


Cluster 26 with 3 articles


Cluster 27 with 5 articles


Cluster 28 with 1 articles


Cluster 29 with 5 articles


Cluster 30 with 3 articles


Cluster 31 with 1 articles


Cluster 32 with 2 articles


Cluster 33 with 1 articles


Cluster 34 with 1 articles


Cluster 35 with 3 articles


Cluster 36 with 1 articles


Cluster 37 with 1 articles


Cluster 38 with 1 articles


Cluster 39 with 1 articles


Cluster 40 with 1 articles


Cluster 41 with 1 articles


Cluster 42 with 1 articles
