In [10]:
# Libraries (will need to export to requirements.txt)

import glob
import json
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import wikipedia
from nltk.corpus import stopwords
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm
from wordcloud import WordCloud
 
# Fetch the NLTK resources used below: the English stop-word list and the
# 'punkt' tokenizer data (presumably what nltk.word_tokenize and
# nltk.sent_tokenize rely on -- confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/oumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/oumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Pull 100 biographies from wikipedia for each category

In [11]:
# Directory where fetched biographies (.txt) and RDF triples (.json) are written.
DATA_PATH = 'data/'
# DBpedia resource names of the two occupations to compare; get_people
# interpolates them after the dbr: prefix.
# NOTE(review): the recorded run of create_dataset fetched only 1 and 0
# biographies for these values, and later cells hard-code 'Rabbi' and
# 'Drag_queen' instead -- confirm which pair of categories is intended.
CATEGORY1 = 'Sculptors'
CATEGORY2 = 'Computer_scientists'

In [12]:
def get_people(profession):
    """
    Query DBpedia for every resource whose dbo:occupation is the given profession.
    Parameters:
    - profession = occupation name; spaces become underscores and apostrophes
      become %27 before it is appended to the dbr: prefix.
    Output:
    - list of SPARQL JSON bindings, one per distinct ?person
    """
    resource_name = profession.replace(' ', '_').replace("'", '%27')

    query_template = """
            Select Distinct ?person Where {
                ?person dbo:occupation dbr:%s .
            }
            """

    endpoint = SPARQLWrapper("https://dbpedia.org/sparql")
    endpoint.setQuery(query_template % resource_name)
    endpoint.setReturnFormat(JSON)
    response = endpoint.queryAndConvert()
    return response['results']['bindings']

In [13]:
def get_rdf_triples(person):
    """
    Fetch every (predicate, object) pair DBpedia holds for one resource.
    Parameters:
    - person = resource name appended to http://dbpedia.org/resource/
    Output:
    - list of dicts with 'subject', 'predicate' and 'object' keys; the subject
      is the resource URI wrapped in angle brackets, the other two are the
      'value' strings of the SPARQL bindings.
    """
    resource_uri = "http://dbpedia.org/resource/" + person
    subject = "<" + resource_uri + ">"

    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery("SELECT ?predicate ?object WHERE { " + subject + " ?predicate ?object.}")
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()["results"]["bindings"]

    return [
        {
            'subject': subject,
            'predicate': binding["predicate"]["value"],
            'object': binding["object"]["value"],
        }
        for binding in bindings
    ]

In [14]:
def get_biographies(people, category, limit):
    """
    Download up to `limit` Wikipedia biographies and their DBpedia triples.

    For each person the page text is saved as <person>_<category>.txt and the
    triples as <person>_<category>.json under DATA_PATH.
    Parameters:
    - people = SPARQL bindings as returned by get_people
    - category = category label used in the file names (spaces -> underscores)
    - limit = maximum number of biographies to collect
    Output:
    - pages = list of page texts that were fetched and saved successfully
    """
    if not os.path.exists(DATA_PATH):
        print('Data Directory did not exist, creating it now...')
        os.makedirs(DATA_PATH, exist_ok=True)

    category_suffix = category.replace(' ', '_')
    pages = []
    with tqdm(total=limit) as pbar:
        for entry in people:
            if len(pages) >= limit:
                break
            person = entry['person']['value'].split('/')[-1]
            file_path = os.path.join(DATA_PATH, person + '_' + category_suffix)
            try:
                page = wikipedia.page(person).content
                triples = get_rdf_triples(person)
                # Page text can contain any Unicode character, so do not rely
                # on the platform default encoding.
                with open(file_path + '.txt', 'w', encoding='utf-8') as f:
                    f.write(page)
                with open(file_path + '.json', 'w') as f:
                    json.dump(triples, f, indent=4)
            except Exception as e:
                # Still best effort (one bad page must not abort the crawl),
                # but report the reason instead of silently dropping it.
                tqdm.write(f'Skipping {person}: {type(e).__name__}: {e}')
                continue
            # Only count a biography once both of its files are on disk, so
            # the returned texts and the saved data stay in sync.
            pages.append(page)
            pbar.update(1)
    return pages

In [15]:
def create_dataset(cat1, cat2, limit=999):
    """
    Build the labelled biography corpus for two categories.
    Parameters:
    - cat1, cat2 = category names passed to get_people / get_biographies
    - limit = maximum number of biographies fetched per category
    Output:
    - df = DataFrame with a 'text' column (cat1 texts first, then cat2 texts)
      and a matching 'category' column
    - texts1, texts2 = the per-category text lists
    """
    people1 = get_people(cat1)
    people2 = get_people(cat2)
    texts1 = get_biographies(people1, cat1, limit)
    texts2 = get_biographies(people2, cat2, limit)

    labels = [cat1] * len(texts1) + [cat2] * len(texts2)
    df = pd.DataFrame({'text': texts1 + texts2})
    df['category'] = pd.Series(labels)
    return df, texts1, texts2

In [17]:
# Build the corpus: up to 100 biographies per category. Text and triple files
# are written under DATA_PATH as a side effect of get_biographies.
df, texts1, texts2 = create_dataset(CATEGORY1, CATEGORY2, limit=100)

  1%|          | 1/100 [00:01<01:48,  1.10s/it]
  0%|          | 0/100 [00:00<?, ?it/s]


In [None]:
def clean_text_complete(text):
    """
    Replace punctuation-like characters with spaces and lowercase the text.
    Parameters:
    - text = raw string
    Output:
    - lowercased string in which each of {}[]()|\\/=+`"*,.<>;:#$%&' has been
      replaced by a single space (no whitespace collapsing is done)
    """
    # The backslash is escaped explicitly: a bare "\/" is an invalid escape
    # sequence that recent Python versions warn about.
    dirty_chars = '{}[]()|\\/=+`"*,.<>;:#$%&' + "'"
    to_space = str.maketrans(dirty_chars, ' ' * len(dirty_chars))
    return text.translate(to_space).lower()

In [None]:
def tokenize(text):
    """Split a text into word tokens using nltk.word_tokenize."""
    tokens = nltk.word_tokenize(text)
    return tokens

In [None]:
# Preview the first rows of the raw corpus ('text' and 'category' columns).
df.head()

In [None]:
# Add a punctuation-free, lowercased copy of every biography.
df['clean_text'] = df['text'].apply(clean_text_complete)

In [None]:
# Tokenize the cleaned text; each row of 'tokenized' holds a list of tokens.
df['tokenized'] = df['clean_text'].apply(tokenize)

In [None]:
# Find the 50 most common words per category and plot them as a word cloud

In [None]:
def get_all_tokens(df, category):
    """
    Flatten the 'tokenized' lists of every row of one category into one list.
    Parameters:
    - df = DataFrame with 'category' and 'tokenized' columns
    - category = value of 'category' selecting the rows to use
    Output:
    - list of tokens, in row order
    """
    in_category = df["category"] == category
    return [
        token
        for row_tokens in df[in_category]['tokenized']
        for token in row_tokens
    ]

In [None]:
# One flat token list per category.
tokens1 = get_all_tokens(df, CATEGORY1)
tokens2 = get_all_tokens(df, CATEGORY2)

In [None]:
def remove_stopwords(tokens, stop_words):
    '''
    This function removes stopwords from a list of tokens.
    Parameters:
    - tokens = list of tokens (left unmodified).
    - stop_words = list of stopwords
    Output: 
    - new list of tokens without stopwords, in the original order
    '''
    # Build a new list instead of calling tokens.remove() while iterating:
    # removing during iteration skips the element after each removal, so
    # consecutive stopwords survived, and it also altered the caller's list.
    stop_set = set(stop_words)
    return [token for token in tokens if token not in stop_set]

In [None]:
# English stop-word list, stored in a variable so it can be passed to
# remove_stopwords.
stopwords_en = stopwords.words('english')

In [None]:
# Token lists with English stop words filtered out (input for the word clouds).
tokens1_nostop = remove_stopwords(tokens1, stopwords_en)
tokens2_nostop = remove_stopwords(tokens2, stopwords_en)

In [None]:
def make_wordcloud(tokens, n_words):
    """
    Plot a word cloud of the `n_words` most frequent tokens.
    Parameters:
    - tokens = list of tokens
    - n_words = number of most common tokens to include
    Output:
    - None; the figure is shown with matplotlib.

    NOTE(review): a later cell defines make_wordcloud(properties_counter,
    category), which replaces this function once that cell has run.
    """
    # Use the `Counter` name the notebook imports; the bare `collections`
    # module is not imported, so `collections.Counter` raised NameError.
    word_frequencies = dict(Counter(tokens).most_common(n_words))
    if not word_frequencies:
        # WordCloud cannot be generated from an empty frequency table.
        print('No tokens to plot.')
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
# Word cloud of the 50 most frequent non-stopword tokens of category 1.
make_wordcloud(tokens1_nostop, 50)

In [None]:
# Word cloud of the 50 most frequent non-stopword tokens of category 2.
make_wordcloud(tokens2_nostop, 50)

In [None]:
# Create histogram and boxplot for sentence length per category with min/max/avg

Attention:
The results in this section changed after I fixed the stopwords function.
We need to review this carefully, because I don't see the tokens being used here.

In [None]:
# Preview the frame again, now including 'clean_text' and 'tokenized'.
df.head()

In [None]:
def get_sentences(series):
    """
    Split every text of a series into sentences.

    Each text is first cut at newlines, then each resulting paragraph is
    passed to nltk.sent_tokenize.
    Parameters:
    - series = iterable of strings
    Output:
    - flat list of sentences, in document order
    """
    return [
        sentence
        for text in series
        for paragraph in text.split('\n')
        for sentence in nltk.sent_tokenize(paragraph)
    ]

In [None]:
# Sentences are taken from the raw 'text' column, not from the cleaned or
# tokenized columns.
sentences1 = get_sentences(df[df['category'] == CATEGORY1]['text']) 
sentences2 = get_sentences(df[df['category'] == CATEGORY2]['text'])

In [None]:
def get_length_dict(sentences):
    """
    Collect sentence-length statistics, measuring length as the number of
    space-separated pieces of a sentence.
    Parameters:
    - sentences = iterable of strings
    Output:
    - lengths = dict mapping a length to the number of sentences of that length
    - expanded = flat list with one length entry per sentence, grouped by length
    - longest_sentence = first sentence with the maximum length ("" if no input)
    - shortest_sentence = first sentence with the minimum length ("" if no input)
    """
    sentences = list(sentences)
    if not sentences:
        return {}, [], "", ""

    word_counts = [len(sentence.split(' ')) for sentence in sentences]

    # Take the extremes from the data itself. The previous placeholder strings
    # took part in the comparison, so a six-word placeholder was returned as
    # "shortest" whenever every real sentence had six or more words, and a
    # one-word sentence could never become the longest.
    longest_sentence = sentences[word_counts.index(max(word_counts))]
    shortest_sentence = sentences[word_counts.index(min(word_counts))]

    lengths = dict(Counter(word_counts))
    expanded = [length for length, count in lengths.items() for _ in range(count)]
    return lengths, expanded, longest_sentence, shortest_sentence

In [None]:
# Per category: length histogram dict, flat list of lengths, and the
# longest / shortest sentence.
length_sentence1, expanded1, longest_sentence1, shortest_sentence1 = get_length_dict(sentences1)
length_sentence2, expanded2, longest_sentence2, shortest_sentence2 = get_length_dict(sentences2)

In [None]:
# Compute the statistics from the per-sentence length lists. The histogram
# dicts (length_sentence1/2) only hold each distinct length once as a key, so
# averaging over them ignored how many sentences have each length.
for cat_label, sentence_lengths in (('1', expanded1), ('2', expanded2)):
    if not sentence_lengths:
        # min()/max() and the mean are undefined without any sentence.
        print(f'No sentences found for Category {cat_label}')
        continue
    print(f'Min Sentence Length Category {cat_label}:', min(sentence_lengths))
    print(f'Max Sentence Length Category {cat_label}:', max(sentence_lengths))
    print(f'Avg Sentence Length Category {cat_label}:', sum(sentence_lengths) / len(sentence_lengths))

In [None]:
# Box plot of the per-sentence lengths of category 1.
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(expanded1)
ax.set_title("Box Plot of the Sentence Length of Category 1")
ax.set_ylabel("Length of Sentence")
plt.show()

In [None]:
# Box plot of the per-sentence lengths of category 2.
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(expanded2)
ax.set_title("Box Plot of the Sentence Length of Category 2")
ax.set_ylabel("Length of Sentence")
plt.show()

In [None]:
# Histogram of the per-sentence lengths of category 1.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(expanded1)
ax.set_title("Histogram of the Sentence Length of Category 1")
ax.set_ylabel("Frequency")
ax.set_xlabel("Length of Sentence (Words)")
plt.show()

In [None]:
# Histogram of the per-sentence lengths of category 2.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(expanded2)
ax.set_title("Histogram of the Sentence Length of Category 2")
ax.set_ylabel("Frequency")
ax.set_xlabel("Length of Sentence (Words)")
plt.show()

In [None]:
# ?? Total number of bi-gram occurrences per category. min/max/avg

In [None]:
def collect_bigrams(tokens):
    """
    Count how often each bigram produced by nltk.bigrams occurs.
    Parameters:
    - tokens = sequence of tokens
    Output:
    - dict mapping each bigram to its number of occurrences
    """
    counts = {}
    for pair in nltk.bigrams(tokens):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

In [None]:
# Bigram counts over the token list of each category.
bigrams1 = collect_bigrams(tokens1)
bigrams2 = collect_bigrams(tokens2)

In [None]:
# Bigram counts of the longest sentence per category (split on single spaces,
# not with the NLTK tokenizer).
bigrams_longest1 = collect_bigrams(longest_sentence1.split(' '))
bigrams_longest2 = collect_bigrams(longest_sentence2.split(' '))

In [None]:
# Bigram counts of the shortest sentence per category (split on single spaces,
# not with the NLTK tokenizer).
bigrams_shortest1 = collect_bigrams(shortest_sentence1.split(' '))
bigrams_shortest2 = collect_bigrams(shortest_sentence2.split(' '))

In [None]:
# Report the number of distinct bigrams per category, and within each
# category's shortest and longest sentence.
print("Number of unique bigram occurences in Category 1:", len(bigrams1))
print("\tUnique bigrams in shortest:\t\t", len(bigrams_shortest1))
print("\tUnique bigrams in longest:\t\t", len(bigrams_longest1))
print("Number of unique bigram occurences in Category 2:", len(bigrams2))
# The Category 2 lines previously repeated the Category 1 values.
print("\tUnique bigrams in shortest:\t\t", len(bigrams_shortest2))
print("\tUnique bigrams in longest:\t\t", len(bigrams_longest2))

In [None]:
def load_data(category):
    """
    Load and concatenate every JSON triple file of a category.
    Parameters:
    - category = substring that must appear in the file name; files are
      matched with ./data/*<category>*.json
    Output:
    - single list holding the contents of all matching files
    """
    triples = []
    for json_path in glob.glob(f'./data/*{category}*.json'):
        with open(json_path, 'r') as handle:
            triples += json.load(handle)
    return triples

def format_property_name(uri):
    """
    Turn a predicate URI into a short display name.

    Keeps what follows the last '/' and the last '#', turns underscores into
    spaces, then applies a fixed set of substring replacements.
    Parameters:
    - uri = predicate URI string
    Output:
    - display name string
    """
    local_name = uri.rsplit('/', 1)[-1].rsplit('#', 1)[-1].replace('_', ' ')
    display_forms = (
        ('PageID', 'Page ID'),
        ('URL', 'URL'),
        ('birthDate', 'Birth Date'),
        ('deathDate', 'Death Date'),
        ('birthPlace', 'Birth Place'),
        ('deathPlace', 'Death Place'),
        ('sameAs', 'Same As'),
    )
    for raw, pretty in display_forms:
        local_name = local_name.replace(raw, pretty)
    return local_name

def count_properties(data, exclude):
    """
    Count how often each predicate occurs, by display name.
    Parameters:
    - data = iterable of triple dicts with a 'predicate' key
    - exclude = collection of predicate URIs to leave out
    Output:
    - Counter mapping format_property_name(predicate) to its number of triples
    """
    kept_predicates = (
        triple['predicate']
        for triple in data
        if triple['predicate'] not in exclude
    )
    return Counter(format_property_name(uri) for uri in kept_predicates)

def make_wordcloud(properties_counter, category):
    """
    Plot a titled word cloud from a property-frequency mapping.
    Parameters:
    - properties_counter = mapping of property name to frequency
    - category = category name shown in the figure title
    Output:
    - None; the figure is shown with matplotlib.

    NOTE(review): this reuses the name of the earlier
    make_wordcloud(tokens, n_words) and replaces it once this cell has run.
    """
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud.generate_from_frequencies(properties_counter)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for {category}')
    plt.show()
# Predicates excluded because they are not relevant for describing the person.
# The resulting word clouds are still not very readable.
exclusions = [
    "http://dbpedia.org/ontology/wikiPageExternalLink",
    "http://dbpedia.org/ontology/wikiPageRevisionID",
    "http://dbpedia.org/ontology/wikiPageWikiLink",
    "http://dbpedia.org/property/wikiPageUsesTemplate",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
    "http://www.w3.org/2000/01/rdf-schema#label",
    "http://www.w3.org/2000/01/rdf-schema#comment",
    "http://dbpedia.org/ontology/wikiPageLength",
    "http://dbpedia.org/ontology/wikiPageID",
    "http://xmlns.com/foaf/0.1/isPrimaryTopicOf",
    "http://www.w3.org/ns/prov#wasDerivedFrom"
]

# NOTE(review): these hard-coded categories differ from CATEGORY1/CATEGORY2,
# which are what create_dataset used when writing the data files -- confirm
# which pair is intended.
for category in ['Rabbi', 'Drag_queen']:
    data = load_data(category)
    properties = count_properties(data, exclusions)
    if not properties:
        # WordCloud cannot be generated from an empty frequency table.
        print(f'No properties found for {category}; skipping its word cloud.')
        continue
    make_wordcloud(properties, category)


In [None]:
# Create histograms and boxplot for number of facts per category. min/max/avg

def load_and_count_facts(category):
    """
    Count the distinct triples stored in each JSON file of a category.
    Parameters:
    - category = substring that must appear in the file name; files are
      matched with ./data/*<category>*.json
    Output:
    - list with one entry per file: its number of distinct
      (subject, predicate, object) triples
    """
    fact_counts = []
    for json_path in glob.glob(f'./data/*{category}*.json'):
        with open(json_path, 'r') as handle:
            triples = json.load(handle)
        distinct = {(t['subject'], t['predicate'], t['object']) for t in triples}
        fact_counts.append(len(distinct))
    return fact_counts

def visualize_data(categories):
    """
    Print min/max/average fact counts per category and plot their
    distribution as a histogram and a box plot.
    Parameters:
    - categories = iterable of category names passed to load_and_count_facts
    Output:
    - None; statistics are printed and figures shown with matplotlib.
    """
    for category in categories:
        fact_counts = load_and_count_facts(category)
        if not fact_counts:
            # min()/max() raise and the mean divides by zero without data.
            print(f"No fact files found for {category}; skipping.")
            continue

        print(f"Statistics for {category}:")
        print(f"Minimum facts: {min(fact_counts)}")
        print(f"Maximum facts: {max(fact_counts)}")
        print(f"Average facts: {sum(fact_counts) / len(fact_counts)}")

        plt.figure(figsize=(10, 4))
        plt.hist(fact_counts, bins=30, alpha=0.7, label=f'{category} Facts')
        plt.title(f'Histogram of Fact Counts for {category}')
        plt.xlabel('Number of Facts')
        plt.ylabel('Frequency')
        plt.show()

        plt.figure(figsize=(5, 4))
        plt.boxplot(fact_counts, vert=False)
        plt.title(f'Boxplot of Fact Counts for {category}')
        plt.xlabel('Number of Facts')
        plt.show()


# Categories whose fact counts are visualized.
# NOTE(review): hard-coded and different from CATEGORY1/CATEGORY2 -- confirm
# which pair is intended. Author's remark: the Rabbi graphs look awful, cause
# not yet investigated.
categories = ['Rabbi', 'Drag_queen']


visualize_data(categories)


In [None]:
#To linearize the RDFs 

import json
import glob

def load_and_linearize(category):
    """
    Turn every stored triple of a category into one "subject predicate object"
    string.
    Parameters:
    - category = substring that must appear in the file name; files are
      matched with ./data/*<category>*.json
    Output:
    - list of strings, one per triple, each part shortened by extract_and_clean
    """
    linearized = []
    for json_path in glob.glob(f'./data/*{category}*.json'):
        with open(json_path, 'r') as handle:
            triples = json.load(handle)
        for triple in triples:
            parts = [
                extract_and_clean(triple[key])
                for key in ('subject', 'predicate', 'object')
            ]
            linearized.append(' '.join(parts))
    return linearized

def extract_and_clean(uri):
    """
    Reduce a URI-like string to a readable local name.

    Keeps what follows the last '/' and the last '#', then replaces
    underscores and hyphens with spaces.
    Parameters:
    - uri = string (URI or literal value)
    Output:
    - shortened string
    """
    local_name = uri.rsplit('/', 1)[-1].rsplit('#', 1)[-1]
    return local_name.translate(str.maketrans('_-', '  '))


# Linearize the stored triples of both categories, keyed by category name.
categories = ['Rabbi', 'Drag_queen']
linearized_data = {}
for category in categories:
    linearized_data[category] = load_and_linearize(category)

linearized_rabbi = linearized_data['Rabbi']
linearized_drag_queen = linearized_data['Drag_queen']

# Clean each list: drop repeated strings first, then strip every '>' character
# (left over from the angle-bracketed subject URIs).
linearized_rabbi_rdf = [rabbi.replace('>', '') for rabbi in set(linearized_rabbi)]
linearized_drag_queen_rdf = [dg.replace('>', '') for dg in set(linearized_drag_queen)]


In [None]:
# Train KMeans clustering (n_clusters=2). Cluster based on text (?) and, separately, on facts


In [None]:
# Compute supervised and unsupervised metrics and visualizations to compare results

In [None]:
# Function that process a document and return the set of named entities(NEs) (Spacy)

In [None]:
# Same as above but with Stanza

In [None]:
# Get statistics for each output (avg/min/max number of NEs)

In [None]:
# Get statistics for each output (avg/min/max number of words (unclear use of word 'word'))

In [None]:
# Visualize the above 2 statistics (per category, per package)

In [None]:
# Write a function that takes a single document and performs the following 5 tasks

In [None]:
    # Returns # of spans that both packages agree on

In [None]:
    # Returns # of partial agreements on spans

In [None]:
    # Returns for each package # of spans that one package predicted as a NE and the other did not

In [None]:
    # Returns for all spans with full and partial agreement, agreement about NE types (aka Person, Location, Organisation, etc.)

In [None]:
    # Visualize the above statistics

In [None]:
# For each set of NEs predicted by each package, how many of them can be found in the knowledge graph (KG)

In [None]:
# For each package, the ratio of predicted NEs that can be found in the KG of a person

In [None]:
# Bonus points