This notebook uses information from https://scholia.toolforge.org/work/Q108799588 and the JSON mapping file from https://github.com/DominikFilipiak/imagenet-to-wikidata-mapping

# Load mappings

In [None]:
import json
import os

# Directory holding the cloned imagenet-to-wikidata-mapping repository.
# The IMAGENET_WIKIDATA_MAPPING_DIR environment variable overrides the default,
# which is the original author's local checkout, so the notebook can also be
# run on another machine without editing this cell.
directory = os.environ.get(
    'IMAGENET_WIKIDATA_MAPPING_DIR',
    'C:\\Users\\mikke\\OneDrive\\Dokumenter\\DTU documents\\7. semester\\Bachelor projekt\\imagenet-to-wikidata-mapping-main\\imagenet-to-wikidata-mapping-main',
)
fname = os.path.join(directory, 'mapping.json')

# Read the file as UTF-8 explicitly instead of relying on the platform's
# default text encoding.
with open(fname, 'r', encoding='utf-8') as f_obj:
    raw_mappings = json.load(f_obj)

# Keep only the last '/'-separated segment of every value (the identifier that
# is later passed to the Wikidata client). A new dict is built so the raw
# mapping stays available and re-running the cell is side-effect free.
mappings = {k: v.split('/')[-1] for k, v in raw_mappings.items()}

# Get Wikipedia description

In [None]:
from wikidata.client import Client

# Create the Wikidata client used throughout the notebook and fetch the entity
# behind the first mapping entry as a quick sanity check.
client = Client()
first_wikidata_id = list(mappings.values())[0]
entity = client.get(first_wikidata_id, load=True)

In [None]:
# List the top-level fields available in the fetched entity record.
print(entity.data.keys())
# Cell output: the entity's English label.
entity.data['labels']['en']['value']

In [None]:
# Cell output: URL of the English Wikipedia page linked from the entity.
entity.data['sitelinks']['enwiki']['url']

## Using a better Wikipedia API wrapper
https://github.com/martin-majlis/Wikipedia-API

In [None]:
import wikipediaapi as wiki
import sys

# NOTE(review): newer Wikipedia-API releases reportedly expect a user agent in
# the constructor - confirm against the installed version.
wiki_en = wiki.Wikipedia('en')

articles = {}   # mapping key -> Wikipedia page object, for every resolved class
failures = []   # mapping keys for which no existing Wikipedia page was found

# Cache in case we would like to modify the policy and not have to make new queries.
# Caches left behind by an earlier run of this cell are reused; creating fresh
# dicts on every run would throw the cached responses away and defeat that
# purpose. After a kernel restart both caches start out empty.
cache_wikidata = globals().get('cache_wikidata', {})
cache_wikipedia = globals().get('cache_wikipedia', {})

use_cache = True

# Progress is reported relative to the actual number of mappings rather than a
# hard-coded class count.
total = len(mappings)
for i, (k, v) in enumerate(mappings.items(), start=1):
    # Write first, then flush, so the current percentage is shown immediately.
    sys.stdout.write('\rProgress: {:.1f}%'.format(i / total * 100))
    sys.stdout.flush()

    if use_cache and v in cache_wikidata:
        entity = cache_wikidata[v]
    else:
        entity = client.get(v, load=True)
        cache_wikidata[v] = entity

    # First look up Wikipedia page in Wikidata. Else search Wikipedia for its title. Else count as a failure
    if 'enwiki' in entity.data['sitelinks']:
        title = entity.data['sitelinks']['enwiki']['title']
    elif 'en' in entity.data['labels']:
        title = entity.data['labels']['en']['value']
    else:
        failures.append(k)
        continue

    if use_cache and title in cache_wikipedia:
        article = cache_wikipedia[title]
    else:
        article = wiki_en.page(title)
        cache_wikipedia[title] = article

    # A page object is returned even for unknown titles, so existence is
    # checked explicitly before the article is accepted.
    if article.exists():
        articles[k] = article
    else:
        failures.append(k)

print('\n')
print("Number of failures: {}".format(len(failures)))
print("Failures:")
print(failures)

### A closer look at the failures

In [None]:
# Print the Wikidata entity behind every class that could not be resolved to a
# Wikipedia article.
for failed_key in failures:
    print(client.get(mappings[failed_key], load=True))

In [None]:
# English Wikidata label of the first failed class. A class can end up in
# `failures` precisely because its entity has no English label, and the list
# may be empty, so neither is assumed here; None is shown in those cases.
first_failure_label = None
if failures:
    first_failure_labels = client.get(mappings[failures[0]], load=True).data['labels']
    first_failure_label = first_failure_labels.get('en', {}).get('value')
first_failure_label

In [None]:
import nltk
from nltk.corpus import wordnet as wn

nltk.download('wordnet')

# Reverse lookup: value of `mappings` -> its key. If several keys share a
# value, the last one wins.
mappings_reversed = {v:k for k, v in mappings.items()}


def get_synset_from_id(synset_id):
    """Return the WordNet synset for an id such as 'n04152593'.

    The first character is passed on as the part-of-speech tag and the
    remaining characters are parsed as the integer offset.
    """
    return wn.synset_from_pos_and_offset(synset_id[0], int(synset_id[1:]))


def get_title_from_synset(synset):
    """Return a human-readable title derived from the synset's name.

    The trailing '.<pos>.<number>' suffix is removed by splitting from the
    right, so a lemma that itself contains a period is kept intact, and
    underscores are replaced by spaces.
    """
    return synset.name().rsplit('.', 2)[0].replace('_', ' ')


# Show the WordNet title of every class that could not be resolved.
for failure in failures:
    print('WordNet id: {} - Title: {}'.format(failure, get_title_from_synset(get_synset_from_id(failure))))

Interestingly enough, some of the titles are not the same! I wonder what implications this might have.

#### Fixing the failures
- Racer could perhaps be substituted by Sports car?
- Screen refers to 'the display that is electronically created on the surface of the large end of a cathode-ray tube'
- etc.
- Vestment exists on Wikipedia. I guess some information was lost.

Perhaps I should just leave those alone?

In [None]:
# Cell output: the WordNet definition (gloss) of synset n04152593.
get_synset_from_id('n04152593').definition()

#### Investigating the difference in the titles

In [None]:
# Report every class whose WordNet title appears neither in the title nor in
# the summary of the Wikipedia article it was linked to (case-insensitive).
for wnid, article in articles.items():
    synset = get_synset_from_id(wnid)
    wordnet_title = get_title_from_synset(synset).lower()
    article_title = article.title.lower()

    # The title is checked first; the summary is only consulted when the
    # title does not already contain the WordNet title.
    if wordnet_title in article_title:
        continue
    if wordnet_title in article.summary.lower():
        continue

    print("WordNet id: {} - WordNet title: {} - Wikipedia title: {}".format(wnid, wordnet_title, article_title))

Number of discrepancies: 125

Most of them seem to be alright. However, 'green mamba', for example, has been linked simply to 'mamba', although there are multiple pages on Wikipedia about the different green mamba species.

## Getting the descriptions

In [None]:
articles['n04548280'].summary  # Should have been a wall clock, but the linked article is about clocks in general

In [None]:
# Naive sentence split of the same summary: cut at every period.
articles['n04548280'].summary.split('.')

In [None]:
def prepare_sentences(desc):
    """Split a description into a list of cleaned sentences.

    The text is cut at every period. Newlines inside a fragment are turned
    into single spaces (so words on adjacent lines are not glued together)
    and surrounding whitespace is removed. Fragments that are empty after
    this clean-up are dropped, so a trailing newline or blank line never
    produces an empty sentence.

    Args:
        desc: The description text, e.g. a Wikipedia summary.

    Returns:
        A list of non-empty sentence strings without the terminating period.
    """
    sentences = []
    for fragment in desc.split('.'):
        # Clean first, filter afterwards: checking for emptiness before
        # stripping would let whitespace-only fragments through.
        sentence = fragment.replace('\n', ' ').strip()
        if sentence:
            sentences.append(sentence)
    return sentences
        
# Try the sentence splitter on the summary of one example class.
prepare_sentences(articles['n03297495'].summary)

Idea: Perhaps we should just mask out the actual word in the validation set?

In [None]:
# Collect the summary text of every resolved article, keyed like `articles`.
summaries = {}
length = len(articles)
for i, (k, v) in enumerate(articles.items(), start=1):
    # `i` already counts from 1, so i / length reaches exactly 100% on the
    # last article. Write first, then flush, so the update shows immediately.
    sys.stdout.write('\rProgress: {:.1f}%'.format(i / length * 100))
    sys.stdout.flush()
    summaries[k] = v.summary

summaries

In [None]:
# Persist the collected summaries as JSON in the current working directory.
with open('wiki_descriptions.json', 'w') as out_file:
    out_file.write(json.dumps(summaries))

In [None]:
import os
# Cell output: the working directory, i.e. where 'wiki_descriptions.json' is written.
os.getcwd()

In [None]:
# Read the saved summaries back from disk.
with open('wiki_descriptions.json', 'r') as in_file:
    desc_loaded = json.loads(in_file.read())


In [None]:
# Word count of every sentence across all loaded descriptions.
lengths = []

for label, definition in desc_loaded.items():
    # Newlines are treated as spaces and the text is cut at '. '.
    for s in definition.replace('\n', ' ').split('. '):
        length = len(s.split())
        # An empty description or trailing whitespace produces fragments
        # without any words; these are not sentences and would otherwise
        # drag the minimum and the average down.
        if length == 0:
            continue
        lengths.append(length)

no_sentences = len(lengths)  # Number of sentences, not number of descriptions
length_sum = sum(lengths)
max_length = max(lengths, default=0)
min_length = min(lengths, default=0)

if no_sentences:
    print("Average length of hints: {}".format(length_sum/no_sentences))
    print("Max length: {}\t Min length: {}".format(max_length, min_length))
else:
    # Avoid a ZeroDivisionError when nothing was loaded.
    print("No sentences found in the loaded descriptions.")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Histogram of sentence lengths with bins that are two words wide.
bin_width = 2
# The last bin edge must be at least max_length. Stopping the range at
# max_length + 1 ends one word short whenever max_length is odd, which
# silently leaves the longest sentences out of the histogram.
bin_edges = np.arange(0, max_length + bin_width, bin_width)

fig, ax = plt.subplots(figsize=(9, 5))
ax.hist(lengths, bins=bin_edges, edgecolor='black')
ax.set_title("Distribution of the lengths of sentences from the Wikipedia descriptions")
ax.set_xlabel("Sentence length (no. words)")
ax.set_ylabel("No. occurrences")
# Show the full width of the last bin.
ax.set_xlim([0, bin_edges[-1]])
fig.savefig("wikipediadistr.svg")

In [None]:
import os

# Only the last expression of a cell is displayed, so the sentence count is
# printed explicitly instead of being silently discarded.
print(len(lengths))

# Cell output: the working directory, i.e. where the figure was saved.
os.getcwd()