In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
import wget
import argparse
import os
import re
import wikipediaapi as wikiapi
import pickle
import nltk
import string

# Data selection and creation
To successfully add natural language knowledge to the program, we have to first focus on where and how we get the information. Here, this is done by checking every single subject and replacing the link with a wikipedia link, so the WikipediaAPI can extract the summary or the content from the wikipedia page. There are two sections in this notebook. The first one focusses on just extracting the summary/the first paragraph/the abstract of the subjects wikipedia page. The second one extracts everything EXCEPT the abstract. At the end of each data cleaning step a pickle file is created to store the data along with the subject and class and i saved in the "dataset" folder of the specific dataset.

In [2]:
parser = argparse.ArgumentParser(
    description='Ridle, learning a representation for entities using a target distributions over the usage of relations.',
)
#parser.add_argument('--dataset', nargs='?', default='DBp_2016-04', type=str)
parser.add_argument('--dataset', nargs='?', default='Universities_Dbpedia', type=str)
parser, unknown = parser.parse_known_args()

# https://www.dropbox.com/sh/szvuv79ubfqgmn5/AACHxl_eC0frcGrZpVy0VDQPa?dl=0


links = {}
links['dblp'] = 'https://www.dropbox.com/s/78srst5bjt2tta1/dataset.pkl?dl=1'
links['dbp_type_mapping'] = 'https://www.dropbox.com/s/2ec6dyr90pmjfm9/dbp_type_mapping.json?dl=1'
links['umls'] = 'https://www.dropbox.com/s/madbrirjc3yjtru/dataset.pkl?dl=1'
links['Person_DBpedia'] = 'https://www.dropbox.com/s/1omj2btnoj8g4xa/dataset.pkl?dl=1'
links['DBp_2016-04'] = 'https://www.dropbox.com/s/z38exis1ah3q5ze/dataset.pkl?dl=1'
links['Company_DBpedia'] = 'https://www.dropbox.com/s/bft3hmk2m6ecrkl/dataset.pkl?dl=1'
links['Songs_DBpedia'] = 'https://www.dropbox.com/s/u9k6qaydqowckae/dataset.pkl?dl=1'
links['Books_DBpedia'] = 'https://www.dropbox.com/s/wdqhov2g4bvwzr9/dataset.pkl?dl=1'
links['ChemicalCompounds_DBpedia'] = 'https://www.dropbox.com/s/fyyqgtwwf2pnj3b/dataset.pkl?dl=1'
links['Universities_DBpedia'] = 'https://www.dropbox.com/s/0g2moh3puz09uoy/dataset.pkl?dl=1'


if not os.path.isfile('./dataset/dbp_type_mapping.json'):
    print("Downloading dbp_type_mapping data.")
    data_url = links['dbp_type_mapping']
    wget.download(data_url, './dataset/dbp_type_mapping.json')


if not os.path.isfile('./dataset/{}/dataset.pkl'.format(parser.dataset)):
    print("Downloading {} data.".format(parser.dataset))
    data_url = links[parser.dataset]
    Path('./dataset/{}'.format(parser.dataset)).mkdir(parents=True, exist_ok=True)
    wget.download(data_url, './dataset/{}/dataset.pkl'.format(parser.dataset))



print('Loading files on', parser.dataset)
# Loading Files
df = pd.read_pickle('./dataset/{}/dataset.pkl'.format(parser.dataset))[['S', 'P']].drop_duplicates()

s = df["S"]

#create list from pandas series object
link_list = s.tolist()

#remove the duplicates from list
link_list_cleaned = list(dict.fromkeys(link_list))

#replace dbpedia with wikipedia
wiki_list = []
for link in link_list_cleaned:
    temp = re.sub("http://dbpedia.org/resource/", 'https://en.wikipedia.org/wiki/', link)
    wiki_list.append(temp)

#create list with resource for wikipedia api
resource_list = []
for res in link_list_cleaned:
    temp = re.sub("http://dbpedia.org/resource/", '', res)
    resource_list.append(temp)

print("Create mapping...")
if 'dbp' in parser.dataset.lower():
    mapping = pd.read_json('./dataset/dbp_type_mapping.json')
elif 'wd' in parser.dataset.lower() or 'wikidata' in parser.dataset.lower():
    mapping = pd.read_json('./dataset/wd_mapping_type.json')
else:
    mapping = pd.read_json('./dataset/{}/type_mapping.json'.format(parser.dataset))
    
#command for wikipedia article extraction
wiki_wiki = wikiapi.Wikipedia(language='en', extract_format=wikiapi.ExtractFormat.WIKI)

Loading files on Universities_Dbpedia
Create mapping...


### Clean the data


In [3]:
nltk.download('words')
words = set(nltk.corpus.words.words())

def clean_text(text):
    '''
    This function removes punctuation, words containing numbers
    as well as making the whole text lower-case
    '''
    text = text.lower()
    #text = text.encode("ascii", errors="ignore").decode()
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\n', '', text)
    text = re.sub("references", "", text)
    #text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())
    return text

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Dorian\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


### Create summary dataset

In [4]:
#create dictionary with keys: Objects and values: summary of wikipedia page
#[:1000] after resource_list & link_list_cleaned to access a portion of the dataset. Change number dependent on the size you wish
my_file = {}
for items in resource_list:
    my_file[items] = clean_text(wiki_wiki.page(items).summary)
pd.set_option('max_colwidth',200)

data_clean = pd.DataFrame.from_dict(my_file, orient='index')
data_clean.columns = ['Summary']
data_clean['S'] = data_clean.index
#data_clean = data_clean.sort_index()
data_clean['S'] = link_list_cleaned
data_clean

Unnamed: 0,Summary,S
"1_Decembrie_1918_University,_Alba_Iulia",decembrie university alba iulia is a public higher education and research institution founded in in alba iulia romania it is a state institution integrated into the national higher educat...,"http://dbpedia.org/resource/1_Decembrie_1918_University,_Alba_Iulia"
42_(school),is a private nonprofit and tuition free computer programming school created and funded by french billionaire xavier niel founder of the telecommunication company iliad with several partners in...,http://dbpedia.org/resource/42_(school)
A.T._Still_University,a t still university atsu is a private medical school based in kirksville missouri with a second campus in arizona it was founded in by dr andrew taylor still and was the world s first ost...,http://dbpedia.org/resource/A.T._Still_University
A.T._Still_University_School_of_Osteopathic_Medicine_in_Arizona,a t still university school of osteopathic medicine in arizona atsu soma is a private medical school in mesa arizona it was established in and is on the arizona campus of a t still univer...,http://dbpedia.org/resource/A.T._Still_University_School_of_Osteopathic_Medicine_in_Arizona
A.V.V.M_Sri_Pushpam_College,a veeriya vandayar memorial sri pushpam college is an arts and sciences autonomous college in thanjavur district tamil nadu india established in it offers higher education in arts science a...,http://dbpedia.org/resource/A.V.V.M_Sri_Pushpam_College
...,...,...
West_University_of_Timi%C8%99oara,,http://dbpedia.org/resource/West_University_of_Timi%C8%99oara
Wroc%C5%82aw_Medical_University,,http://dbpedia.org/resource/Wroc%C5%82aw_Medical_University
Xavier_University_%E2%80%93_Ateneo_de_Cagayan,,http://dbpedia.org/resource/Xavier_University_%E2%80%93_Ateneo_de_Cagayan
Y%C4%B1ld%C4%B1r%C4%B1m_Beyaz%C4%B1t_University,,http://dbpedia.org/resource/Y%C4%B1ld%C4%B1r%C4%B1m_Beyaz%C4%B1t_University


In [5]:
data_clean['S'] = link_list_cleaned

In [6]:
print('Creating input file...')
print('Merging summary input data...')
r = pd.merge(data_clean, mapping, on='S')
cols = r.columns.tolist()
cols = cols[-1:] + cols[:-1]

input_data = r[['S', 'Summary', 'Class']]
input_data.to_pickle('./dataset/{}/input_data_unclean.pkl'.format(parser.dataset))
print('Input file created successfully')

Creating input file...
Merging summary input data...
Input file created successfully


### Create content data

In [16]:
#create dictionary with keys: Objects and values: summary of wikipedia page
my_corpus = {}
i = 0
for items in resource_list:
    temp = wiki_wiki.page(items).text
    temp = temp.split("\n\n", 1)
    temp = temp[1:]
    temp = ' '.join([str(elem) for elem in temp])
    temp = clean_text(temp)
    my_corpus[items] = temp
    i += 1

corpus_new = pd.DataFrame.from_dict(my_corpus, orient='index')
corpus_new.columns = ['Contents']
corpus_new = corpus_new.sort_index()
corpus_new = corpus_new.sort_index()
corpus_new['S'] = link_list_cleaned

# merge them
print('Merging contents input data...')
r = pd.merge(data_clean, mapping, on='S')

Merging contents input data...


Unnamed: 0,Contents,S
"%C2%A1Ay,_qu%C3%A9_deseo!",,http://dbpedia.org/resource/'97_Bonnie_&_Clyde
"%C2%A1Qu%C3%A9_bueno,_qu%C3%A9_bueno!",,http://dbpedia.org/resource/'Cuz_I_Can_(Pink_song)
%C2%BD_Full,,http://dbpedia.org/resource/'Round_Midnight_(song)
%C2%BFQu%C3%A9_voy_a_hacer_sin_ti%3F,,http://dbpedia.org/resource/'S_Wonderful
%C2%BFQui%C3%A9n_maneja_mi_barca%3F,,"http://dbpedia.org/resource/'The_Half_of_It,_Dearie'_Blues"
...,...,...
Zooropa_(song),"background and recordingduring the zoo tv tour in , were trying to create a vision of an attractive future for europe, as opposed to a negative, dystopian image that would be found in science fic...",http://dbpedia.org/resource/You_Are_the_Only_One_(Ivan_Mikuli%C4%87_song)
Zor_and_Zam,"personnelmicky dolenz - lead vocal, percussionkeith allison and bill chadwick - electric guitarschip douglas, richard dey and max bennett - bassmichael melvoin - pianoeddie hoh - drumshal blaine, ...",http://dbpedia.org/resource/Za_na%C5%A1u_ljubav
Zorbas,== references ==,http://dbpedia.org/resource/Zem_menom_l%C3%A1ska
Zusammen_geh'n,charts== references ==,http://dbpedia.org/resource/Zemr%C3%ABn_e_lam%C3%AB_peng


In [17]:
print('Creating input file...')
r = pd.merge(corpus_new, mapping, on='S')
cols = r.columns.tolist()
cols = cols[-1:] + cols[:-1]

input_data = r[['S', 'Contents', 'Class']]
input_data.to_pickle('./dataset/{}/input_data_contents_full.pkl'.format(parser.dataset))
print('Input file created successfully')

Creating input file...
Input file created successfully


## Document-Term Matrix for further research

In [5]:
data = pd.read_pickle('dataset/DBp_2016-04/input_data_summary_full.pkl')
data_clean = data['Summary'][:10000]

In [6]:
#using CountVectorizer to remove stopwords and tokenize the text to build the document-term matrix 
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm




Unnamed: 0,aa,aaa,aaadonta,aabsal,aacm,aacsb,aaden,aadigere,aadmabaad,aaf,...,zwingen,zx,zygaenidae,zygophyllum,zygoptera,zyl,zymology,zyrat,zz,zzhxin
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
#pickle the document term matrix for later use
data_dtm.to_pickle('./dataset/DBp_2016-04/DTM.pkl')
#Also pickle the cleaned data and the CountVectorizer object
pickle.dump(cv, open("./dataset/DBp_2016-04/cv.pkl", "wb"))


In [11]:
data = pd.read_pickle('dataset/Books_Dbpedia/DTM.pkl')
data

Unnamed: 0,aa,aaas,aadujeevitham,aai,aair,aalahayude,aalohari,aamukham,aanslag,aansprekers,...,zwecker,zwei,zweig,zwick,zwischen,zwlf,zygmunt,zyrgon,zz,zzh
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11177,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
