In [None]:
from google.colab import drive, userdata
import os
# Mount Google Drive into the Colab runtime so the notebook can read and write files stored there.
drive.mount('/content/drive')
# Root of the mounted Drive, and the 'data' sub-folder that holds this notebook's
# input CSVs and receives the generated output CSV.
drive_dir = '/content/drive/My Drive/'
data_dir = os.path.join(drive_dir, 'data')

Mounted at /content/drive


# Politicians' universities dataset

## Queries:
For the university list:
```
SELECT ?university ?universityLabel ?country ?countryLabel ?location ?locationLabel WHERE {
  {
    SELECT DISTINCT ?university ?country ?location WHERE {
      ?university wdt:P31/wdt:P279* wd:Q3918;
                  wdt:P17 ?country;
                  wdt:P131 ?location.
    #FILTER EXISTS { ?country wdt:P361* wd:Q46. }  # Only European countries
    }
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
```

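For the politicians list (NOTE: the loading code below drops `country`/`countryLabel` and reads an `educated` column from the politicians CSV, none of which this query selects as written — TODO confirm the exact query used for the export):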
```
SELECT ?name ?nameLabel ?university ?universityLabel
WHERE {
  {
  SELECT ?name ?university WHERE {
    ?name wdt:P106 wd:Q82955;
          wdt:P69 ?university.
    FILTER EXISTS { ?university wdt:P31 wd:Q3918. }    # Keep only institutions that are a direct instance of university (Q3918).
  }
  LIMIT 25000
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
```

In [None]:
import pandas as pd

def extract_entity_id_(x):
  """
    Reduce a Wikidata entity URL to its bare entity id.

    Since the values are usually given as a URL e.g. https://www.wikidata.org/wiki/Q756617,
    we remove the URL part and only keep "Q756617".

    Anything that is not a string (NaN for missing values, numbers, None) is
    returned unchanged, and so are strings that do not start with "http".
  """
  if not isinstance(x, str):  # NaN (a float) or any other non-text value
    return x
  if x.startswith('http'):
    # Ignore a trailing slash so ".../Q756617/" still yields "Q756617" and not "".
    return x.rstrip('/').split('/')[-1]
  return x

def extract_entity_ids(df):
  """
    Build a new DataFrame with the same columns as `df`, in the same order.

    Columns whose name contains "Label" are carried over untouched; every
    other column has each of its values passed through `extract_entity_id_`.
  """
  converted = pd.DataFrame()
  for column in df.keys():
    values = df[column]
    is_label_column = "Label" in column
    converted[column] = values if is_label_column else values.map(extract_entity_id_)
  return converted


# Politicians export: remove the country columns, collapse the duplicate rows
# that removal leaves behind, then turn entity URLs into bare ids.
politicians = extract_entity_ids(
  pd.read_csv(os.path.join(data_dir, 'PoliticiansUniversity.csv'))
    .drop(columns=['country', 'countryLabel'])
    .drop_duplicates()
)
# Universities export: only the URL -> id conversion is applied.
universities = extract_entity_ids(
  pd.read_csv(os.path.join(data_dir, 'Universities global.csv'))
)

In [None]:
import numpy as np

# Maximum number of "false" (not attended) universities added per politician.
n_added_alternatives = 5
# Fixed seed so the randomly drawn false universities are identical on every run.
random_seed = 0
rng = np.random.default_rng(random_seed)

# Rows without a university id cannot be looked up again, so keep them out of the lookups.
universities_with_id = universities.dropna(subset=['university'])
covered_universities = set(universities_with_id['university'])

# First row per university, indexed by id. This is the same row that filtering the
# full frame by id and taking .iloc[0] would return, without rescanning the frame.
university_info = universities_with_id.drop_duplicates(subset='university').set_index('university')

# country id -> ids of all universities that have a row in that country.
universities_by_country = {
  country: set(group)
  for country, group in universities_with_id.groupby('country')['university']
}

out_list = []

# groupby(sort=False) visits politicians in order of first appearance (like .unique())
# and leaves out rows whose politician id is missing.
for pol, pol_df in politicians.groupby('name', sort=False):
  name_str = pol_df['nameLabel'].iloc[0]

  # Filter out all educational institutions not in the university database.
  # Sorted so the output row order does not depend on set iteration order.
  education = sorted(set(pol_df['educated']).intersection(covered_universities))

  if len(education) == 0:
    # If they did not go to (a covered) university, skip
    continue

  # Find alternative universities in the countries they studied in.
  alternative_universities = set()
  for e in education:
    country = university_info.at[e, 'country']
    alternative_universities |= universities_by_country.get(country, set())
  # Never offer a university the politician really attended as a false one.
  # Sorted so that, together with the seed, the random draw is reproducible.
  alternative_universities = sorted(alternative_universities.difference(education))

  n_alternatives = len(alternative_universities)
  if n_alternatives == 0:
    # If we cannot add alternative universities, skip
    continue

  # Add real universities to df (isTrue = 1)
  for e in education:
    university_name_str = university_info.at[e, 'universityLabel']
    country_name_str = university_info.at[e, 'countryLabel']
    out_list.append([pol, name_str, e, university_name_str, country_name_str, 1])

  # Add random fake universities to df (isTrue = 0), at most n_added_alternatives
  # of them and without drawing the same university twice.
  s = min(n_added_alternatives, n_alternatives)
  for i in rng.choice(n_alternatives, size=s, replace=False):
    e = alternative_universities[i]
    university_name_str = university_info.at[e, 'universityLabel']
    country_name_str = university_info.at[e, 'countryLabel']
    out_list.append([pol, name_str, e, university_name_str, country_name_str, 0])

# One row per (politician, university) pair; isTrue marks whether the pair is real.
out_df = pd.DataFrame(out_list, columns=['name','nameLabel','education','educationLabel', 'countryLabel', 'isTrue'])

In [None]:
def clean_labels(df):
  """
    Drop rows that have no usable text in one of their label columns.

    A column counts as a label column when its name contains "Label". A row is
    removed when any label column is missing (NaN/None) or consists only of an
    entity id ("Q" followed by one or more digits) instead of a real label.

    Returns a new, independent DataFrame (original index values are kept), so
    adding columns to the result neither warns nor touches `df`. If `df` has no
    label columns, all rows are returned.
  """
  keep = pd.Series(True, index=df.index, dtype=bool)
  for key in df.keys():
    if isinstance(key, str) and 'Label' in key:
      labels = df[key]
      # astype(str) keeps the .str accessor usable for non-text columns; true
      # missing values are caught by isna() before they would become "nan".
      unresolved = labels.isna() | labels.astype(str).str.fullmatch('Q[0-9]+')
      keep &= ~unresolved
  return df[keep].copy()

# Remove rows where any *Label column holds only a Q-number instead of a name (see clean_labels).
out_df = clean_labels(out_df)

In [None]:
# Phrase every (politician, university) pair as a yes/no question and as a
# declarative statement. The text is built column-wise instead of with a row-wise
# apply: that is faster and also works when out_df has no rows. astype(str)
# renders each value the same way the f-string formatting did.
person = out_df['nameLabel'].astype(str)
institution = out_df['educationLabel'].astype(str)
# assign() returns a new frame, so nothing is written into a filtered slice.
out_df = out_df.assign(
  Questions="Did " + person + " attend " + institution + "?",
  Statements=person + " attended " + institution + ".",
)

In [None]:
# Save the finished dataset into the same data folder the input CSVs were read from.
# index=False is not passed, so the DataFrame index is written as the first (unnamed) column.
out_df.to_csv(os.path.join(data_dir, 'politicians_edge_substitutions.csv'))
# Bare last expression: show the frame as this cell's output.
out_df

Unnamed: 0,name,nameLabel,education,educationLabel,countryLabel,isTrue,Questions,Statements
0,Q76,Barack Obama,Q13371,Harvard University,United States of America,1,Did Barack Obama attend Harvard University?,Barack Obama attended Harvard University.
1,Q76,Barack Obama,Q1346110,Occidental College,United States of America,1,Did Barack Obama attend Occidental College?,Barack Obama attended Occidental College.
2,Q76,Barack Obama,Q49088,Columbia University,United States of America,1,Did Barack Obama attend Columbia University?,Barack Obama attended Columbia University.
3,Q76,Barack Obama,Q1107305,Coker University,United States of America,0,Did Barack Obama attend Coker University?,Barack Obama attended Coker University.
4,Q76,Barack Obama,Q1814758,Virginia Union University,United States of America,0,Did Barack Obama attend Virginia Union Univers...,Barack Obama attended Virginia Union University.
...,...,...,...,...,...,...,...,...
42339,Q2262730,Sebastian Gemkow,Q122453,University of Stuttgart,Germany,0,Did Sebastian Gemkow attend University of Stut...,Sebastian Gemkow attended University of Stuttg...
42340,Q2262730,Sebastian Gemkow,Q896546,University of Applied Sciences Trier,Germany,0,Did Sebastian Gemkow attend University of Appl...,Sebastian Gemkow attended University of Applie...
42341,Q2262730,Sebastian Gemkow,Q1011953,Burg Giebichenstein University of Art and Desi...,Germany,0,Did Sebastian Gemkow attend Burg Giebichenstei...,Sebastian Gemkow attended Burg Giebichenstein ...
42342,Q2262730,Sebastian Gemkow,Q1622214,University of Applied Sciences for Police Bade...,Germany,0,Did Sebastian Gemkow attend University of Appl...,Sebastian Gemkow attended University of Applie...


# Cities dataset
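No new dataset is built here; reuse the cities dataset from the Tegmark paper (TODO: add the citation or a link to the exact paper and file).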