# Mix'n'match Mismatch Generation

This notebook is used to genrate mismatches for [Mismatch Finder](https://www.wikidata.org/wiki/Wikidata:Mismatch_Finder) via a request to [Mix'n'match](https://meta.wikimedia.org/wiki/Mix%27n%27match) data stores. Data will be formatted for upload given the [directions for creating a mismatch file](https://github.com/wmde/wikidata-mismatch-finder/blob/main/docs/UserGuide.md#creating-a-mismatches-import-file).

In [1]:
#!pip install jupyter-black
#!pip install tensorflow
#!pip install aiohttp

In [2]:
# %load_ext jupyter_black

In [3]:
import ast
import json
import sys
import urllib

import numpy as np
import pandas as pd

PATH_TO_UTILS = "../"  # change based on your directory structure
sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting

## Get Data

In [4]:
mnm_mismatch_request_url = (
    "https://mix-n-match.toolforge.org/api.php?query=all_issues&mode=time_mismatch"
)

In [5]:
with urllib.request.urlopen(mnm_mismatch_request_url) as url:
    mnm_mismatch_data = json.load(url)

In [6]:
print(f"{len(mnm_mismatch_data['data']):,}")

82,996


In [7]:
mnm_mismatch_data["data"][:2]

[{'issue_id': '85584',
  'entry_id': '44032422',
  'time_mismatch': {'prop': 'P569',
   'wd_time': '+1925-01-01T00:00:00Z',
   'mnm_time': '+1926-07-04T00:00:00Z',
   'q': 'Q329124'}},
 {'issue_id': '564195',
  'entry_id': '115714460',
  'time_mismatch': {'prop': 'P569',
   'wd_time': '+1998-09-19T00:00:00Z',
   'mnm_time': '+1987-04-17T00:00:00Z',
   'q': 'Q107654539'}}]

In [8]:
mnm_mismatch_data_expanded = []
for d in mnm_mismatch_data["data"]:
    d["source"] = f"https://mix-n-match.toolforge.org/#/entry/{d['entry_id']}"
    d.pop("issue_id", None)
    d["time_mismatch"]["pid"] = d["time_mismatch"].pop("prop")
    d["time_mismatch"]["qid"] = d["time_mismatch"].pop("q")
    d["item_id"] = d["time_mismatch"]["qid"]

    mnm_mismatch_data_expanded.append(d)

In [9]:
mnm_mismatch_data_expanded[:10]

[{'entry_id': '44032422',
  'time_mismatch': {'wd_time': '+1925-01-01T00:00:00Z',
   'mnm_time': '+1926-07-04T00:00:00Z',
   'pid': 'P569',
   'qid': 'Q329124'},
  'source': 'https://mix-n-match.toolforge.org/#/entry/44032422',
  'item_id': 'Q329124'},
 {'entry_id': '115714460',
  'time_mismatch': {'wd_time': '+1998-09-19T00:00:00Z',
   'mnm_time': '+1987-04-17T00:00:00Z',
   'pid': 'P569',
   'qid': 'Q107654539'},
  'source': 'https://mix-n-match.toolforge.org/#/entry/115714460',
  'item_id': 'Q107654539'},
 {'entry_id': '29738972',
  'time_mismatch': {'wd_time': '+1866-01-01T00:00:00Z',
   'mnm_time': '+1867-09-28T00:00:00Z',
   'pid': 'P569',
   'qid': 'Q4710535'},
  'source': 'https://mix-n-match.toolforge.org/#/entry/29738972',
  'item_id': 'Q4710535'},
 {'entry_id': '82883700',
  'time_mismatch': {'wd_time': '+1932-09-12T00:00:00Z',
   'mnm_time': '+1923-09-12T00:00:00Z',
   'pid': 'P569',
   'qid': 'Q9355023'},
  'source': 'https://mix-n-match.toolforge.org/#/entry/82883700',
  

In [10]:
mnm_mismatch_data_expanded = list(filter(lambda d: d["time_mismatch"]["wd_time"] != d["time_mismatch"]["mnm_time"], mnm_mismatch_data_expanded))
len(mnm_mismatch_data_expanded)

82996

In [11]:
import pandas as pd
from numpy import NAN
from tqdm import tqdm

In [None]:
acc = []
for entry in tqdm(mnm_mismatch_data_expanded):
    data = entry["time_mismatch"]
    req = f'https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/{entry["item_id"]}?_fields=statements'
    try:
        with urllib.request.urlopen(req) as url:
            wd_props = json.load(url)["statements"]
    except urllib.request.HTTPError as e:
        # Fixed in newer version https://stackoverflow.com/questions/67723860/python-urllib-request-urlopen-http-error-308-permanent-redirect
        print("Skipped", req)
        print(e)
        continue
    
    if "P1220" not in wd_props:  # https://www.wikidata.org/wiki/Property:P1220
        # No mix'n'match id, meaning it is automatic, not manual match & should be skipped
        continue

    with urllib.request.urlopen(f'https://mix-n-match.toolforge.org/api.php?query=get_entry&entry={entry["entry_id"]}') as url:
        ext_url = json.load(url)["data"]["entries"][entry["entry_id"]]["ext_url"]
    
    acc.append({
        "item_id": entry["item_id"],
        "statement_guid": wd_props[data["pid"]][0]["id"],
        "property_id": data["pid"],
        "wikidata_value": data["wd_time"],
        "meta_wikidata_value": NAN,
        "external_value": data["mnm_time"],
        "external_url": ext_url,
        "type": "statement",
    })

In [12]:
import nest_asyncio
nest_asyncio.apply()

In [13]:
import asyncio
import aiohttp
import ssl

acc = []

async def fetch(session, entry):
    data = entry["time_mismatch"]
    req = f'https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/{entry["item_id"]}?_fields=statements'
    async with session.get(req, ssl=ssl.SSLContext()) as response:
        wd_props = (await response.json())["statements"]
        pbar.update(0.5)
        
        if "P1220" not in wd_props:  # https://www.wikidata.org/wiki/Property:P1220
            # No mix'n'match id, meaning it is automatic, not manual match & should be skipped
            pbar.update(0.5)
            return False

        url = f'https://mix-n-match.toolforge.org/api.php?query=get_entry&entry={entry["entry_id"]}'
        async with session.get(url, ssl=ssl.SSLContext()) as response:
            ext_url = (await response.json())["data"]["entries"][entry["entry_id"]]["ext_url"]
            pbar.update(0.5)
        
            acc.push({
                "item_id": entry["item_id"],
                "statement_guid": wd_props[data["pid"]][0]["id"],
                "property_id": data["pid"],
                "wikidata_value": data["wd_time"],
                "meta_wikidata_value": NAN,
                "external_value": data["mnm_time"],
                "external_url": ext_url,
                "type": "statement",
            })
            return True


async def fetch_all(urls, loop):
    async with aiohttp.ClientSession(loop=loop, read_timeout=None) as session:
        results = await asyncio.gather(*[fetch(session, url) for url in urls], return_exceptions=True)
        return results


if __name__ == '__main__':
    pbar = tqdm(total=len(mnm_mismatch_data_expanded))
    loop = asyncio.get_event_loop()
    status = loop.run_until_complete(fetch_all(mnm_mismatch_data_expanded, loop))
    pbar.close()


  async with session.get(req, ssl=ssl.SSLContext()) as response:
  async with session.get(req, ssl=ssl.SSLContext()) as response:
  async with session.get(url, ssl=ssl.SSLContext()) as response:
  async with session.get(url, ssl=ssl.SSLContext()) as response:
  8%|▊         | 6556.0/82996 [05:41<1:06:27, 19.17it/s]


In [25]:
nn = list(filter(lambda x: x is not None, acc))
nn[2]

ContentTypeError(RequestInfo(url=URL('https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/Q112012?_fields=statements'), method='GET', headers=<CIMultiDictProxy('Host': 'www.wikidata.org', 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'User-Agent': 'Python/3.11 aiohttp/3.8.6')>, real_url=URL('https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/Q112012?_fields=statements')), (), message='Attempt to decode JSON with unexpected mimetype: text/html; charset=utf-8', headers=<CIMultiDictProxy('Date': 'Thu, 15 Feb 2024 17:43:36 GMT', 'Server': 'Varnish', 'x-cache': 'cp1104 int', 'x-cache-status': 'int-front', 'server-timing': 'cache;desc="int-front", host;desc="cp1104"', 'strict-transport-security': 'max-age=106384710; includeSubDomains; preload', 'report-to': '{ "group": "wm_nel", "max_age": 604800, "endpoints": [{ "url": "https://intake-logging.wikimedia.org/v1/events?stream=w3c.reportingapi.network_error&schema_uri=/w3c/reportingapi/network_error/1.0.0" }] }', '

In [None]:
mismatchDF = pd.DataFrame(acc)
mismatchDF

In [None]:
check_mf_formatting(mismatchDF)
len(mismatchDF)

In [None]:
mismatchDF.to_csv("first-4327-mismatches.csv")