In [162]:
import os
import glob
import pickle
import numpy as np
import ujson as json
from tqdm.notebook import tqdm
from collections import defaultdict, Counter
from typing import Iterable, List
from gensim.test.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def process_corpus(author_fn, tokens_only=False) -> Iterable[List[str]]:
    """
    Stream the lines of every author's wiki note as Doc2Vec-ready documents.

    The JSON file at ``author_fn`` is expected to map an author name to a
    dict that has a ``'note'`` entry holding newline-separated text.

    Args:
        author_fn: path to the authors JSON file.
        tokens_only: when True, yield plain token lists; otherwise yield
            ``TaggedDocument`` objects (note that the return annotation only
            describes the ``tokens_only=True`` case).

    Yields:
        One item per line of each note, in file order.
    """
    with open(author_fn, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # A single running id across *all* authors: restarting the counter for
    # each author would make line 0 of every note share the same tag.
    doc_id = 0
    for author_data in data.values():
        wiki_note = author_data['note']
        for line in wiki_note.split("\n"):
            tokens = simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add a unique integer tag per document
                yield TaggedDocument(tokens, [doc_id])
            doc_id += 1


def iterable_corpus(train_docs: List[str], tags: List[str] = None, text_mode=True):
    """
    Tokenize each document and wrap it according to the selected mode.

    * ``text_mode=True``: yield ``TaggedDocument(tokens, [position])`` so that
      similarity is computed between texts.
    * ``text_mode=False`` with ``tags`` given: yield
      ``TaggedDocument(tokens, tags[position])`` so that similarity is computed
      against the tags.
    * ``text_mode=False`` without ``tags``: yield the bare token list.
    """
    for position, raw_text in enumerate(train_docs):
        words = simple_preprocess(raw_text)
        if text_mode:
            # text similarity: the document position is its only tag
            yield TaggedDocument(words, [position])
            continue
        if tags is None:
            yield words
            continue
        # tag similarity: reuse the tags supplied for this position
        yield TaggedDocument(words, tags[position])


# Metadata fields of a work that are read in addition to its "tags" field:
# read_artwork_desc merges their values into the tag list, and
# stream_painting_docs copies them as separate fields of the indexed document.
CODED_TAGS = ['genre', 'style', 'material']


def read_artwork_desc(src_folder: os.PathLike):
    """
    Collect every artwork that has both a description and at least one tag.

    Args:
        src_folder: despite the name, a glob pattern (it is passed straight to
            ``glob.glob``) matching the per-author JSON files. Each file is
            iterated as a sequence of work dicts.

    Returns:
        A ``defaultdict(list)`` with four parallel lists under the keys
        ``'tags'``, ``'author'``, ``'description'`` and ``'title'``.

    Side effects:
        Prints the number of gathered works and pickles the result to
        ``description.pkl`` in the current working directory.
    """
    description_corpus = defaultdict(list)
    for author in tqdm(glob.glob(src_folder)):
        # Context manager so the handle is closed for each file.
        with open(author, 'r', encoding='utf-8') as f:
            author_json = json.load(f)
        for work in author_json:
            desc = work.get("description", None)
            if not desc:
                continue

            # "tags" and the CODED_TAGS fields are all comma-separated strings;
            # normalise them the same way: lower-case, split on commas, strip
            # surrounding whitespace and drop empty fragments.
            ensemble_tags = []
            for field in ['tags'] + CODED_TAGS:
                raw = work.get(field, None)
                if raw:
                    ensemble_tags.extend(
                        part.strip() for part in raw.lower().split(',')
                        if part.strip())
            if not ensemble_tags:
                continue

            description_corpus['tags'].append(ensemble_tags)
            description_corpus['author'].append(
                work.get("artistName", "Unknown"))
            # Drop the [i]...[/i] italic markup from the description.
            description_corpus['description'].append(
                desc.replace("[/i]", "").replace("[i]", ""))
            # Same default as stream_painting_docs (was the typo "Untitles").
            description_corpus['title'].append(
                work.get("title", "Untitled"))
    print("Gathered", len(description_corpus['tags']))
    with open('description.pkl', 'wb') as f:
        pickle.dump(description_corpus, f)
    return description_corpus


# Build the description corpus (this also writes description.pkl).
# NOTE(review): the glob pattern is an absolute, machine-specific path; it has
# to be adjusted before the notebook can be run elsewhere.
d = read_artwork_desc(src_folder='/Users/jm/data/wikiart/meta/*.json')


  0%|          | 0/2042 [00:00<?, ?it/s]

Gathered 2649


In [None]:
# True  -> train documents are tagged with their position (text similarity)
# False -> train documents are tagged with their artwork tags (tag similarity)
TEXT_MODE = False

rng = np.random.default_rng(seed=42)
# NOTE: pickle can execute arbitrary code on load; only read a description.pkl
# that was produced by read_artwork_desc in this notebook.
with open('description.pkl', 'rb') as f:
    artwork_descriptions = pickle.load(f)

# Seeded 80/20 split over document indices.
indices = np.arange(0, len(artwork_descriptions['description']), 1)
ratio = 0.8
train_indices = rng.choice(indices, size=int(
    ratio*len(indices)), replace=False)
test_indices = indices[~np.isin(indices, train_indices)]

all_docs = np.asarray(artwork_descriptions['description'])
all_tags = np.asarray(artwork_descriptions['tags'], dtype=object)
print(f"All docs: {all_docs.shape}")

train_docs = list(iterable_corpus(
    all_docs[train_indices], tags=all_tags[train_indices], text_mode=TEXT_MODE))
# No tags and text_mode=False: test_docs is a list of plain token lists.
test_docs = list(iterable_corpus(all_docs[test_indices], text_mode=False))

model = Doc2Vec(vector_size=100, min_count=1, epochs=50)
model.build_vocab(train_docs)
model.train(train_docs, total_examples=model.corpus_count, epochs=model.epochs)


In [154]:
# Self-retrieval check on the training set: re-infer a vector for every
# training document and record the best (1-based) rank at which any of its own
# tags appears among all learned document vectors. A document whose tags are
# not found at all gets the worst possible rank, len(ranking).
ranks = []
for doc_id, train_doc in enumerate(train_docs):
    tags = train_doc.tags
    inferred_vector = model.infer_vector(train_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    ranking = [key for key, _score in sims]
    hit_positions = [ranking.index(tag) + 1 for tag in tags if tag in ranking]
    best_rank = min(hit_positions, default=len(ranking))
    ranks.append(best_rank)
ranker = Counter(ranks)
print("Top ranks", ranker.most_common(10))


Top ranks [(1, 768), (2, 53), (3, 41), (4, 24), (7, 24), (5, 24), (23, 22), (8, 21), (24, 20), (17, 19)]


In [161]:
# Pick one random test document and show its most / median / least similar
# entries among the trained document vectors.
doc_id = rng.choice(range(len(test_docs)))
# test_docs always holds plain token lists (it is built with text_mode=False
# and no tags), so there is no `.words` attribute to read in either mode.
test_tokens = test_docs[doc_id]
inferred_vector = model.infer_vector(test_tokens)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_tokens)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    key = sims[index][0]
    if TEXT_MODE:
        # Keys are positions in train_docs: show that document's text.
        shown = ' '.join(train_docs[key].words)
    else:
        # Keys are tag strings: show the tag itself. Joining it with spaces
        # would print it one character at a time.
        shown = key
    print(u'%s %s: «%s»\n' % (label, sims[index], shown))


Test Document (10): «this amphora is decorated on both sides but in different painting techniques one side has scene depicted in the red figure style and the other side shows the same scene in the black figure style this type of decoration puts the vase into the so called bilingual group the traditional attributions for the painter is the red figure side is by the andokides painter and the black figure side is by the lysippides painter this scene known from other representations in greek art depicts the heroes achilles and ajax playing board game the warriors wear their helmets and hold two spears each ajax has his right hand near the board ready to play when his turn comes both heroes wear short tunics chitoniskoi and are armed with corslet cushes greaves they wear short cloaks over their armour behind them their shields lean against something with their helmets perched on top behind them or beside them at arm reach both sit with the hither leg drawn back ajax is farther from the tabl

# Elasticsearch index

In [169]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk
from typing import Dict, Any


def create_painting_index(indx: str, es: Elasticsearch, feature_dims: int = 2048,
                          text_dims: int = 50):
    """
    (Re)create the paintings index with its field mappings.

    Any existing index named ``indx`` is deleted first (400/404 responses are
    ignored), then the index is created with ``text`` mappings for
    description, title and url and ``keyword`` mappings for style, genre,
    material and author.

    ``feature_dims`` and ``text_dims`` are currently unused: the
    ``img_vector`` / ``text_vector`` dense_vector mappings they were meant to
    size are not part of the mapping.
    """
    field_types = [
        ("description", "text"),
        ("style", "keyword"),
        ("genre", "keyword"),
        ("material", "keyword"),
        ("title", "text"),
        ("author", "keyword"),
        ("url", "text"),
    ]
    properties = {name: {"type": kind} for name, kind in field_types}
    index_body = {"mappings": {"properties": properties}}

    es.indices.delete(indx, ignore=[400, 404])
    es.indices.create(index=indx, body=index_body)


def stream_painting_docs(index: str, src_folder: os.PathLike) -> Iterable[Dict[str, Any]]:
    """
    Yield one bulk-index action per artwork that has a description.

    Args:
        index: name of the target index, stored under ``"_index"``.
        src_folder: despite the name, a glob pattern (it is passed straight to
            ``glob.glob``) matching the per-author JSON files.

    Yields:
        Dicts of the form ``{"_index": index, "_source": {...}}``. The source
        holds url, the CODED_TAGS fields, tags, description, author and title.
        The generator only builds actions; they must be handed to a bulk
        helper to actually be indexed.
    """
    for author in tqdm(glob.glob(src_folder)):
        # Context manager so the handle is closed for each file.
        with open(author, 'r', encoding='utf-8') as f:
            author_json = json.load(f)
        for work in author_json:
            desc = work.get("description", None)
            if not desc:
                continue
            # A work may have no "tags" entry at all; treat that as no tags
            # instead of failing on None.split. Fragments are stripped so that
            # "a, b" gives ["a", "b"] rather than ["a", " b"].
            raw_tags = work.get("tags", None) or ""
            tags = [part.strip() for part in raw_tags.split(",") if part.strip()]
            extra_tags = {tag: work.get(tag, None) for tag in CODED_TAGS}
            yield {
                "_index": index,
                "_source": {
                    "url": work.get("image", None),
                    **extra_tags,
                    "tags": tags,
                    "description": desc,
                    "author": work.get("artistName", "Unknown"),
                    "title": work.get("title", "Untitled")
                }
            }


def create_author_index(indx: str, es: Elasticsearch, text_dims: int = 50):
    """
    (Re)create the author index with its field mappings.

    Any existing index named ``indx`` is deleted first (400/404 responses are
    ignored). The new index maps ``text_vector`` as a ``dense_vector`` with
    ``text_dims`` dimensions, ``author`` as ``keyword`` and ``wiki`` as
    ``text``.
    """
    vector_mapping = {"type": "dense_vector", "dims": text_dims}
    properties = dict(
        text_vector=vector_mapping,
        author={"type": "keyword"},
        wiki={"type": "text"},
    )
    index_body = {"mappings": {"properties": properties}}

    es.indices.delete(indx, ignore=[400, 404])
    es.indices.create(index=indx, body=index_body)


def search_paintings(es: Elasticsearch, index: str, query: str, query_vector: List[float], ws: int, field_name: str,
                     text_field: str = "description"):
    """
    Full-text search followed by a vector-similarity rescore of the top hits.

    Args:
        es: client exposing ``search(index=..., body=...)``.
        index: index to search.
        query: free-text query, matched against ``text_field``.
        query_vector: query embedding; any iterable of numbers (it is converted
            to a list of floats so that numpy vectors serialise cleanly).
        ws: rescore window size (number of top hits that get rescored).
        field_name: name of the dense-vector field used by ``cosineSimilarity``.
        text_field: document field the ``match`` query runs against.

    Returns:
        The list found under ``['hits']['hits']`` of the search response.
    """
    # NOTE(review): field_name is interpolated into the script source; only
    # pass trusted field names.
    similarity_script = {
        "source": f"cosineSimilarity(params.query_vector, '{field_name}') + 1.0",
        "params": {
            "query_vector": [float(value) for value in query_vector]
        }
    }
    search_body = {
        "query": {
            # `match` is keyed by the field to search; the original keyed it by
            # the literal name "query", which is not a field of the documents.
            "match": {
                text_field: query
            }
        },
        "rescore": [
            {
                "window_size": ws,
                "query": {
                    # The query rescorer expects its query under "rescore_query".
                    "rescore_query": {
                        "script_score": {
                            "query": {
                                "match_all": {}
                            },
                            "script": similarity_script
                        }
                    }
                }
            }
        ]
    }
    results = es.search(index=index, body=search_body)
    return results['hits']['hits']


# Client shared by the indexing and search cells; it is built with no
# arguments, i.e. with the client library's default connection settings.
es = Elasticsearch()


In [170]:
# Recreate the index, then bulk-index every painting document.
create_painting_index("paintings", es)
# stream_painting_docs only *generates* bulk actions; they have to go through
# streaming_bulk to reach Elasticsearch. Iterating the generator directly would
# merely unpack the two keys of each action dict and index nothing.
# raise_on_error=False so that failed items are reported through `ok` and
# printed instead of aborting the whole run.
for ok, response in streaming_bulk(
        client=es,
        actions=stream_painting_docs(
            index="paintings", src_folder='/Users/jm/data/wikiart/meta/*.json'),
        raise_on_error=False):
    if not ok:
        print(response)



  0%|          | 0/2042 [00:00<?, ?it/s]

TypeError: 'set' object is not a mapping