# Report of Programming Challenge

My custom class `Wikidata` is attached at the bottom of this report; you can also find it at this [link](https://github.com/JiashengWu/Wikidata/blob/master/Wikidata.py).

In [1]:
from Wikidata import Wikidata

# Load every property file matching the glob pattern once; this instance is
# the module-level `wikidata` that test() queries for each sample file.
wikidata = Wikidata(r'programming_challenge/P*.txt')

In [2]:
import time, os

def test(relpath):
    """Score one sample file against the loaded properties and print a report.

    Prints the sample's file name, the time spent reading and scoring it,
    and the ten highest-scoring (filename, score) tuples, followed by '...'
    when there are more results than were printed.

    Parameters:
        relpath (string): The relative path of the sample file to score.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # short intervals; time.time() can jump when the system clock is adjusted.
    time_start = time.perf_counter()
    strings = Wikidata.file_to_list(relpath)
    scores = wikidata.find_property(strings)
    time_used = time.perf_counter() - time_start
    print('FILE: {}'.format(os.path.basename(relpath)))
    print('TIME: {:.3f} s'.format(time_used))
    for score in scores[:10]:
        print(score)
    if len(scores) > 10:
        print('...')

## Tests

In [7]:
# Score sample1.txt against all loaded property files; prints the timing and the top ten matches.
test('programming_challenge/sample1.txt')

FILE: sample1.txt
TIME: 0.047 s
('P882_FIPS 6-4 (US counties).txt', 146)
('P374_INSEE municipality code.txt', 124)
('P830_Encyclopedia of Life ID.txt', 115)
('P3064_LepIndex ID.txt', 109)
('P6018_SeaLifeBase ID.txt', 108)
('P354_HGNC ID.txt', 106)
('P4129_Cinema Treasures ID.txt', 104)
('P5736_Minor Planet Center body ID.txt', 97)
('P815_ITIS TSN.txt', 97)
('P3151_iNaturalist taxon ID.txt', 94)
...


In [8]:
# Score sample2.txt against all loaded property files; prints the timing and the top ten matches.
test('programming_challenge/sample2.txt')

FILE: sample2.txt
TIME: 0.046 s
('P3417_Quora topic ID.txt', 144)
('P4024_ADW taxon ID.txt', 2)
('P5242_ARWU university ID.txt', 1)
('P3984_subreddit.txt', 1)
('P3847_Open Library subject ID.txt', 0)
('P6063_Scoresway basketball person ID.txt', 0)
('P2518_Scope.dk film ID.txt', 0)
('P2387_Elonet person ID.txt', 0)
('P4358_Dutch lost building register ID.txt', 0)
('P6071_Polish Olympic Committee athlete ID.txt', 0)
...


In [9]:
# Score sample3.txt against all loaded property files; prints the timing and the top ten matches.
test('programming_challenge/sample3.txt')

FILE: sample3.txt
TIME: 0.009 s
('P882_FIPS 6-4 (US counties).txt', 9)
('P830_Encyclopedia of Life ID.txt', 8)
('P3064_LepIndex ID.txt', 7)
('P374_INSEE municipality code.txt', 7)
('P354_HGNC ID.txt', 7)
('P351_Entrez Gene ID.txt', 7)
('P3143_elFilm film ID.txt', 7)
('P772_INE municipality code.txt', 7)
('P5736_Minor Planet Center body ID.txt', 6)
('P4129_Cinema Treasures ID.txt', 6)
...


In [10]:
# Score sample4.txt against all loaded property files; prints the timing and the top ten matches.
test('programming_challenge/sample4.txt')

FILE: sample4.txt
TIME: 0.010 s
('P3417_Quora topic ID.txt', 9)
('P5242_ARWU university ID.txt', 1)
('P3847_Open Library subject ID.txt', 0)
('P6063_Scoresway basketball person ID.txt', 0)
('P2518_Scope.dk film ID.txt', 0)
('P2387_Elonet person ID.txt', 0)
('P4358_Dutch lost building register ID.txt', 0)
('P6071_Polish Olympic Committee athlete ID.txt', 0)
('P4605_South Dakota Sports Hall of Fame ID.txt', 0)
('P2871_miRBase mature miRNA ID.txt', 0)
...


## class `Wikidata`

In [None]:
import glob
import os


class Wikidata:
    """Wikidata class, which provides the find_property() function.

    Attributes:
        paths (list): The file paths of the Wikidata property files.
        data (dict): Data stored as {path: example_set}.
    """

    def __init__(self, relpath, encoding='utf-8'):
        """The constructor of Wikidata class.

        Read the input files only once, and store them in the instance
        attribute 'data' so that later queries need no file I/O.

        Parameters:
            relpath (string): The relative path (glob pattern) of the Wikidata property files.
            encoding (string): The text encoding used to decode the property files.
        """
        self.paths = glob.glob(relpath)
        # Each file becomes a set so membership tests in find_property() are O(1).
        self.data = {
            path: set(self.file_to_list(path, encoding=encoding))
            for path in self.paths
        }

    def find_property(self, strings):
        """Find property score (the number of strings in the input that are contained in a file).

        Duplicate input strings are counted once per occurrence.

        Parameters:
            strings (list): The input list of strings (any iterable of strings is accepted).
        Returns:
            scores (list): A list of (filename, score) tuples sorted by descending score.
        """
        # Materialise once: the input is scanned once per property file, so a
        # one-shot iterator would otherwise be exhausted after the first file.
        strings = list(strings)
        scores = [
            (os.path.basename(path),
             sum(1 for string in strings if string in examples))
            for path, examples in self.data.items()
        ]
        # The sort is stable, so files with equal scores keep their loading order.
        scores.sort(key=lambda item: item[1], reverse=True)
        return scores

    @staticmethod
    def file_to_list(relpath, encoding='utf-8'):
        """Convert a Wikidata property file to a list of strings.

        Parameters:
            relpath (string): The relative path of ONE Wikidata property file.
            encoding (string): The text encoding of the file. Passed explicitly so
                that decoding does not depend on the platform's locale default.
        Returns:
            examples (list): A list containing every line of the input file,
                with leading and trailing whitespace stripped.
        """
        with open(relpath, mode='r', encoding=encoding) as property_file:
            return [line.strip() for line in property_file]


# Script entry point: a minimal demo that loads all property files, scores
# sample1.txt against them, and prints the full sorted (filename, score) list.
if __name__ == '__main__':
    wikidata = Wikidata(r'programming_challenge/P*.txt')
    strings = Wikidata.file_to_list('programming_challenge/sample1.txt')
    scores = wikidata.find_property(strings)
    print(scores)
