In [235]:
import io
from surprise import KNNBaseline
from surprise import Dataset
from surprise import get_dataset_dir
from collections import defaultdict
import re
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import requests

def read_item_names(file_name=None):
    """Map each raw MovieLens item id to its ``(title, release date)`` pair.

    Args:
        file_name: Path to a pipe-separated, ISO-8859-1 encoded item file
            whose first three fields are id, title and release date.
            When omitted, the ``u.item`` file of the ml-100k dataset inside
            Surprise's dataset directory is used.

    Returns:
        dict: raw item id (str) -> ``(title, release_date)`` tuple of str.
    """
    if file_name is None:
        file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            fields = line.rstrip('\n').split('|')
            # Blank or truncated lines carry no id/title/date triple; skip
            # them instead of failing with an IndexError.
            if len(fields) < 3:
                continue
            rid_to_name[fields[0]] = (fields[1], fields[2])
    return rid_to_name


def get_top_n(predictions, n=5):
    """Collect the ``n`` best-scoring items for every user.

    Args:
        predictions: Iterable of 5-element records
            ``(user id, item id, true rating, estimate, details)``.
        n: Maximum number of items kept per user.

    Returns:
        defaultdict(list): user id -> list of ``(item id, estimate)`` pairs,
        estimates rounded to 3 decimals and ordered from highest to lowest.
    """
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, round(est, 3)))

    # Rank each user's candidates by estimate (stable for ties) and truncate.
    for uid in list(per_user):
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]
    return per_user

# Raw (string) MovieLens user id to build recommendations for.
user_id = "6"

# Train a user-based KNN-with-baselines model on the full ml-100k dataset.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 5}
algo = KNNBaseline(k=4, sim_options=sim_options)
algo.fit(trainset)

# Build the anti-testset (items the user has not rated) for this user only.
# trainset.build_anti_testset() would materialise the (user, item) pairs of
# every user just to discard almost all of them; the comprehension below
# produces the same entries, in the same order, for the single user we need.
# to_inner_uid raises ValueError when user_id is not part of the trainset.
inner_uid = trainset.to_inner_uid(user_id)
rated_items = {inner_iid for inner_iid, _ in trainset.ur[inner_uid]}
testset = [
    (user_id, trainset.to_raw_iid(inner_iid), trainset.global_mean)
    for inner_iid in trainset.all_items()
    if inner_iid not in rated_items
]
predictions = algo.test(testset)

top_n = get_top_n(predictions)
rid_to_name = read_item_names()

print('User ' + user_id)
films_list = []
for movie_rid, rating in top_n[user_id]:
    title = rid_to_name[movie_rid][0]
    # Drop only a trailing "(YYYY)" release year. A fixed [:-6] slice would
    # chop real characters off any title that does not end with a year.
    films_list.append(re.sub(r'\(\d{4}\)\s*$', '', title))
    print('{:4s} {:70s} {}'.format(movie_rid, str(rid_to_name[movie_rid]), rating))

films_list

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
User 6
814  ('Great Day in Harlem, A (1994)', '01-Jan-1994')                       5
1536 ('Aiqing wansui (1994)', '22-Jul-1996')                                5
1512 ('World of Apu, The (Apur Sansar) (1959)', '05-Apr-1996')              4.948
1599 ("Someone Else's America (1995)", '10-May-1996')                       4.884
1500 ('Santa with Muscles (1996)', '08-Nov-1996')                           4.882


['Great Day in Harlem, A ',
 'Aiqing wansui ',
 'World of Apu, The (Apur Sansar) ',
 "Someone Else's America ",
 'Santa with Muscles ']

In [236]:
# Normalise the film titles into the spelling used when searching Wikidata.

# Articles that MovieLens moves to the end of a title ("Great Day in Harlem, A").
TRAILING_ARTICLES = ('The', 'A', 'An', 'Le', 'La', 'Les', 'Il', 'El',
                     'Los', 'Las', 'Der', 'Die', 'Das', 'Un', 'Une')


def to_search_title(raw_title):
    """Turn a MovieLens-style title into a Wikidata search string.

    * A parenthesised alternative title wins: the text inside the first
      pair of parentheses is returned.
    * Otherwise a trailing article is moved back to the front
      ("Great Day in Harlem, A" -> "A Great Day in Harlem"). Only the last
      comma is considered, and only when it is followed by a known article,
      so titles with other commas are left intact.

    The result is stripped of surrounding whitespace, and applying the
    function to its own output returns it unchanged for titles without
    parentheses.
    """
    title = raw_title.strip()
    alternative = re.search(r'\((.*?)\)', title)
    if alternative:
        return alternative.group(1).strip()
    head, comma, tail = title.rpartition(',')
    if comma and tail.strip() in TRAILING_ARTICLES:
        return tail.strip() + ' ' + head.strip()
    return title


films_list = [to_search_title(title) for title in films_list]

films_list

['A Great Day in Harlem',
 'Aiqing wansui',
 'Apur Sansar',
 "Someone Else's America",
 'Santa with Muscles']

In [237]:
# URL of the Wikidata api.php endpoint; the wbsearchentities lookups are sent here.
API_ENDPOINT = "https://www.wikidata.org/w/api.php"

In [238]:
# Look up each normalised title with the Wikidata entity search and keep the
# (entity id, title) pairs whose description mentions "film".

# Seconds to wait for the API before giving up; without a timeout a stalled
# connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 10

query_films = []

for query in films_list:
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': 'en',
        'search': query,
    }
    res = requests.get(API_ENDPOINT, params=params, timeout=REQUEST_TIMEOUT)
    # Fail on HTTP errors here rather than on an unparsable body later.
    res.raise_for_status()
    # An API-level error payload has no 'search' key; treat it as "no hits".
    for film in res.json().get('search', []):
        if 'film' in film.get('description', ''):
            query_films.append((film['id'], query))

query_films

[('Q4657171', 'A Great Day in Harlem'),
 ('Q622376', 'Apur Sansar'),
 ('Q7219297', "Someone Else's America"),
 ('Q1631700', 'Santa with Muscles')]

In [240]:
# For every matched Wikidata film, ask the Wikidata Query Service for up to
# ten other films that share a production company with it, and print them.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)  # loop-invariant, so configured once

# pandas exposes json_normalize at top level since 1.0, where the
# pd.io.json.json_normalize spelling became deprecated; fall back to the old
# location on older installations.
_json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

# {qid} is replaced by the Wikidata id of the source film.
# NOTE(review): FILTER(?genre1 NOT IN (?genre)) keeps candidates having a genre
# that DIFFERS from a genre of the source film. If "similar" is meant to be
# "same genre", this condition looks inverted - confirm the intent.
SIMILAR_FILMS_QUERY = """
    SELECT DISTINCT ?film ?filmLabel
    WHERE
    {{
      ?film wdt:P31 wd:Q11424.
      wd:{qid} wdt:P136 ?genre.
      wd:{qid} wdt:P272 ?production_company.
      ?film wdt:P136 ?genre1.
      ?film wdt:P272 ?production_company.
      FILTER(?film != wd:{qid}).
      FILTER(?genre1 NOT IN (?genre)).
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    LIMIT 10
    """

for film_qid, film_title in query_films:
    # The id comes from an external API response and is spliced into the
    # query text, so make sure it really is a plain Wikidata Q-id.
    if not re.fullmatch(r'Q\d+', film_qid):
        raise ValueError('Unexpected Wikidata id: {!r}'.format(film_qid))

    sparql.setQuery(SIMILAR_FILMS_QUERY.format(qid=film_qid))
    results = sparql.query().convert()
    bindings = results['results']['bindings']
    results_df = _json_normalize(bindings)
    if not bindings:
        print(film_title, " : nothing similar found")
    else:
        print(film_title)
        print(results_df[['film.value', 'filmLabel.value']].head())

A Great Day in Harlem  : nothing similar found
Apur Sansar  : nothing similar found
Someone Else's America  : nothing similar found
Santa with Muscles  : nothing similar found
