In [99]:
import requests as rq
from urllib.parse import urljoin
import pandas as pd
import tqdm
from time import sleep


In [4]:
URL = "https://www.filmweb.pl"

In [116]:
class FilmwebClient:
    def __init__(self, server) -> None:
        self.server = server
        self.common_headers = {'x-locale': 'en'}
        self.users = []
        self.critics = []
        self.movies = []


    def search_movies(self, page: int):
        url = urljoin(self.server, f'/api/v1/films/search?page={page}')
        return rq.get(url, headers=self.common_headers)


    def get_critics(self):
        url = urljoin(self.server, '/api/v1/film/critics')
        return rq.get(url, headers=self.common_headers)
    

    def get_movie_rating(self, movie_id):
        url = urljoin(self.server, f'/api/v1/film/{movie_id}/rating')
        return rq.get(url, headers=self.common_headers)
    

    def get_movie_critics(self, movie_id):
        url = urljoin(self.server, f'/api/v1/film/{movie_id}/critics')
        return rq.get(url, headers=self.common_headers)
    

    def get_movie_review(self, movie_id, critic_username):
        url = urljoin(self.server, f'/api/v1/user/{critic_username}/vote/film/{movie_id}')
        return rq.get(url, headers=self.common_headers)

    def get_reviewed_movies(self, critic_username):
        url = urljoin(self.server, f'/api/v1/user/{critic_username}/vote/film/')
        return rq.get(url, headers=self.common_headers)
    

In [117]:
filmweb = FilmwebClient(URL)

# Get movies

In [66]:
def generator():
  while True:
    yield


page = 1
movies = []
for _ in tqdm.tqdm(generator()):
    response = filmweb.search_movies(page)
    if response.status_code != 200:
        print(f'Reached the end of movies list, last page: {page-1}, number of movies: {len(movies)}')
        break
    try:
        movies += response.json()['searchHits']
    except:
        print(f'Cannot parse movies from page {page}.')
    page+=1

463it [03:37,  2.13it/s]


ConnectTimeout: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/films/search?page=464 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2189dc610>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))

Don't worry about above problem as long as the number of pages is reasonable.

In [67]:
df_movies = pd.DataFrame(movies)
df_movies.head()

Unnamed: 0,id,type,filmMainCast
0,862,film,"[{'id': 124, 'name': 'Tom Hanks'}, {'id': 3454..."
1,1048,film,"[{'id': 90, 'name': 'Tim Robbins'}, {'id': 176..."
2,998,film,"[{'id': 124, 'name': 'Tom Hanks'}, {'id': 5216..."
3,671,film,"[{'id': 88, 'name': 'Jean Reno'}, {'id': 76, '..."
4,9136,film,"[{'id': 3366, 'name': 'Ellen Burstyn'}, {'id':..."


In [68]:
df_movies.count()

id              4630
type            4630
filmMainCast    4618
dtype: int64

Save so far collected data, just in case

In [69]:
df_movies[['id', 'type']].to_csv('intermidate/movies.csv', index=False)

# Get critics

In [74]:
df_critics = pd.DataFrame(filmweb.get_critics().json())
df_critics.head()

Unnamed: 0,id,userName,priority,publisher
0,1164541,michaloleszczyk,1,Spoiler Master Podcast
1,1243927,tru3,1,Magazyn SFP
2,1418184,Cateborough,1,POPcast - podcast o popkulturze
3,1589044,darek_arest,1,krytyk niezależny
4,3281021,smolinskisebastian,1,krytyk niezależny


In [75]:
df_critics.groupby('priority').count()

Unnamed: 0_level_0,id,userName,publisher
priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,9,9,9
2,10,10,10
3,13,13,13
4,17,17,17
5,7,7,7
6,10,10,10
7,1,1,1


In [76]:
df_critics.count()

id           67
userName     67
priority     67
publisher    67
dtype: int64

In [77]:
df_critics.to_csv('intermidate/critics.csv')

# Dataset preparation
Iterate over critics and get movies which they reviewed

In [144]:
results = []
critics = df_critics['userName'].values.tolist()

In [145]:
it = 0
while len(results) < len(df_critics['userName']):
    for critic in tqdm.tqdm(critics[it:]):
        try:
            movies_ids = list(map(lambda response: {'username': critic, 'movie_id': response[0]}, filmweb.get_reviewed_movies(critic).json()))
            results += movies_ids
        except Exception as e:
            print(f'Exception when getting {critic}\'s reviews. {e}')
            break
        it+=1

100%|██████████| 67/67 [00:10<00:00,  6.37it/s]


In [147]:
df_critics_reviews = pd.DataFrame(results)
df_critics_reviews.head()

Unnamed: 0,username,movie_id
0,michaloleszczyk,10014103
1,michaloleszczyk,10025735
2,michaloleszczyk,805204
3,michaloleszczyk,877700
4,michaloleszczyk,10026692


In [169]:
for i in tqdm.tqdm(df_critics_reviews.values[2000:]):
    pass

100%|██████████| 4324/4324 [00:00<00:00, 3106040.50it/s]


In [170]:
it = 0
results = []
while len(results) < len(df_critics_reviews.values) or it >= len(df_critics_reviews.values):
    for case in tqdm.tqdm(df_critics_reviews.values[it:]):
        try:
            response = filmweb.get_movie_review(critic_username=case[0], movie_id=case[1]).json()
            if 'comment' in response:
                movie_rating = filmweb.get_movie_rating(case[1]).json()
                results.append({'username': case[0], 
                                'movie_id': case[1],
                                'user_rate': response['rate'],
                                'user_comment': response['comment'],
                                'movie_rating': movie_rating['rate'],
                                'movie_rating_count': movie_rating['count']
                                })
        except Exception as e:
            print(f'Fetching data about movie rating failed: {e}')
            break
        it+=1

  3%|▎         | 165/6324 [03:00<1:52:10,  1.09s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/tru3/vote/film/10019460 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157464a0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  5%|▍         | 295/6159 [04:09<1:22:46,  1.18it/s] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/164870/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745db0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  3%|▎         | 176/5864 [02:45<1:29:21,  1.06it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10004934/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745b10>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  5%|▌         | 312/5688 [04:22<1:15:16,  1.19it/s] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10004934/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745b70>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|          | 39/5376 [02:19<5:17:37,  3.57s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/754800/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215746440>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  9%|▉         | 501/5337 [04:10<40:15,  2.00it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/sergiuszowo11/vote/film/10013518 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745f30>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  7%|▋         | 316/4836 [04:34<1:05:25,  1.15it/s] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Jakub_Demianczuk/vote/film/36469 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744b50>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  3%|▎         | 151/4520 [02:56<1:24:55,  1.17s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10020414/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745c90>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|▏         | 65/4369 [02:25<2:40:27,  2.24s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Arwen/vote/film/877594 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157458d0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|          | 22/4304 [02:15<7:18:27,  6.14s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/GrzegorzLaguna/vote/film/120749 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157456c0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  8%|▊         | 350/4282 [04:28<50:13,  1.30it/s]   


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/527837/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745120>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  9%|▊         | 336/3932 [03:32<37:59,  1.58it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10008680/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744df0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  4%|▍         | 157/3596 [03:57<1:26:43,  1.51s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/martabalaga/vote/film/829452 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744bb0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  4%|▎         | 128/3439 [03:39<1:34:28,  1.71s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Cosimo_Villa_Nova/vote/film/878157 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745750>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  6%|▌         | 190/3311 [03:26<56:37,  1.09s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/David_Frost/vote/film/10015096 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744a90>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  2%|▏         | 49/3121 [02:21<2:27:38,  2.88s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/David_Frost/vote/film/671049 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745f00>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  6%|▌         | 190/3072 [04:03<1:01:36,  1.28s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/bkkb/vote/film/710056 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157463e0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 10%|▉         | 286/2882 [04:18<39:02,  1.11it/s]   


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10007169/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157457b0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  4%|▎         | 93/2596 [02:41<1:12:34,  1.74s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/blizej_ekranu/vote/film/8649 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744460>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 13%|█▎        | 333/2503 [03:59<26:01,  1.39it/s]  


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Maciej_Niedzwiedzki/vote/film/10013458 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157459f0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 10%|▉         | 207/2170 [03:39<34:38,  1.06s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/830546/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744ac0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|          | 10/1963 [02:13<7:13:17, 13.31s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/piogus/vote/film/842133 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744b20>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  8%|▊         | 147/1953 [03:00<36:54,  1.23s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/826447/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215746290>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  5%|▌         | 91/1806 [02:39<50:10,  1.76s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/520943/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745330>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|▏         | 22/1715 [02:17<2:55:58,  6.24s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Muszynski/vote/film/1033 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744df0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  7%|▋         | 113/1693 [02:41<37:42,  1.43s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Marcin_P/vote/film/10012008 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745d80>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 15%|█▌        | 239/1580 [04:07<23:10,  1.04s/it]  


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/jultacz/vote/film/10014103 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157452d0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 28%|██▊       | 374/1341 [03:39<09:26,  1.71it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/847034/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157462f0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  6%|▌         | 59/967 [02:27<37:49,  2.50s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/santu/vote/film/857962 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157463b0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 32%|███▏      | 292/908 [04:07<08:42,  1.18it/s]  


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10039003/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745330>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 32%|███▏      | 198/616 [03:14<06:50,  1.02it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10019146/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744eb0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 53%|█████▎    | 223/418 [04:24<03:51,  1.18s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/MCh092788/vote/film/695409 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744f70>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 18%|█▊        | 36/195 [02:17<10:06,  3.81s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/MCh092788/vote/film/31672 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157456c0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


100%|██████████| 159/159 [00:40<00:00,  3.90it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


KeyboardInterrupt: 

In [171]:
ds_df = pd.DataFrame(results)
ds_df.head()

Unnamed: 0,username,movie_id,user_rate,user_comment,movie_rating,movie_rating_count
0,michaloleszczyk,10014103,8,Willa Hössa jako makieta najciemniejszej stron...,8.32095,296
1,michaloleszczyk,10025735,3,"Zgentryfikowana ""Mama i dziwka"" o smaku pumpki...",6.41777,833
2,michaloleszczyk,805204,9,"""Ile wilków widzisz na obrazku?"" A ile widzisz...",7.41684,9884
3,michaloleszczyk,10034333,9,Film bliski perfekcji; czysty i syty jak ronde...,7.36086,557
4,michaloleszczyk,10015490,9,Wielka niespodzianka i najbardziej oryginalny ...,5.44533,375


In [172]:
ds_df.count()

username              3057
movie_id              3057
user_rate             3057
user_comment          3057
movie_rating          3057
movie_rating_count    3057
dtype: int64

In [179]:
ds_df.to_json('../dataset/dataset.json')

In [180]:
ds_df.to_csv('../dataset/dataset.csv', index=False, sep='$', encoding='utf8')

TODO: fetch tv series analogously to movies
DO NOT MERGE THE DATASETS! We may consider if we want to train model only on movies, tv series or both
# TV series