In [1]:
import requests as rq
from urllib.parse import urljoin
import pandas as pd
import tqdm

In [2]:
URL = "https://www.filmweb.pl"

In [3]:
class FilmwebClient:
    def __init__(self, server) -> None:
        self.server = server
        self.common_headers = {'x-locale': 'en'}
        self.users = []
        self.critics = []
        self.movies = []


    def search_movies(self, page: int):
        url = urljoin(self.server, f'/api/v1/films/search?page={page}')
        return rq.get(url, headers=self.common_headers)


    def get_critics(self):
        url = urljoin(self.server, '/api/v1/film/critics')
        return rq.get(url, headers=self.common_headers)
    

    def get_movie_rating(self, movie_id):
        url = urljoin(self.server, f'/api/v1/film/{movie_id}/rating')
        return rq.get(url, headers=self.common_headers)
    

    def get_movie_critics(self, movie_id):
        url = urljoin(self.server, f'/api/v1/film/{movie_id}/critics')
        return rq.get(url, headers=self.common_headers)
    

    def get_movie_review(self, movie_id, critic_username):
        url = urljoin(self.server, f'/api/v1/user/{critic_username}/vote/film/{movie_id}')
        return rq.get(url, headers=self.common_headers)

    def get_reviewed_movies(self, critic_username):
        url = urljoin(self.server, f'/api/v1/user/{critic_username}/vote/film/')
        return rq.get(url, headers=self.common_headers)
    
    def get_reviewed_servies(self, critic_username):
        url = urljoin(self.server, f'/api/v1/user/{critic_username}/vote/serial')
        return rq.get(url, headers=self.common_headers)
    
    def get_tv_series_review(self, series_id, critic_username):
        url = urljoin(self.server, f'/api/v1/user/{critic_username}/vote/serial/{series_id}')
        return rq.get(url, headers=self.common_headers)

In [4]:
filmweb = FilmwebClient(URL)

# Get movies

In [None]:
def generator():
  while True:
    yield


page = 1
movies = []
for _ in tqdm.tqdm(generator()):
    response = filmweb.search_movies(page)
    if response.status_code != 200:
        print(f'Reached the end of movies list, last page: {page-1}, number of movies: {len(movies)}')
        break
    try:
        movies += response.json()['searchHits']
    except:
        print(f'Cannot parse movies from page {page}.')
    page+=1

Don't worry about above problem as long as the number of pages is reasonable.

In [8]:
df_movies = pd.DataFrame(movies)
df_movies.head()

Unnamed: 0,id,type
0,862,film
1,1048,film
2,998,film
3,671,film
4,9136,film


In [10]:
df_movies.count()

id      4630
type    4630
dtype: int64

Save so far collected data, just in case

In [69]:
df_movies[['id', 'type']].to_csv('intermidate/movies.csv', index=False)

# Get critics

In [14]:
df_critics = pd.DataFrame(filmweb.get_critics().json())
df_critics.head()

Unnamed: 0,id,userName,priority,publisher
0,1164541,michaloleszczyk,1,Spoiler Master Podcast
1,1243927,tru3,1,Magazyn SFP
2,1418184,Cateborough,1,POPcast - podcast o popkulturze
3,1589044,darek_arest,1,krytyk niezależny
4,3281021,smolinskisebastian,1,krytyk niezależny


In [15]:
df_critics.groupby('priority').count()

Unnamed: 0_level_0,id,userName,publisher
priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,9,9,9
2,10,10,10
3,13,13,13
4,17,17,17
5,7,7,7
6,10,10,10
7,1,1,1


In [16]:
df_critics.count()

id           67
userName     67
priority     67
publisher    67
dtype: int64

In [77]:
df_critics.to_csv('intermidate/critics.csv', index=False)

# Dataset preparation
Iterate over critics and get movies which they reviewed

In [144]:
results = []
critics = df_critics['userName'].values.tolist()

In [145]:
it = 0
while len(results) < len(df_critics['userName']):
    for critic in tqdm.tqdm(critics[it:]):
        try:
            movies_ids = list(map(lambda response: {'username': critic, 'movie_id': response[0]}, filmweb.get_reviewed_movies(critic).json()))
            results += movies_ids
        except Exception as e:
            print(f'Exception when getting {critic}\'s reviews. {e}')
            break
        it+=1

100%|██████████| 67/67 [00:10<00:00,  6.37it/s]


In [147]:
df_critics_reviews = pd.DataFrame(results)
df_critics_reviews.head()

Unnamed: 0,username,movie_id
0,michaloleszczyk,10014103
1,michaloleszczyk,10025735
2,michaloleszczyk,805204
3,michaloleszczyk,877700
4,michaloleszczyk,10026692


In [169]:
for i in tqdm.tqdm(df_critics_reviews.values[2000:]):
    pass

100%|██████████| 4324/4324 [00:00<00:00, 3106040.50it/s]


In [170]:
it = 0
results = []
while len(results) < len(df_critics_reviews.values) or it >= len(df_critics_reviews.values):
    for case in tqdm.tqdm(df_critics_reviews.values[it:]):
        try:
            response = filmweb.get_movie_review(critic_username=case[0], movie_id=case[1]).json()
            if 'comment' in response:
                movie_rating = filmweb.get_movie_rating(case[1]).json()
                results.append({'username': case[0], 
                                'movie_id': case[1],
                                'user_rate': response['rate'],
                                'user_comment': response['comment'],
                                'movie_rating': movie_rating['rate'],
                                'movie_rating_count': movie_rating['count']
                                })
        except Exception as e:
            print(f'Fetching data about movie rating failed: {e}')
            break
        it+=1

  3%|▎         | 165/6324 [03:00<1:52:10,  1.09s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/tru3/vote/film/10019460 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157464a0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  5%|▍         | 295/6159 [04:09<1:22:46,  1.18it/s] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/164870/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745db0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  3%|▎         | 176/5864 [02:45<1:29:21,  1.06it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10004934/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745b10>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  5%|▌         | 312/5688 [04:22<1:15:16,  1.19it/s] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10004934/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745b70>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|          | 39/5376 [02:19<5:17:37,  3.57s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/754800/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215746440>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  9%|▉         | 501/5337 [04:10<40:15,  2.00it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/sergiuszowo11/vote/film/10013518 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745f30>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  7%|▋         | 316/4836 [04:34<1:05:25,  1.15it/s] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Jakub_Demianczuk/vote/film/36469 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744b50>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  3%|▎         | 151/4520 [02:56<1:24:55,  1.17s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10020414/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745c90>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|▏         | 65/4369 [02:25<2:40:27,  2.24s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Arwen/vote/film/877594 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157458d0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|          | 22/4304 [02:15<7:18:27,  6.14s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/GrzegorzLaguna/vote/film/120749 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157456c0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  8%|▊         | 350/4282 [04:28<50:13,  1.30it/s]   


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/527837/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745120>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  9%|▊         | 336/3932 [03:32<37:59,  1.58it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10008680/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744df0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  4%|▍         | 157/3596 [03:57<1:26:43,  1.51s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/martabalaga/vote/film/829452 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744bb0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  4%|▎         | 128/3439 [03:39<1:34:28,  1.71s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Cosimo_Villa_Nova/vote/film/878157 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745750>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  6%|▌         | 190/3311 [03:26<56:37,  1.09s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/David_Frost/vote/film/10015096 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744a90>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  2%|▏         | 49/3121 [02:21<2:27:38,  2.88s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/David_Frost/vote/film/671049 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745f00>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  6%|▌         | 190/3072 [04:03<1:01:36,  1.28s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/bkkb/vote/film/710056 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157463e0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 10%|▉         | 286/2882 [04:18<39:02,  1.11it/s]   


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10007169/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157457b0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  4%|▎         | 93/2596 [02:41<1:12:34,  1.74s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/blizej_ekranu/vote/film/8649 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744460>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 13%|█▎        | 333/2503 [03:59<26:01,  1.39it/s]  


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Maciej_Niedzwiedzki/vote/film/10013458 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157459f0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 10%|▉         | 207/2170 [03:39<34:38,  1.06s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/830546/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744ac0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|          | 10/1963 [02:13<7:13:17, 13.31s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/piogus/vote/film/842133 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744b20>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  8%|▊         | 147/1953 [03:00<36:54,  1.23s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/826447/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215746290>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  5%|▌         | 91/1806 [02:39<50:10,  1.76s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/520943/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745330>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|▏         | 22/1715 [02:17<2:55:58,  6.24s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Muszynski/vote/film/1033 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744df0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  7%|▋         | 113/1693 [02:41<37:42,  1.43s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Marcin_P/vote/film/10012008 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745d80>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 15%|█▌        | 239/1580 [04:07<23:10,  1.04s/it]  


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/jultacz/vote/film/10014103 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157452d0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 28%|██▊       | 374/1341 [03:39<09:26,  1.71it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/847034/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157462f0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  6%|▌         | 59/967 [02:27<37:49,  2.50s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/santu/vote/film/857962 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157463b0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 32%|███▏      | 292/908 [04:07<08:42,  1.18it/s]  


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10039003/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215745330>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 32%|███▏      | 198/616 [03:14<06:50,  1.02it/s]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/10019146/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744eb0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 53%|█████▎    | 223/418 [04:24<03:51,  1.18s/it] 


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/MCh092788/vote/film/695409 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc215744f70>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 18%|█▊        | 36/195 [02:17<10:06,  3.81s/it]


Fetching data about movie rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/MCh092788/vote/film/31672 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2157456c0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


100%|██████████| 159/159 [00:40<00:00,  3.90it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


KeyboardInterrupt: 

In [171]:
ds_df = pd.DataFrame(results)
ds_df.head()

Unnamed: 0,username,movie_id,user_rate,user_comment,movie_rating,movie_rating_count
0,michaloleszczyk,10014103,8,Willa Hössa jako makieta najciemniejszej stron...,8.32095,296
1,michaloleszczyk,10025735,3,"Zgentryfikowana ""Mama i dziwka"" o smaku pumpki...",6.41777,833
2,michaloleszczyk,805204,9,"""Ile wilków widzisz na obrazku?"" A ile widzisz...",7.41684,9884
3,michaloleszczyk,10034333,9,Film bliski perfekcji; czysty i syty jak ronde...,7.36086,557
4,michaloleszczyk,10015490,9,Wielka niespodzianka i najbardziej oryginalny ...,5.44533,375


In [172]:
ds_df.count()

username              3057
movie_id              3057
user_rate             3057
user_comment          3057
movie_rating          3057
movie_rating_count    3057
dtype: int64

In [179]:
ds_df.to_json('../dataset/dataset_movies.json')

In [180]:
ds_df.to_csv('../dataset/dataset_movies.csv', index=False, sep='$', encoding='utf8')

TODO: fetch tv series analogously to movies
DO NOT MERGE THE DATASETS! We may consider if we want to train model only on movies, tv series or both
# TV series

In [5]:
df_critics = pd.read_csv('intermidate/critics.csv')
df_critics.head()

Unnamed: 0.1,Unnamed: 0,id,userName,priority,publisher
0,0,1164541,michaloleszczyk,1,Spoiler Master Podcast
1,1,1243927,tru3,1,Magazyn SFP
2,2,1418184,Cateborough,1,POPcast - podcast o popkulturze
3,3,1589044,darek_arest,1,krytyk niezależny
4,4,3281021,smolinskisebastian,1,krytyk niezależny


In [6]:
result = []
for critic in tqdm.tqdm(df_critics['userName']):
    response = filmweb.get_reviewed_servies(critic).json()
    result += list(map(lambda review: {'username':critic, 'series_id':review[0]}, response))
df_critics_reviews_series = pd.DataFrame(result)
df_critics_reviews_series.head()

100%|██████████| 67/67 [00:11<00:00,  5.79it/s]


Unnamed: 0,username,series_id
0,michaloleszczyk,94606
1,michaloleszczyk,867132
2,michaloleszczyk,868514
3,michaloleszczyk,876630
4,michaloleszczyk,862322


In [7]:
it = 0
results = []
while len(results) < len(df_critics_reviews_series.values):
    for case in tqdm.tqdm(df_critics_reviews_series.values[it:]):
        try:
            response = filmweb.get_tv_series_review(critic_username=case[0], series_id=case[1]).json()
            if 'comment' in response:
                series_rating = filmweb.get_movie_rating(case[1]).json()
                results.append({'username': case[0], 
                                'series_id': case[1],
                                'user_rate': response['rate'],
                                'user_comment': response['comment'],
                                'series_rating': series_rating['rate'],
                                'series_rating_count': series_rating['count']
                                })
        except Exception as e:
            print(f'Fetching data about tv series rating failed: {e}')
            break
        it+=1
    if it >= len(df_critics_reviews_series.values):
        break

  7%|▋         | 262/3734 [03:08<41:35,  1.39it/s]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/makina13/vote/serial/843886 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f603320fc40>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  9%|▉         | 329/3472 [04:30<43:02,  1.22it/s]  


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/ksiawcik_13/vote/serial/809555 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f60329545e0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  2%|▏         | 73/3143 [02:25<1:41:54,  1.99s/it]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/750359/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032954a60>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  5%|▍         | 145/3070 [02:39<53:42,  1.10s/it]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Igi3/vote/serial/746188 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032954b20>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  3%|▎         | 79/2925 [02:27<1:28:31,  1.87s/it]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Cinestetyk/vote/serial/809758 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032954a30>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  9%|▉         | 261/2846 [03:10<31:26,  1.37it/s]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/bartosz_czartoryski/vote/serial/10011822 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f60329544c0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 10%|▉         | 252/2585 [04:18<39:48,  1.02s/it]  


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/GrzegorzLaguna/vote/serial/646514 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032955870>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  7%|▋         | 167/2333 [02:43<35:24,  1.02it/s]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/832704/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032955db0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  6%|▌         | 133/2166 [02:49<43:18,  1.28s/it]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/no_face_no/vote/serial/876630 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032954dc0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  6%|▋         | 128/2033 [02:39<39:37,  1.25s/it]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Cosimo_Villa_Nova/vote/serial/801643 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f60333891b0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 19%|█▉        | 360/1905 [04:46<20:30,  1.26it/s]  


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/olberlin/vote/serial/87721 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f60329544f0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  0%|          | 0/1545 [02:11<?, ?it/s]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/olberlin/vote/serial/87721 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032955540>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 22%|██▏       | 344/1545 [03:39<12:45,  1.57it/s]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/Tutaj_17/vote/serial/10005323 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f60329551e0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


  1%|          | 10/1201 [02:13<4:24:13, 13.31s/it]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/868116/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032954130>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 25%|██▌       | 302/1191 [03:10<09:20,  1.59it/s]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/jultacz/vote/serial/848385 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f60329549a0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 20%|██        | 180/889 [03:49<15:03,  1.27s/it]  


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/EvvLe/vote/serial/875799 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f60329542e0>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 19%|█▉        | 135/709 [02:39<11:19,  1.18s/it]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/film/793789/rating (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032955720>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 27%|██▋       | 154/574 [02:45<07:32,  1.08s/it]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/mwol/vote/serial/99424 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032955990>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


 77%|███████▋  | 322/420 [03:35<01:05,  1.50it/s]


Fetching data about tv series rating failed: HTTPSConnectionPool(host='www.filmweb.pl', port=443): Max retries exceeded with url: /api/v1/user/dem3000/vote/serial/654010 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f6032955c60>, 'Connection to www.filmweb.pl timed out. (connect timeout=None)'))


100%|██████████| 98/98 [00:28<00:00,  3.50it/s]


In [8]:
ds_df = pd.DataFrame(results)
ds_df.head()

Unnamed: 0,username,series_id,user_rate,user_comment,series_rating,series_rating_count
0,michaloleszczyk,867132,8,NH. Grand finale mock-satanistycznej komedii V...,7.57339,436
1,michaloleszczyk,862322,10,Koronkowe komiczne eseje dokumentalne o urokac...,8.27645,4178
2,michaloleszczyk,819951,7,"Znakomicie zrealizowany serial, którego ostatn...",7.85234,10409
3,tru3,10011884,8,Ale to jest dobre! Absolutnie zaskakujący seri...,7.62731,7199
4,tru3,860894,5,Serial pozerski. Główny bohater - Czuły (Borys...,6.07103,4885


In [10]:
ds_df.to_json('../dataset/dataset_tv_series.json')
ds_df.to_csv('../dataset/dataset_tv_series.csv', sep='$', encoding='utf-8')

# Datasets merge

In [51]:
df_movies = pd.read_csv('../dataset/dataset_movies.csv', encoding='utf-8', sep='$')
df_tv_series = pd.read_csv('../dataset/dataset_series2.csv', encoding='utf-8', sep='$')

In [52]:
df_movies = df_movies.rename(axis='columns', mapper={'movie_id': 'show_id', 
                                                     'movie_rating': 'show_rating', 
                                                     'movie_rating_count': 'show_rating_count'})
df_movies.head()

Unnamed: 0,username,show_id,user_rate,user_comment,show_rating,show_rating_count
0,michaloleszczyk,10014103,8,Willa Hössa jako makieta najciemniejszej stron...,8.32095,296
1,michaloleszczyk,10025735,3,"Zgentryfikowana ""Mama i dziwka"" o smaku pumpki...",6.41777,833
2,michaloleszczyk,805204,9,"""Ile wilków widzisz na obrazku?"" A ile widzisz...",7.41684,9884
3,michaloleszczyk,10034333,9,Film bliski perfekcji; czysty i syty jak ronde...,7.36086,557
4,michaloleszczyk,10015490,9,Wielka niespodzianka i najbardziej oryginalny ...,5.44533,375


In [53]:
df_tv_series = df_tv_series.rename(axis='columns', mapper={'series_id': 'show_id', 
                                                           'series_rating': 'show_rating', 
                                                           'series_rating_count': 'show_rating_count'})
df_tv_series.head()

Unnamed: 0,username,show_id,user_rate,user_comment,show_rating,show_rating_count
0,michaloleszczyk,867132,8,NH. Grand finale mock-satanistycznej komedii V...,7.57339,436
1,michaloleszczyk,862322,10,Koronkowe komiczne eseje dokumentalne o urokac...,8.27645,4178
2,michaloleszczyk,819951,7,"Znakomicie zrealizowany serial, którego ostatn...",7.85234,10409
3,tru3,10011884,8,Ale to jest dobre! Absolutnie zaskakujący seri...,7.62731,7199
4,tru3,860894,5,Serial pozerski. Główny bohater - Czuły (Borys...,6.07103,4885


In [54]:
df_movies['type'] = 'movie'
df_tv_series['type'] = 'tv_series'

In [55]:
df_dataset = pd.concat([df_movies, df_tv_series], axis=0)
df_dataset

Unnamed: 0,username,show_id,user_rate,user_comment,show_rating,show_rating_count,type
0,michaloleszczyk,10014103,8,Willa Hössa jako makieta najciemniejszej stron...,8.32095,296,movie
1,michaloleszczyk,10025735,3,"Zgentryfikowana ""Mama i dziwka"" o smaku pumpki...",6.41777,833,movie
2,michaloleszczyk,805204,9,"""Ile wilków widzisz na obrazku?"" A ile widzisz...",7.41684,9884,movie
3,michaloleszczyk,10034333,9,Film bliski perfekcji; czysty i syty jak ronde...,7.36086,557,movie
4,michaloleszczyk,10015490,9,Wielka niespodzianka i najbardziej oryginalny ...,5.44533,375,movie
...,...,...,...,...,...,...,...
1326,alekhudzik,849662,3,O wow. Nie udało się zekranizować gry komputer...,7.77990,58245,tv_series
1327,alekhudzik,10012801,6,"No takie ten. ""1 świat 3 świat"", kalejdoskop p...",6.79320,353,tv_series
1328,alekhudzik,875491,7,Dwójka lepsza od jedynki. Zamykamy dyskusję,7.67484,40515,tv_series
1329,alekhudzik,877025,10,"Jest to najlepszy serial ""Coming of age"" świat...",7.22280,579,tv_series


In [57]:
df_dataset = df_dataset.reset_index(drop=True)

In [59]:
df_dataset.to_csv('../dataset/dataset.csv', encoding='utf-8', sep='$', index=False)
df_dataset.to_json('../dataset/dataset.json')

# Dataset cleanup
Filter out empty comments

In [84]:
df_dataset = pd.read_csv('../dataset/dataset.csv', sep='$', encoding='utf-8')
df_dataset

Unnamed: 0,username,show_id,user_rate,user_comment,show_rating,show_rating_count,type
0,michaloleszczyk,10014103,8,Willa Hössa jako makieta najciemniejszej stron...,8.32095,296,movie
1,michaloleszczyk,10025735,3,"Zgentryfikowana ""Mama i dziwka"" o smaku pumpki...",6.41777,833,movie
2,michaloleszczyk,805204,9,"""Ile wilków widzisz na obrazku?"" A ile widzisz...",7.41684,9884,movie
3,michaloleszczyk,10034333,9,Film bliski perfekcji; czysty i syty jak ronde...,7.36086,557,movie
4,michaloleszczyk,10015490,9,Wielka niespodzianka i najbardziej oryginalny ...,5.44533,375,movie
...,...,...,...,...,...,...,...
4383,alekhudzik,849662,3,O wow. Nie udało się zekranizować gry komputer...,7.77990,58245,tv_series
4384,alekhudzik,10012801,6,"No takie ten. ""1 świat 3 świat"", kalejdoskop p...",6.79320,353,tv_series
4385,alekhudzik,875491,7,Dwójka lepsza od jedynki. Zamykamy dyskusję,7.67484,40515,tv_series
4386,alekhudzik,877025,10,"Jest to najlepszy serial ""Coming of age"" świat...",7.22280,579,tv_series


In [85]:
df_dataset = df_dataset[~df_dataset['user_comment'].isna()].reset_index(drop=True)
df_dataset = df_dataset[df_dataset['user_comment'].str.len() > 3].reset_index(drop=True)
df_dataset

Unnamed: 0,username,show_id,user_rate,user_comment,show_rating,show_rating_count,type
0,michaloleszczyk,10014103,8,Willa Hössa jako makieta najciemniejszej stron...,8.32095,296,movie
1,michaloleszczyk,10025735,3,"Zgentryfikowana ""Mama i dziwka"" o smaku pumpki...",6.41777,833,movie
2,michaloleszczyk,805204,9,"""Ile wilków widzisz na obrazku?"" A ile widzisz...",7.41684,9884,movie
3,michaloleszczyk,10034333,9,Film bliski perfekcji; czysty i syty jak ronde...,7.36086,557,movie
4,michaloleszczyk,10015490,9,Wielka niespodzianka i najbardziej oryginalny ...,5.44533,375,movie
...,...,...,...,...,...,...,...
4276,alekhudzik,849662,3,O wow. Nie udało się zekranizować gry komputer...,7.77990,58245,tv_series
4277,alekhudzik,10012801,6,"No takie ten. ""1 świat 3 świat"", kalejdoskop p...",6.79320,353,tv_series
4278,alekhudzik,875491,7,Dwójka lepsza od jedynki. Zamykamy dyskusję,7.67484,40515,tv_series
4279,alekhudzik,877025,10,"Jest to najlepszy serial ""Coming of age"" świat...",7.22280,579,tv_series


In [None]:
df_dataset.to_csv('../dataset/dataset.csv', encoding='utf-8', sep='$', index=False)
df_dataset.to_json('../dataset/dataset.json')