In [76]:
import json
import urllib.request
import pandas as pd
import numpy as np

In [77]:
#creating a vector containing the columns names of a DataFrame
names = ['ratingMPAA', 'budgetData', 'ratingData', 
         'is3D', 'posterURL', 'webURL', 'filmLength', 
         'ratingAgeLimits', 'videoURL', 'genre', 'nameEN', 'hasSeance', 'isHasSimilarFilms', 
         'filmID', 'reviewsCount', 'country', 'isIMAX', 'isHasSequelsAndPrequelsFilms', 'rentData', 
         ]
print(names)

['ratingMPAA', 'budgetData', 'ratingData', 'is3D', 'posterURL', 'webURL', 'filmLength', 'ratingAgeLimits', 'videoURL', 'genre', 'nameEN', 'hasSeance', 'isHasSimilarFilms', 'filmID', 'reviewsCount', 'country', 'isIMAX', 'isHasSequelsAndPrequelsFilms', 'rentData']


In [5]:
#Write a function to get the data about the Film by ID
def getFilm(id):
    """
    Input: kinopoisk ID of the Movie
    Output: JSON object
    """
    URL = 'http://api.kinopoisk.cf/getFilm?filmID=%s'%id
    result = urllib.request.urlopen(URL)
    encoding = 'utf-8'
    json_text = result.read().decode(encoding)
    if json_text == 'null':
        return None
    else:
        return json.loads(json_text)

In [6]:
#Write a function that creates a DataFrame based on JSON object
def totalDataSet(json_file, filmId):
    """
    Input: JSON file containing an information about the Film, kinopoisk ID of the Movie
    Output: Pandas DataFrame    
    """
    totalFrame = pd.DataFrame(index=[filmId], columns=['FilmID'], data=[filmId])
    for name in names:
        if name not in json_file.keys():
            if name in ['budgetData', 'ratingData', 'rentData']:
                data = pd.DataFrame(["NaN"], index=[filmId])
                totalFrame = pd.concat([totalFrame, data], axis=1)
            else:
                data = pd.DataFrame(["NaN"], index=[filmId], columns=[name])
                totalFrame = pd.concat([totalFrame, data], axis=1)
        elif name in ['budgetData', 'ratingData', 'rentData', ]:
            data = pd.DataFrame(json_file[name], index=[filmId])
            totalFrame = pd.concat([totalFrame, data], axis=1)
        else:
            data = pd.DataFrame(json_file[name], index=[filmId], columns=[name])
            totalFrame = pd.concat([totalFrame, data], axis=1)
    return totalFrame

In [138]:
def getFilmsbyRange(params):
    """
    Getting data for the films with ID between [startId, endId) and returning a tuple (df, filename)
    Input: startId, endId
    Output: file containing the dataframe
    """
    startId = params[0]
    endId = params[1]
    filename = "moviesID%s_%s.pkl"%(startId, endId)
    frame_to_save = pd.DataFrame(index=[0], columns=['FilmID'], data=[None])
    for i in range(startId, endId):
        json_film = getFilm(i)
        if json_film is None:
            return None
        df_film = totalDataSet(json_film, i)
        if df_film is not None:
            if 0 in df_film.columns:
                df_film = df_film.drop(0, 1)
            df_film.drop_duplicates(keep=False)
        frame_to_save = frame_to_save.append(df_film)
    return (frame_to_save, filename)

In [72]:
def saveDFToFile(data):
    """
    Input: tuple (df, filename), where df is a dataframe
    Output: file containing the df
    """
    filename = data[1]
    dataframe = data[0]
    import pickle
    file = open(filename, 'wb')
    pickle.dump(dataframe, file)
    file.close()
    print("Everything is fine. Data is saved.")

In [73]:
def getResfromPickle(filename):
    """
    Simply opens a file and loads a dataframe from it
    Input: filename
    Output: dataframe
    """
    import pickle
    file = open(filename, 'rb')
    resultDF = pickle.load(file)
    file.close()
    return resultDF

In [75]:
#example usage. Getting films from 1000 to 2000
filmData = get1000films(1000, 2000)

In [68]:
#saving data
saveDFToFile(filmData)

In [82]:
#loading data from a file
getResfromPickle("./moviesID1100_1150.pkl").head()

Unnamed: 0,FilmID,budget,country,distributorRelease,filmID,filmLength,genre,grossUSA,grossWorld,hasSeance,...,ratingFilmCriticsVoteCount,ratingGoodReview,ratingGoodReviewVoteCount,ratingIMDb,ratingIMDbVoteCount,ratingMPAA,ratingVoteCount,reviewsCount,videoURL,webURL
0,,,,,,,,,,,...,,,,,,,,,,
1100,1100.0,,"Германия, США",,1100.0,1:35,"драма, комедия",912 082,,,...,26.0,,,5.3,2 266,PG-13,38.0,,,http://www.kinopoisk.ru/film/1100/
1101,1101.0,,США,,1101.0,1:47,"драма, биография",89 611,,,...,,,,6.6,1 161,PG,42.0,,,http://www.kinopoisk.ru/film/1101/
1102,1102.0,40 000 000,"Великобритания, Япония",,1102.0,2:02,"драма, комедия",1 519 366,,,...,12.0,50%,1.0,5.3,3 249,PG-13,390.0,4.0,,http://www.kinopoisk.ru/film/1102/
1103,1103.0,13 000 000,США,,1103.0,1:38,"мелодрама, комедия",3 125 424,,,...,19.0,100%,2.0,5.9,2 589,PG-13,169.0,2.0,,http://www.kinopoisk.ru/film/1103/


# Using multiprocessing

In [141]:
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool

pool = ThreadPool(8)

In [139]:
#%%time
a = [[1000*i, 0] for i in range(1,50)]
b = [[0, 1000+1000*k] for k in range(1,50)]

for i in range(len(a)):
    a[i][1] = b[i][1]
    a[i] = tuple(a[i])

In [144]:
pool.map(getFilmsbyRange, a)