In [1]:
import json
import os
import pickle
import urllib.request

import numpy as np
import pandas as pd

In [2]:
#creating a vector containing the columns names of a DataFrame
names = ['ratingMPAA', 'budgetData', 'ratingData', 
         'is3D', 'posterURL', 'webURL', 'filmLength', 
         'ratingAgeLimits', 'videoURL', 'genre', 'nameEN', 'hasSeance', 'isHasSimilarFilms', 
         'filmID', 'reviewsCount', 'country', 'isIMAX', 'isHasSequelsAndPrequelsFilms', 'rentData', 
         ]
print(names)

['ratingMPAA', 'budgetData', 'ratingData', 'is3D', 'posterURL', 'webURL', 'filmLength', 'ratingAgeLimits', 'videoURL', 'genre', 'nameEN', 'hasSeance', 'isHasSimilarFilms', 'filmID', 'reviewsCount', 'country', 'isIMAX', 'isHasSequelsAndPrequelsFilms', 'rentData']


In [3]:
#Write a function to get the data about the Film by ID
def getFilm(id):
    """
    Fetch the description of a single film from the kinopoisk API.

    Input: kinopoisk ID of the Movie (interpolated into the query string)
    Output: JSON object (the decoded response body), or None when the body
            is the JSON literal ``null``
    Raises: whatever ``urllib.request.urlopen`` raises on a network/HTTP
            failure, ``UnicodeDecodeError`` if the body is not UTF-8 and
            ``json.JSONDecodeError`` if it is not valid JSON
    """
    URL = 'http://api.kinopoisk.cf/getFilm?filmID=%s' % id
    # The context manager closes the HTTP response even when reading or
    # decoding fails, so a long scraping loop does not leak connections.
    with urllib.request.urlopen(URL) as result:
        json_text = result.read().decode('utf-8')
    if json_text == 'null':
        # presumably the API's way of saying "no such film" - confirm
        return None
    return json.loads(json_text)

In [4]:
#Write a function that creates a DataFrame based on JSON object
def totalDataSet(json_file, filmId):
    """
    Build a one-row DataFrame describing a single film.

    Input: JSON file containing an information about the Film (the mapping
           returned by getFilm), kinopoisk ID of the Movie
    Output: Pandas DataFrame with a single row indexed by filmId. The first
            column is 'FilmID'; the remaining columns follow the order of
            the module-level ``names`` list.

    Notes:
      * For 'budgetData', 'ratingData' and 'rentData' the JSON value is
        passed to the DataFrame constructor without column labels, so its
        keys become the columns (assumes the value is a flat mapping -
        TODO confirm against real API responses).
      * A field that is absent from json_file is stored as the *string*
        "NaN" (not numpy.nan), so ``isnull()`` will not detect it. This is
        kept as-is because previously pickled data uses the same marker.
      * An absent nested field produces a placeholder column labelled 0;
        getFilmsbyRange drops that column.
    """
    # Fields whose JSON value is expanded into several columns.
    nested_fields = ('budgetData', 'ratingData', 'rentData')
    available = json_file.keys()

    # Collect the single-row pieces and join them once at the end instead
    # of re-copying the growing frame on every iteration.
    pieces = [pd.DataFrame(index=[filmId], columns=['FilmID'], data=[filmId])]
    for name in names:
        payload = json_file[name] if name in available else ["NaN"]
        if name in nested_fields:
            pieces.append(pd.DataFrame(payload, index=[filmId]))
        else:
            pieces.append(pd.DataFrame(payload, index=[filmId], columns=[name]))
    return pd.concat(pieces, axis=1)

In [5]:
def getFilmsbyRange(startId, chunk_size=1000):
    """
    Getting data for the films with ID between [startId, startId + chunk_size)
    and returning a tuple (df, filename). Nothing is written to disk here;
    pass the returned tuple to saveDFToFile for that.

    Input: startId    - first kinopoisk ID to request
           chunk_size - number of consecutive IDs to request (default 1000)
    Output: tuple (df, filename). df has one row per ID for which getFilm
            returned data, indexed by film ID, preceded by a placeholder
            row with index 0 and FilmID None. filename is the pickle path
            "./data/moviesID<startId>_<endId>.pkl".
    """
    endId = startId + chunk_size
    filename = "./data/moviesID%s_%s.pkl" % (startId, endId)
    # The placeholder row is kept for backward compatibility with frames
    # that were already pickled; the preprocessing step filters index 0 out.
    frames = [pd.DataFrame(index=[0], columns=['FilmID'], data=[None])]
    for film_id in range(startId, endId):
        json_film = getFilm(film_id)
        if json_film is None:
            continue
        df_film = totalDataSet(json_film, film_id)
        # totalDataSet labels the placeholder for a missing nested field
        # (budget/rating/rent data) with 0; it carries no information.
        if 0 in df_film.columns:
            df_film = df_film.drop(columns=[0])
        frames.append(df_film)
    # A single concat replaces the removed DataFrame.append and avoids
    # re-copying the accumulated frame for every film. sort=False keeps the
    # columns in order of first appearance, as append did.
    frame_to_save = pd.concat(frames, sort=False)
    print("OK")
    return (frame_to_save, filename)

In [6]:
def saveDFToFile(data):
    """
    Pickle a dataframe to the given path.

    Input: tuple (df, filename), where df is a dataframe
    Output: file containing the df; the parent directory of filename is
            created when it does not exist yet
    """
    dataframe, filename = data[0], data[1]
    # getFilmsbyRange builds paths under ./data/, which may not exist on a
    # fresh checkout; create it rather than fail after a long download.
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # The context manager closes the file even if pickling raises.
    with open(filename, 'wb') as file:
        pickle.dump(dataframe, file)
    print("Everything is fine. Data is saved.")

In [7]:
def getResfromPickle(filename):
    """
    Simply opens a file and loads a dataframe from it
    Input: filename
    Output: dataframe (whatever object was pickled into the file)
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by saveDFToFile or come from another trusted source.
    # The context manager closes the file even if unpickling raises.
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [8]:
#example usage. Getting films with IDs from 41000 to 42000
#filmData = getFilmsbyRange(41000)

In [9]:
#saving data
#saveDFToFile(filmData)

In [10]:
#loading data from a file
#getResfromPickle("./data/moviesID41000_42000.pkl")

# Using multiprocessing

In [11]:
#from multiprocessing import Pool
#from multiprocessing.dummy import Pool as ThreadPool

#pool = ThreadPool(8)

In [29]:
#a = [1000*i for i in range(200,225)]

In [1]:
#%%time
#data = pool.map(getFilmsbyRange, a)

In [2]:
#for i in data:
#    saveDFToFile(i)

In [32]:
#a = [1000*i for i in range(225,250)]

In [3]:
#%%time
#data = pool.map(getFilmsbyRange, a)

In [4]:
#for i in data:
#    saveDFToFile(i)

In [35]:
#a = [1000*i for i in range(250,275)]

In [5]:
#%%time
#data = pool.map(getFilmsbyRange, a)

In [None]:
#for i in data:
#    saveDFToFile(i)

In [None]:
#a = [1000*i for i in range(275,300)]

In [None]:
#%%time
#data = pool.map(getFilmsbyRange, a)

In [None]:
#for i in data:
#    saveDFToFile(i)

# Creating a final dataframe

In [None]:
#import os

In [None]:
#files = os.listdir("./data/")

In [None]:
#os.chdir("./data/")
#total_frame = pd.DataFrame(index=[0], columns=['FilmID'], data=[None])
#for file in files:
#    total_frame = total_frame.append(getResfromPickle(file))

# Preprocessing data

In [None]:
#import matplotlib.pyplot as plt
#%matplotlib inline

#import numpy as np

In [None]:
#deleting unnecessary data (the placeholder row with index 0)
#df = total_frame[total_frame.index != 0]

In [None]:
#def fromStrToNum(string):
#    return int(string.replace(" ", ""))

In [None]:
#fix the budget and box-office data in the columns whose dtype is object
#moneyData = ['budget', 'grossRU', 'grossUSA', 'grossWorld']

#def repairMoney(moneyData = moneyData):
#    global df
#    for colname in moneyData:
#        df[colname] = df[df[colname].isnull() == False][colname].apply(fromStrToNum)

In [None]:
#repairMoney()

In [None]:
#df.dtypes

In [None]:
#df[df['budget'] > 4000000]

In [None]:
#total_frame.to_csv("../totalFrame.csv")