In [1]:
import sys, os, pathlib
#setting the path to folder with modules ('python' folder two levels above the working directory)
_modules_dir = str(pathlib.Path(os.getcwd()).parents[1] / 'python')
#guarded so that re-running the cell does not add the same folder to sys.path again
if _modules_dir not in sys.path:
    sys.path.insert(0, _modules_dir)

In [2]:
#basic imports
import sys, os, copy
import numpy as np
import datetime as dt
import pandas as pd
#import libraries for file checking
from os.path import isfile, join, isdir

#importing json (data are mostly in JSON)
import json

In [3]:
#locations of the input datasets (relative to the notebook folder)
FileAddress_movies, FileAddress_credits = (
    "../../Datasets/tmdb_5000_movies.csv",
    "../../Datasets/tmdb_5000_credits.csv",
)

In [41]:
def Load_Datasets(FileAddress_movies,FileAddress_credits):
    ##input:  locations of the movies CSV file and of the credits CSV file
    ##output: tuple (movies dataframe, credits dataframe)
    ##        movies  - every column of the movies file; columns holding JSON lists
    ##                  are replaced by comma separated strings of their 'name' values
    ##        credits - columns 'title', 'actors' and 'actor_gender'; the last two are
    ##                  comma separated strings built from the 'cast' column
    ##raises: AssertionError if either of the files does not exist
    ##note:   values are joined with ',' without any escaping, so a value that itself
    ##        contains a comma cannot be recovered by splitting the string

    def InputFilesFound(FileAddress):
        ##input:  file location
        ##output: True if a regular file exists there, False otherwise
        #isfile() is already False for directories, no separate isdir() check is needed
        return isfile(FileAddress)

    def ParseJSONcell(cell):
        ##input:  one raw cell of a JSON column
        ##output: parsed JSON value; a missing cell (NaN/None) gives an empty list
        if not isinstance(cell, str) and pd.isna(cell):
            return []
        return json.loads(cell)

    def IdentifyJSONcolumns(dataframe):
        ##input:  dataframe
        ##output: list of columns containing JSON lists
        JSONcolumns = []
        #iterating through columns to find those with JSON
        for column in dataframe.columns:
            #the first non-missing cell decides, so an empty first row does not hide a JSON column
            filled = dataframe[column].dropna()
            if filled.empty:
                continue
            sample = filled.iloc[0]
            #only text can hold serialized JSON
            if not isinstance(sample, str):
                continue
            try:
                parsed = json.loads(sample)
            except ValueError:
                #json.JSONDecodeError is a ValueError: plain text column
                continue
            #the decapsulation below iterates over entries, so only JSON lists qualify
            if isinstance(parsed, list):
                JSONcolumns.append(column)
        #returning list of columns in which JSON format was found
        return JSONcolumns

    def Transform_LoadJSON(dataframe):
        ##input:  dataframe (modified in place)
        ##output: dataframe with JSON columns parsed, list of those columns
        JSONcolumns = IdentifyJSONcolumns(dataframe)
        #iterating through JSON columns and loading json
        for column in JSONcolumns:
            dataframe[column] = dataframe[column].apply(ParseJSONcell)
        return dataframe,JSONcolumns

    def JSONtoKeyList(JSONentry,key):
        ##input:  parsed JSON list of dictionaries, key to read from each dictionary
        ##output: values stored under the key joined by commas ('' for an empty list)
        #str() makes numeric values (e.g. gender codes) joinable the same way as names
        return ','.join(str(InnerEntry[key]) for InnerEntry in JSONentry)

    def JSONtoNameList(JSONentry):
        ##input:  entry (one line) from one of the JSON columns
        ##output: string of 'name' values separated by commas
        return JSONtoKeyList(JSONentry,'name')

    def JSONtoGenderList(JSONentry):
        ##input:  entry (one line) from one of the JSON columns
        ##output: string of 'gender' values separated by commas
        return JSONtoKeyList(JSONentry,'gender')

    def Transform_JSONcolumnsDecapsulation(dataframe):
        ##input:  dataframe (modified in place)
        ##output: dataframe with JSON columns decapsulated to comma separated names
        #reading JSON format and columns
        dataframe,JSONcolumns = Transform_LoadJSON(dataframe)
        #transforming JSON columns to text columns
        for column in JSONcolumns:
            dataframe[column] = dataframe[column].apply(JSONtoNameList)
        return dataframe

    #Loading movies from file to dataframe
    def Load_movies(FileAddress_movies):
        ##input:  movie dataset location
        ##output: pandas dataframe containing information about movies
        #reading raw dataset
        df_movies = pd.read_csv(FileAddress_movies)
        #decapsulating json, replacing JSON lists by comma separated names
        return Transform_JSONcolumnsDecapsulation(df_movies)

    #Loading credits from file to dataframe
    def Load_credits(FileAddress_credits):
        ##input:  credits dataset location
        ##output: pandas dataframe with columns 'title', 'actors', 'actor_gender'
        df_credits = pd.read_csv(FileAddress_credits)
        #only the 'cast' column is used, the other JSON columns are left unparsed
        cast = df_credits['cast'].apply(ParseJSONcell)
        return pd.DataFrame({
            'title': df_credits['title'],
            'actors': cast.apply(JSONtoNameList),
            'actor_gender': cast.apply(JSONtoGenderList),
        })

    #assuring that both datasets exist
    assert InputFilesFound(FileAddress_movies),  "Movies  input file not found"
    assert InputFilesFound(FileAddress_credits), "Credits input file not found"

    Credentials = Load_credits(FileAddress_credits)
    Final_dataset = Load_movies(FileAddress_movies)

    #returning final dataset
    return Final_dataset,Credentials

In [42]:
#Loading information about movies
Movies,Credentials  = Load_Datasets(FileAddress_movies,FileAddress_credits)

#columns removed before the export
Drops = ['homepage','status','id']
#drop() accepts the axis only as a keyword since pandas 2.0, so the columns are named explicitly
Movies = Movies.drop(columns=Drops)

#saving the transformed movies table
Movies.to_csv("../../Datasets/Transformed.csv")
#list-like columns are stored as comma separated strings, read them back using string.split(",")

In [43]:
#preview of the credits table (first rows only instead of the whole dataframe)
Credentials.head(10)

Unnamed: 0,title,actors,actor_gender
0,Avatar,"Sam Worthington,Zoe Saldana,Sigourney Weaver,S...","2,1,1,2,1,2,2,1,2,2,2,2,2,2,2,2,0,0,2,0,2,0,0,..."
1,Pirates of the Caribbean: At World's End,"Johnny Depp,Orlando Bloom,Keira Knightley,Stel...","2,2,1,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,0,1,1,2,0,..."
2,Spectre,"Daniel Craig,Christoph Waltz,Léa Seydoux,Ralph...","2,2,1,2,1,2,1,2,2,2,2,0,1,0,1,2,0,2,0,2,2,2,2,..."
3,The Dark Knight Rises,"Christian Bale,Michael Caine,Gary Oldman,Anne ...","2,2,2,1,2,1,2,2,2,1,2,2,2,2,2,0,0,1,2,2,0,0,0,..."
4,John Carter,"Taylor Kitsch,Lynn Collins,Samantha Morton,Wil...","2,1,1,2,2,2,2,2,2,2,1,2,2,2,1,2,0,1,2,2,2,2,2,..."
5,Spider-Man 3,"Tobey Maguire,Kirsten Dunst,James Franco,Thoma...","2,1,2,2,2,1,1,2,2,1,2,2,1,2,1,2,0,2,1,2,2,2,2,..."
6,Tangled,"Zachary Levi,Mandy Moore,Donna Murphy,Ron Perl...","2,1,1,2,2,2,2,0,0,0,2,2,2"
7,Avengers: Age of Ultron,"Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo...","2,2,2,2,1,2,2,2,2,2,1,2,1,2,1,2,1,2,1,2,2,1,2,..."
8,Harry Potter and the Half-Blood Prince,"Daniel Radcliffe,Rupert Grint,Emma Watson,Tom ...","2,2,1,2,2,2,1,2,1,2,2,2,1,2,1,1,2,2,2,2,1,1,2,..."
9,Batman v Superman: Dawn of Justice,"Ben Affleck,Henry Cavill,Gal Gadot,Amy Adams,J...","2,2,1,1,2,1,0,2,1,2,2,1,1,2,1,2,2,2,0,0,2,0,1,..."
