In [1]:
import pandas as pd
import re
import os
import sys
import numpy as np

In [2]:
dataset = pd.read_csv('./tmdb_movies_data/tmdb_movies_data.csv')

In [3]:
dataset.head()

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,137999939.3,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,137999939.3,348161300.0
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,101199955.5,271619000.0
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,...,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015,183999919.0,1902723000.0
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,...,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015,174799923.1,1385749000.0


In [4]:
movies_selected = dataset[['id', 'cast', 'director', 'budget', 'genres', 'overview', 'popularity', 'production_companies', 'release_year', 'revenue', 'runtime', 'vote_average', 'vote_count']]

In [5]:
movies_selected.head()

Unnamed: 0,id,cast,director,budget,genres,overview,popularity,production_companies,release_year,revenue,runtime,vote_average,vote_count
0,135397,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,150000000,Action|Adventure|Science Fiction|Thriller,Twenty-two years after the events of Jurassic ...,32.985763,Universal Studios|Amblin Entertainment|Legenda...,2015,1513528810,124,6.5,5562
1,76341,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,150000000,Action|Adventure|Science Fiction|Thriller,An apocalyptic story set in the furthest reach...,28.419936,Village Roadshow Pictures|Kennedy Miller Produ...,2015,378436354,120,7.1,6185
2,262500,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,110000000,Adventure|Science Fiction|Thriller,Beatrice Prior must confront her inner demons ...,13.112507,Summit Entertainment|Mandeville Films|Red Wago...,2015,295238201,119,6.3,2480
3,140607,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,200000000,Action|Adventure|Science Fiction|Fantasy,Thirty years after defeating the Galactic Empi...,11.173104,Lucasfilm|Truenorth Productions|Bad Robot,2015,2068178225,136,7.5,5292
4,168259,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,190000000,Action|Crime|Thriller,Deckard Shaw seeks revenge against Dominic Tor...,9.335014,Universal Pictures|Original Film|Media Rights ...,2015,1506249360,137,7.3,2947


In [6]:
all_cast = dict()
all_directors = dict()
all_companies = dict()
all_genres = set()

In [7]:
def prep_cast(data, cast_member_number):
    cast_members = data.split('|')
    if len(cast_members) >= cast_member_number:
        cast_members = cast_members[:cast_member_number]
    for member in cast_members:
        if member in all_cast:
            all_cast[member] += 1
        else:
            all_cast[member] = 1
    return cast_members
def prep_directors(data):
    if data in all_directors:
        all_directors[data] += 1
    else:
        all_directors[data] = 1
    return data
def prep_genres(data):
    genres = data.split('|')
    for genre in genres:
        all_genres.add(genre)
    return genres
def prep_companies(data):
    companies = data.split('|')
    for company in companies:
        if company in all_companies:
            all_companies[company] += 1
        else:
            all_companies[company] = 1
    return companies

In [8]:
def remove_empty_rows(df):
    df = df.dropna()
    df = df[df['cast'] != '[]']
    df = df[df['genres'] != '[]']
    df = df[df['production_companies'] != '[]']
    df = df[df['budget'] != 0]
    df = df[df['runtime'] != 0]
    df = df[df['vote_average'] != 0]
    return df

In [9]:
def preprocess_data_first_stage(df, cast_member_number):
    df = remove_empty_rows(df)
    df['cast'] = df['cast'].apply(lambda x: prep_cast(x, cast_member_number))
    df['director'] = df['director'].apply(lambda x: prep_directors(x))
    df['genres'] = df['genres'].apply(lambda x: prep_genres(x))
    df['production_companies'] = df['production_companies'].apply(lambda x: prep_companies(x))
    return df

In [10]:
first_preprocessing = preprocess_data_first_stage(movies_selected, 5)

In [11]:
first_preprocessing = first_preprocessing[['cast', 'director', 'budget', 'genres', 'production_companies', 'runtime', 'release_year', 'vote_average']]

In [12]:
first_preprocessing.head()

Unnamed: 0,cast,director,budget,genres,production_companies,runtime,release_year,vote_average
0,"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...",Colin Trevorrow,150000000,"[Action, Adventure, Science Fiction, Thriller]","[Universal Studios, Amblin Entertainment, Lege...",124,2015,6.5
1,"[Tom Hardy, Charlize Theron, Hugh Keays-Byrne,...",George Miller,150000000,"[Action, Adventure, Science Fiction, Thriller]","[Village Roadshow Pictures, Kennedy Miller Pro...",120,2015,7.1
2,"[Shailene Woodley, Theo James, Kate Winslet, A...",Robert Schwentke,110000000,"[Adventure, Science Fiction, Thriller]","[Summit Entertainment, Mandeville Films, Red W...",119,2015,6.3
3,"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...",J.J. Abrams,200000000,"[Action, Adventure, Science Fiction, Fantasy]","[Lucasfilm, Truenorth Productions, Bad Robot]",136,2015,7.5
4,"[Vin Diesel, Paul Walker, Jason Statham, Miche...",James Wan,190000000,"[Action, Crime, Thriller]","[Universal Pictures, Original Film, Media Righ...",137,2015,7.3


In [13]:
len(all_cast), len(all_directors), len(all_companies), len(all_genres)

(9171, 2298, 4486, 20)

In [14]:
all_cast_dict = [key for key, value in sorted(all_cast.items(), key=lambda kv: kv[1], reverse=True)[:500]]
all_directors_dict = [key for key, value in sorted(all_directors.items(), key=lambda kv: kv[1], reverse=True)[:500]]
all_companies_dict = [key for key, value in sorted(all_companies.items(), key=lambda kv: kv[1], reverse=True)[:500]]

In [15]:
def create_one_hot_encoding_dict(data):
    dictionary = {data[i] : i for i in range(0, len(data))}
    return dictionary

In [16]:
cast_encoder = create_one_hot_encoding_dict(all_cast_dict)
directors_encoder = create_one_hot_encoding_dict(all_directors_dict)
companies_encoder = create_one_hot_encoding_dict(all_companies_dict)
genres_encoder = create_one_hot_encoding_dict(list(all_genres))

In [103]:
def encode_data(data, cast_encoder, directors_encoder, companies_encoder, genres_encoder):
    preprocessed_data = np.zeros(shape=(data.shape[0],len(cast_encoder)+len(directors_encoder)+len(companies_encoder)+len(genres_encoder)+4))
    for index, row in data.reset_index(drop=True).iterrows():
        cast = np.zeros((len(cast_encoder)))
        directors = np.zeros((len(directors_encoder)))
        companies = np.zeros((len(companies_encoder)))
        genres = np.zeros((len(genres_encoder)))
        for item in row['cast']:
            if item in cast_encoder:
                cast[cast_encoder.get(item)] = 1
        if row['director'] in directors_encoder:
            directors[directors_encoder.get(row['director'])] = 1
        for item in row['production_companies']:
            if item in companies_encoder:
                companies[companies_encoder.get(item)] = 1
        for item in row['genres']:
            genres[genres_encoder.get(item)] = 1
        budget = np.array(row['budget'])
        runtime = np.array(row['runtime'])
        year = np.array(row['release_year'])
        vote_average = np.array(row['vote_average'])
        preprocessed_data[index] = np.hstack((cast, directors, companies, genres, budget, runtime, year, vote_average))
    return preprocessed_data

In [104]:
preprocessed_data = encode_data(first_preprocessing, cast_encoder, directors_encoder, companies_encoder, genres_encoder)

In [105]:
preprocessed_data, preprocessed_data.shape

(array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 1.240e+02, 2.015e+03,
         6.500e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.200e+02, 2.015e+03,
         7.100e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.190e+02, 2.015e+03,
         6.300e+00],
        ...,
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.000e+02, 1.966e+03,
         6.700e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 9.000e+01, 1.966e+03,
         6.100e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 7.400e+01, 1.966e+03,
         1.500e+00]]), (5021, 1524))

In [106]:
np.save('preprocessed_data_new.npy', preprocessed_data)

In [109]:
np.unique(preprocessed_data[0], return_counts=True)

(array([0.000e+00, 1.000e+00, 6.500e+00, 1.240e+02, 2.015e+03, 1.500e+08]),
 array([1511,    9,    1,    1,    1,    1], dtype=int64))

In [18]:
len(cast_encoder), len(directors_encoder), len(companies_encoder), len(genres_encoder)

(500, 500, 500, 20)

In [22]:
for feature in [1521, 1515, 1502, 1516,  311, 1514,  354,  445, 1508, 1051]:
    if 0 <= feature <= 499:
        print(list(cast_encoder.keys())[list(cast_encoder.values()).index(feature)])
    elif 500 <= feature <= 999:
        print(list(directors_encoder.keys())[list(directors_encoder.values()).index(feature-500)])
    elif 1000 <= feature <= 1499:
        print(list(companies_encoder.keys())[list(companies_encoder.values()).index(feature-1000)])
    elif 1500 <= feature <= 1519:
        print(list(genres_encoder.keys())[list(genres_encoder.values()).index(feature-1500)])
    elif feature == 1520:
        print('Budget')
    elif feature == 1521:
        print('Runtime')
    elif feature == 1522:
        print('Year')

Runtime
Drama
Science Fiction
Adventure
Christopher Lloyd
Horror
Charlie Sheen
Dan Hedaya
Documentary
DC Comics
