<a href="https://colab.research.google.com/github/Idan707/Kaggle_TMDB_Box_Office_Prediction/blob/master/Kaggle_TMDB_Box_Office_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# # Clone the entire repo (run this first on Colab: the data-loading cell reads from /content/cloned-repo).
# !git clone -l -s https://github.com/Idan707/Kaggle_TMDB_Box_Office_Prediction.git cloned-repo
# %cd cloned-repo
# !ls

In [0]:
# imports
import pandas as pd
import  numpy as np
import ast
import json
import datetime
import ast
import nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.preprocessing import MultiLabelBinarizer

# Notebook display configuration: show wide/long frames in full and print
# floats with two decimals.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
pd.set_option('display.float_format', '{:.2f}'.format)

# Raw competition data, indexed by movie id, with release_date parsed as a date.
train = pd.read_csv(
    '/content/cloned-repo/Data/train.csv',
    index_col='id',
    parse_dates=['release_date'],
)
test = pd.read_csv(
    '/content/cloned-repo/Data/test.csv',
    index_col='id',
    parse_dates=['release_date'],
)

# Date Fix: any release date parsed with a year after 2017 is moved back by
# 100 years (presumably two-digit years resolved to the wrong century --
# TODO confirm against the raw CSV).
for movies in (train, test):
    release = movies['release_date']
    movies['release_date'] = release.mask(
        release.dt.year > 2017,
        release - pd.offsets.DateOffset(years=100),
    )

# Shared binarizer used (and re-fitted) by the feature-extraction helpers.
mlb = MultiLabelBinarizer()

In [0]:
def convert_string_to_list(strVal):
    """Evaluate a stringified Python literal and return the resulting object.

    Any input whose exact type is not ``str`` (for example a float NaN)
    produces an empty list instead.
    """
    if type(strVal) is str:
        return ast.literal_eval(strVal)
    return []
    
def format_dict_column_and_extract_names(strVal, col="name"):
    """Parse a stringified list of mappings and collect one field from each.

    ``strVal`` is parsed with ``convert_string_to_list``; the value stored
    under key ``col`` of every parsed item is returned, in order, as a list.
    """
    return [entry[col] for entry in convert_string_to_list(strVal)]

def add_x_length_column(df, col):
    """Add ``<col>_size`` holding ``len()`` of each value in ``df[col]``.

    The column is written onto ``df`` itself, and that same frame is returned.
    """
    size_column = col + '_size'
    df[size_column] = df[col].apply(len)
    return df

def extract_genres(df):
    """Replace the raw ``genres`` column with a size column and indicator columns.

    ``genres`` is parsed into lists of names, ``genres_size`` is added, and
    the module-level ``mlb`` binarizer is fitted on the lists to produce one
    ``genre_<name>`` column per class. The ``genres`` column itself is removed
    from ``df`` in place; the joined frame is returned.
    """
    df['genres'] = df['genres'].apply(format_dict_column_and_extract_names)
    df = add_x_length_column(df, col='genres')

    # pop() takes the list column out of df before the indicators are joined.
    indicator_matrix = mlb.fit_transform(df.pop('genres'))
    indicator_names = ['genre_' + label for label in mlb.classes_]
    indicators = pd.DataFrame(indicator_matrix, columns=indicator_names, index=df.index)

    return df.join(indicators)

def extract_common(df, col, limit, apply_dict_to_col=True):
    """One-hot encode the ``limit`` most frequent values of a list-valued column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col``. It is modified in place: ``<col>_size`` is
        added and ``col`` is removed.
    col : str
        Column whose cells are lists of values (or, when
        ``apply_dict_to_col`` is true, stringified lists of dicts from which
        the ``name`` field is extracted first).
    limit : int
        Number of most frequent values that get an indicator column.
    apply_dict_to_col : bool, default True
        Whether to parse the column with
        ``format_dict_column_and_extract_names`` before counting.

    Returns
    -------
    pandas.DataFrame
        ``df`` joined with one ``<col>_<value>`` indicator column per kept
        value. ``<col>_size`` counts all values of a row, not only kept ones.

    Notes
    -----
    Uses the module-level ``mlb`` binarizer and re-fits it on every call.
    """
    if apply_dict_to_col:
        df[col] = df[col].apply(format_dict_column_and_extract_names)

    # explode() flattens the lists into one long Series (empty lists become
    # NaN, which value_counts() drops). This replaces apply(pd.Series).stack(),
    # which built a Series object per row. value_counts() already orders by
    # frequency; the explicit sort is retained so the selection at the
    # `limit` boundary is unchanged.
    value_counts = df[col].explode().value_counts().sort_values(ascending=False)
    values_to_keep = set(value_counts.index[:limit])

    # The size column is computed before filtering, so it reflects every value.
    df = add_x_length_column(df, col)
    df[col] = df[col].apply(lambda items: [item for item in items if item in values_to_keep])

    # pop() removes the list column from df before the indicators are joined.
    indicator_matrix = mlb.fit_transform(df.pop(col))
    indicator_names = [col + '_' + label for label in mlb.classes_]
    indicators = pd.DataFrame(indicator_matrix, columns=indicator_names, index=df.index)

    return df.join(indicators)

def last_year_metrics(df, agg_col='release_date_Year', calc_col=('revenue','imdb_id','popularity','budget'), leg=1):
    """Attach aggregates of the preceding period to every row.

    Rows are grouped by ``agg_col``; per-group statistics are computed and
    then shifted by ``leg`` groups, so each row receives the statistics of
    the group ``leg`` positions earlier in sorted key order (the previous
    period *present in the data*, which is not necessarily the previous
    calendar value).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``agg_col`` and the four columns named in ``calc_col``.
    agg_col : str
        Grouping column; also the key used to merge the statistics back.
    calc_col : sequence of four column names
        ``[revenue, counted column, popularity, budget]`` in that order.
    leg : int
        Number of groups to shift by.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged with the ``last_period_*`` columns. Groups without
        an earlier period get NaN. Because the merge is on a column, the
        result has a fresh 0..n-1 index rather than the index of ``df``.
    """
    revenue_col, count_col = calc_col[0], calc_col[1]
    popularity_col, budget_col = calc_col[2], calc_col[3]

    temp = (
        df.groupby([agg_col])
        .agg(
            last_period_mean_revenue=(revenue_col, 'mean'),
            last_period_median_revenue=(revenue_col, 'median'),
            last_period_movies_count=(count_col, 'count'),
            last_period_mean_popularity=(popularity_col, 'mean'),
            last_period_median_popularity=(popularity_col, 'median'),
            last_period_mean_budget=(budget_col, 'mean'),
            last_period_median_budget=(budget_col, 'median'),
        )
        .shift(leg)
        .reset_index()
    )

    # Merge on the same key that was grouped on (previously hard-coded to
    # 'release_date_Year', which broke any other agg_col).
    return pd.merge(df, temp, on=[agg_col], how='left')

def days_diff_from_last_movie(df, col='release_date'):
    """Add the time elapsed since the previous release as a timedelta column.

    The frame is sorted by ``col`` (``sort_values`` returns a new frame, so
    the caller's frame is left untouched) and ``days_diff_from_last_movie``
    is set to the difference between each row's ``col`` and the row before.
    The earliest row has no predecessor and gets a zero-length timedelta.

    TODO (from the original note): see whether these gaps should be bucketed.
    """
    df = df.sort_values(by=[col])
    # Fill with a Timedelta rather than the integer 0: filling a timedelta
    # column with an int is deprecated in pandas and would not keep the
    # column a clean timedelta dtype.
    df['days_diff_from_last_movie'] = df[col].diff().fillna(pd.Timedelta(0))

    return df

def ratios(df):
    """Add four budget-ratio columns to ``df`` in place and return it.

    Adds ``budget_to_runtime``, ``budget_to_cast_crew_size``,
    ``budget_to_popularity`` and ``budget_to_year``. Denominators are used
    as-is; there is no guard against zero or missing values.
    """
    budget = df['budget']
    headcount = df['cast_size'] + df['crew_size']

    df['budget_to_runtime'] = budget / df['runtime']
    df['budget_to_cast_crew_size'] = budget / headcount
    df['budget_to_popularity'] = budget / df['popularity']
    df['budget_to_year'] = budget / df['release_date_Year']

    return df

def add_datepart(df,col ='release_date'):
    """Expand a date column into calendar-part columns, in place.

    ``df[col]`` is converted with ``pd.to_datetime`` and five columns are
    added, each named ``col`` plus a suffix: ``_Year``, ``_Month``,
    ``_day_of_month``, ``_week_day`` and ``_is_quarter_start``. Returns ``df``.
    """
    df[col] = pd.to_datetime(df[col])
    parts = df[col].dt

    # (column suffix, datetime accessor attribute), in output column order
    suffix_to_attribute = (
        ('_Year', 'year'),
        ('_Month', 'month'),
        ('_day_of_month', 'day'),
        ('_week_day', 'dayofweek'),
        ('_is_quarter_start', 'is_quarter_start'),
    )
    for suffix, attribute in suffix_to_attribute:
        df[col + suffix] = getattr(parts, attribute)

    return df

def preprocess(text):
    """Tokenise ``text`` and keep only informative tokens.

    Tokens come from gensim's ``simple_preprocess``; a token is kept when it
    is not in gensim's ``STOPWORDS`` set and is longer than three characters.

    Returns
    -------
    list of str
        Kept tokens, in their original order (duplicates included).
    """
    # Use the names imported at the top of the notebook
    # (`from gensim.utils import simple_preprocess`, `... import STOPWORDS`).
    # The bare `gensim` module name is never imported, so the previous
    # `gensim.utils...` references raised NameError on a fresh kernel.
    return [
        token
        for token in simple_preprocess(text)
        if token not in STOPWORDS and len(token) > 3
    ]

def extract_common_overview_words(df, col):
    """Turn a free-text column into indicators for its 15 most common words.

    Missing text is treated as an empty string. Each text is tokenised with
    ``preprocess`` and de-duplicated, then handed to ``extract_common``
    (``limit=15``, no dict parsing), which adds ``<col>_size`` and the
    ``<col>_<word>`` columns and drops ``col``.
    """
    as_text = df[col].fillna('').astype(str)
    # Each word counts at most once per row.
    df[col] = as_text.map(lambda text: list(set(preprocess(text))))

    return extract_common(df, col, limit=15, apply_dict_to_col=False)


# --- Feature engineering on the training set ---------------------------------
# NOTE: the helpers write onto the frame they receive, so `train` itself is
# modified by the first steps.
df = extract_genres(train)

# List-valued metadata columns -> size column + indicators for the top-N values.
for feature_col, keep_top in (
    ('production_companies', 30),
    ('production_countries', 20),
    ('spoken_languages', 15),
    ('Keywords', 15),
    ('cast', 15),
    ('crew', 15),
):
    df = extract_common(df, col=feature_col, limit=keep_top)

# Calendar parts, previous-period aggregates, release gaps and budget ratios.
df = add_datepart(df, col='release_date')
df = last_year_metrics(df)
df = days_diff_from_last_movie(df)
df = ratios(df)
df = extract_common_overview_words(df, col='overview')

# Flag whether the movie belongs to a collection, then drop the raw column.
df['belongs_to_collection_bool'] = np.where(df['belongs_to_collection'].isna(), 0, 1)
df = df.drop(columns=['belongs_to_collection'])

# Number of whitespace-separated words in the original title.
df['original_title_words'] = df['original_title'].str.split().apply(len)

# Fill missing values: a budget of 0 is replaced by the previous period's
# median budget. NOTE: this runs after ratios(), so the budget_to_* columns
# were computed from the unfilled budget.
df['budget'] = np.where(df['budget'] == 0, df['last_period_median_budget'], df['budget'])
#df['revenue'] = np.where(df['budget'] == 0, df['last_period_median_budget'], df['budget'])

df.head(5)




Unnamed: 0,budget,homepage,imdb_id,original_language,original_title,popularity,poster_path,release_date,runtime,status,tagline,title,revenue,genres_size,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_Foreign,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western,production_companies_size,production_companies_Amblin Entertainment,production_companies_BBC Films,production_companies_Canal+,production_companies_Columbia Pictures,production_companies_Columbia Pictures Corporation,production_companies_Dimension Films,production_companies_DreamWorks SKG,production_companies_Dune Entertainment,production_companies_Fox 2000 Pictures,production_companies_Fox Searchlight Pictures,production_companies_Hollywood Pictures,production_companies_Lionsgate,production_companies_Metro-Goldwyn-Mayer (MGM),production_companies_Miramax Films,production_companies_New Line Cinema,production_companies_Orion Pictures,production_companies_Paramount Pictures,production_companies_Regency Enterprises,production_companies_Relativity Media,production_companies_StudioCanal,production_companies_Summit Entertainment,production_companies_Touchstone Pictures,production_companies_TriStar Pictures,production_companies_Twentieth Century Fox Film Corporation,production_companies_United Artists,production_companies_Universal Pictures,production_companies_Village Roadshow Pictures,production_companies_Walt Disney Pictures,production_companies_Warner Bros.,production_companies_Working Title Films,production_countries_size,production_countries_Australia,production_countries_Belgium,production_countries_Canada,production_countries_China,production_countries_France,production_countries_Germany,production_countries_Hong 
Kong,production_countries_India,production_countries_Ireland,production_countries_Italy,production_countries_Japan,production_countries_Mexico,production_countries_Netherlands,production_countries_New Zealand,production_countries_Russia,production_countries_South Korea,production_countries_Spain,production_countries_Sweden,production_countries_United Kingdom,production_countries_United States of America,spoken_languages_size,spoken_languages_,spoken_languages_Deutsch,spoken_languages_English,spoken_languages_Español,spoken_languages_Français,spoken_languages_Italiano,spoken_languages_Português,spoken_languages_Pусский,spoken_languages_العربية,spoken_languages_हिन्दी,spoken_languages_தமிழ்,spoken_languages_广州话 / 廣州話,spoken_languages_日本語,spoken_languages_普通话,spoken_languages_한국어/조선말,Keywords_size,Keywords_aftercreditsstinger,Keywords_based on novel,Keywords_biography,Keywords_duringcreditsstinger,Keywords_dystopia,Keywords_friendship,Keywords_independent film,Keywords_murder,Keywords_revenge,Keywords_sequel,Keywords_sex,Keywords_sport,Keywords_suspense,Keywords_violence,Keywords_woman director,cast_size,cast_Bill Murray,cast_Bruce McGill,cast_Bruce Willis,cast_Forest Whitaker,cast_J.K. Simmons,cast_John Turturro,cast_Keith David,cast_Liam Neeson,cast_Morgan Freeman,cast_Nicolas Cage,cast_Owen Wilson,cast_Robert De Niro,cast_Samuel L. 
Jackson,cast_Susan Sarandon,cast_Willem Dafoe,crew_size,crew_Avy Kaufman,crew_Bob Weinstein,crew_Deborah Aquila,crew_Francine Maisler,crew_Harvey Weinstein,crew_James Horner,crew_James Newton Howard,crew_Janet Hirshenson,crew_Jerry Goldsmith,crew_Kerry Barden,crew_Luc Besson,crew_Mary Vernieu,crew_Robert Rodriguez,crew_Steven Spielberg,crew_Tricia Wood,release_date_Year,release_date_Month,release_date_day_of_month,release_date_week_day,release_date_is_quarter_start,last_period_mean_revenue,last_period_median_revenue,last_period_movies_count,last_period_mean_popularity,last_period_median_popularity,last_period_mean_budget,last_period_median_budget,days_diff_from_last_movie,budget_to_runtime,budget_to_cast_crew_size,budget_to_popularity,budget_to_year,overview_size,overview_family,overview_father,overview_film,overview_finds,overview_friends,overview_help,overview_life,overview_love,overview_story,overview_time,overview_wife,overview_woman,overview_world,overview_years,overview_young,belongs_to_collection_bool,original_title_words
1763,250000.0,,tt0012349,en,The Kid,8.17,/drgMcyTsySQBnUPGaBThCHGdlWT.jpg,1921-01-21,68.0,Released,6 reels of Joy.,The Kid,2500000,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1921,1,21,4,False,,,,,,,,0 days,3676.47,5319.15,30605.54,130.14,28,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2
2992,1135654.0,,tt0015400,en,The Thief of Bagdad,3.88,/a6IzXkwZRDimfn8HATzP6Pi6Ois.jpg,1924-03-18,149.0,Released,"""Happiness Must Be Earned""",The Thief of Bagdad,1213880,5,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1924,3,18,1,False,2500000.0,2500000.0,1.0,8.17,8.17,250000.0,250000.0,1152 days,7621.84,23176.61,292806.4,590.26,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
1917,592.0,,tt0016104,en,The Merry Widow,0.29,/dCVkB0POblxtn3BegTNcwTPMKUP.jpg,1925-08-26,137.0,Released,,The Merry Widow,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1925,8,26,2,False,1213880.0,1213880.0,1.0,3.88,3.88,1135654.0,1135654.0,526 days,4.32,23.68,2064.74,0.31,27,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3
1361,1135654.0,,tt0015648,ru,Броненосец «Потёмкин»,12.91,/A5kk0FA4kS9sHLYzC6NI72OOhPc.jpg,1925-12-24,75.0,Released,"Revolution is the only lawful, equal, effectua...",Battleship Potemkin,45100,2,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1925,12,24,3,False,1213880.0,1213880.0,1.0,3.88,3.88,1135654.0,1135654.0,120 days,0.0,0.0,0.0,0.0,21,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2
2680,463455.0,,tt0017423,en,Sparrows,0.45,/3ZPfhlwZ9HIYViUCgoZusCszohT.jpg,1926-05-14,84.0,Released,,Sparrows,966878,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1926,5,14,4,False,22550.5,22550.5,2.0,6.6,6.6,296.0,296.0,141 days,5517.32,23172.75,1040242.32,240.63,39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# TODO
1. Add gender-based features for cast and crew members

In [0]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

In [0]:
# Download one title page and parse it. The context manager closes the HTTP
# response once the body has been read (it was previously left open).
with urlopen('http://www.imdb.com/title/tt0018455') as response:
    url = response.read()  # NOTE: despite its name, this holds the page body, not the address
soup = BeautifulSoup(url,"html.parser")

def get_budget(soup):
    """Return the first token of the page's "Budget:" entry, or ``''``.

    Finds the first ``<h4>`` element whose string is exactly ``"Budget:"``,
    takes the text of its parent element and returns the first
    whitespace-delimited token of that text.

    Parameters
    ----------
    soup : parsed document exposing BeautifulSoup's ``find_all``.

    Returns
    -------
    str
        The token, or an empty string when the heading is absent, has no
        parent, or the parent text is empty.
    """
    try:
        budget_heading = soup.find_all("h4", string="Budget:")[0]
        return budget_heading.parent.text.split()[0]
    except (AttributeError, IndexError):
        # Only "not found" style failures are treated as a missing budget;
        # a bare except would also swallow KeyboardInterrupt/SystemExit.
        return ''
    
# Show the value extracted from the page parsed above; as the cell's last
# expression, the returned string is rendered as the cell output.
get_budget(soup)

''