In [149]:
# imports
import pandas as pd
import  numpy as np
import ast
import json
import datetime
from sklearn.preprocessing import MultiLabelBinarizer
import ast

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.2f}'.format

# read files and fix dates
train = pd.read_csv('C:/Users/or.kornboim/Documents/GitHub/Kaggle_TMDB_Box_Office_Prediction/Data/train.csv',
                    parse_dates=['release_date'],
                   index_col='id')
test = pd.read_csv('C:/Users/or.kornboim/Documents/GitHub/Kaggle_TMDB_Box_Office_Prediction/Data/test.csv',
                    parse_dates=['release_date'],
                   index_col='id')

# Date Fix
train['release_date'] = train['release_date'].mask(train['release_date'].dt.year > 2017, 
                                                   train['release_date'] - pd.offsets.DateOffset(years=100))
test['release_date'] = test['release_date'].mask(test['release_date'].dt.year > 2017, 
                                                   test['release_date'] - pd.offsets.DateOffset(years=100))
# dict columns
#dict_columns = ['belongs_to_collection', 'genres', 'production_companies', 'cast']

mlb = MultiLabelBinarizer()

df = test.copy()

In [150]:
def convert_string_to_list(strVal):
    if type(strVal) is not str:
        return  []
    else:
        return ast.literal_eval(strVal)
    
def format_dict_column_and_extract_names(strVal, col="name"):
    listOfItems = convert_string_to_list(strVal)
    
    return list(map(lambda x: x[col], listOfItems))

def add_x_length_column(df, col='cast'):
    castNames = df[col].apply(format_dict_column_and_extract_names)
    df[col + '_size'] = castNames.apply(lambda x: len(x))
    
    return df

def extract_genres(df):
    df['genres'] = df['genres'].apply(format_dict_column_and_extract_names)
    df = add_x_length_column(df, col='genres')
    
    return df.join(pd.DataFrame(mlb.fit_transform(df.pop('genres')),
                                columns=list(map(lambda x: 'genre_'+x, mlb.classes_)),index=df.index))

def extract_common(df, col='production_companies', limit=30):
    df[col] = df[col].apply(format_dict_column_and_extract_names)
    companiesCount = df[col].apply(pd.Series).stack().value_counts()
    companiesToKeep = companiesCount[companiesCount > limit].keys()
    
    df = add_x_length_column(df, col)
    df[col] = df[col].apply(lambda x: list(filter(lambda i: i in companiesToKeep, x)))
    
    
    return df.join(pd.DataFrame(mlb.fit_transform(df.pop(col)),
                          columns=list(map(lambda x: col+'_'+x,mlb.classes_)),
                          index=df.index))

def last_year_metrics(df, agg_col='release_date_Year', calc_col=['revenue','imdb_id'], leg=1):
    temp = df.groupby([agg_col]).agg({calc_col[0]: 'mean', calc_col[1]: 'count'}).shift(leg).reset_index()
    temp = temp.rename(index=str, columns={'revenue':'last_period_revenue',
                                           'imdb_id': 'last_period_movie_count'})
    
    return pd.merge(df, temp, on=['release_date_Year'], how='left')

def days_diff_from_last_movie(df, col='release_date'):#see if there is a need in buckts
    df = df.sort_values(by=[col])
    df['days_diff_from_last_movie'] = df[col].diff().fillna(0)
    df['days_diff_from_last_movie'] = df['days_diff_from_last_movie'].apply(lambda x: x.days)

    return df

def ratios(df):
    df['budget_to_runtime'] = df['budget']/df['runtime']
    df['budget_to_cast_crew_size'] = df['budget']/(df['cast_size'] + df['crew_size'])
    
    return df

def add_datepart(df= df,col ='release_date'):
    df[col] = pd.to_datetime(df[col])
    df[col +'_Year']  = df[col].dt.year
    df[col +'_Month'] = df[col].dt.month
    df[col +'_day_of_month'] = df[col].dt.day
    df[col +'_week_day'] = df[col].dt.dayofweek
    df[col +'_is_quarter_start'] = df[col].dt.is_quarter_start
    
    return df    

df = extract_genres(train)
df = extract_common(df, col='production_companies', limit=30)
df = extract_common(df, col='production_countries', limit=20)
df = extract_common(df, col='spoken_languages', limit=10)
df = add_x_length_column(df, col='cast')
df = add_x_length_column(df, col='crew')
df = add_datepart(df= df,col ='release_date')
df = last_year_metrics(df)
df = days_diff_from_last_movie(df)
df = ratios(df)


df['belongs_to_collection_bool'] = np.where(df['belongs_to_collection'].isna(), 0, 1)
df.drop(['belongs_to_collection'], axis=1, inplace=True)

# counting the numbers of words in the movie title 
df['original_title_words'] = df.original_title.str.split()
df['original_title_words'] = df['original_title_words'].apply(lambda x: len(x)) 


df.head(3)




Unnamed: 0,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,release_date,runtime,status,tagline,title,Keywords,cast,crew,revenue,genres_size,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_Foreign,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western,production_companies_size,production_companies_Canal+,production_companies_Columbia Pictures,production_companies_Columbia Pictures Corporation,production_companies_Metro-Goldwyn-Mayer (MGM),production_companies_Miramax Films,production_companies_New Line Cinema,production_companies_Paramount Pictures,production_companies_Regency Enterprises,production_companies_Relativity Media,production_companies_Touchstone Pictures,production_companies_TriStar Pictures,production_companies_Twentieth Century Fox Film Corporation,production_companies_United Artists,production_companies_Universal Pictures,production_companies_Village Roadshow Pictures,production_companies_Walt Disney Pictures,production_companies_Warner Bros.,production_countries_size,production_countries_Australia,production_countries_Belgium,production_countries_Canada,production_countries_China,production_countries_France,production_countries_Germany,production_countries_Hong Kong,production_countries_India,production_countries_Ireland,production_countries_Italy,production_countries_Japan,production_countries_Russia,production_countries_South Korea,production_countries_Spain,production_countries_United Kingdom,production_countries_United States of America,spoken_languages_size,spoken_languages_,spoken_languages_Dansk,spoken_languages_Deutsch,spoken_languages_English,spoken_languages_Español,spoken_languages_Français,spoken_languages_Italiano,spoken_languages_Latin,spoken_languages_Magyar,spoken_languages_Polski,spoken_languages_Português,spoken_languages_Pусский,spoken_languages_Türkçe,spoken_languages_svenska,spoken_languages_Český,spoken_languages_ελληνικά,spoken_languages_עִבְרִית,spoken_languages_العربية,spoken_languages_हिन्दी,spoken_languages_தமிழ்,spoken_languages_ภาษาไทย,spoken_languages_广州话 / 廣州話,spoken_languages_日本語,spoken_languages_普通话,spoken_languages_한국어/조선말,cast_size,crew_size,release_date_Year,release_date_Month,release_date_day_of_month,release_date_week_day,release_date_is_quarter_start,last_period_revenue,last_period_movie_count,days_diff_from_last_movie,budget_to_runtime,budget_to_cast_crew_size,belongs_to_collection_bool,original_title_words
1763,250000,,tt0012349,en,The Kid,Considered one of Charlie Chaplin's best films...,8.17,/drgMcyTsySQBnUPGaBThCHGdlWT.jpg,1921-01-21,68.0,Released,6 reels of Joy.,The Kid,"[{'id': 290, 'name': 'angel'}, {'id': 1252, 'n...","[{'cast_id': 10, 'character': 'A Tramp', 'cred...","[{'credit_id': '52fe43269251416c75005605', 'de...",2500000,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35,12,1921,1,21,4,False,,,0,3676.47,5319.15,0,2
2992,1135654,,tt0015400,en,The Thief of Bagdad,A recalcitrant thief vies with a duplicitous M...,3.88,/a6IzXkwZRDimfn8HATzP6Pi6Ois.jpg,1924-03-18,149.0,Released,"""Happiness Must Be Earned""",The Thief of Bagdad,"[{'id': 255, 'name': 'male nudity'}, {'id': 14...","[{'cast_id': 3, 'character': 'The Thief of Bag...","[{'credit_id': '52fe45bec3a368484e06c70b', 'de...",1213880,0,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21,28,1924,3,18,1,False,2500000.0,1.0,1152,7621.84,23176.61,0,4
1917,592,,tt0016104,en,The Merry Widow,Prince Danilo falls in love with dancer Sally ...,0.29,/dCVkB0POblxtn3BegTNcwTPMKUP.jpg,1925-08-26,137.0,Released,,The Merry Widow,"[{'id': 1691, 'name': 'dance'}, {'id': 10181, ...","[{'cast_id': 2, 'character': ""Sally O'Hara"", '...","[{'credit_id': '57351170c3a36802410000d5', 'de...",1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,17,1925,8,26,2,False,1213880.0,1.0,526,4.32,23.68,0,3


In [116]:
import gensim
import nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem import Cistem
np.random.seed(42)

def lemmatize_stemming(text):
    stemmer = Cistem()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

df['overview'] = df['overview'].fillna('').astype(str).map(preprocess)
# df['overview'] = df['overview'].map(lambda big_list : [(x,big_list.count(x)) for x in  big_list])
df['overview'] = df['overview'].map(lambda x: list(set(x)))
list_of words = df['overview'].apply(pd.Series).stack().value_counts().sort_values(axis=0, ascending=False)   

In [158]:
df['overview'] = df['overview'].map(lambda x: (list(x)))
companiesCount = df['overview'].apply(pd.Series).stack().value_counts()