In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

This dataset contains 7398 movies and a variety of metadata obtained from The Movie Database (TMDB). Movies are labeled with id. Data points include cast, crew, plot keywords, budget, posters, release dates, languages, production companies, and countries.

The dataset has many NaN values, so this project includes data cleaning and feature engineering.
The goal is to predict, using regression techniques, the worldwide revenue of 4398 movies.

**Column description**

id: Integer unique id of each movie

belongs_to_collection: Contains the TMDB Id, Name, Movie Poster, and Backdrop URL of a movie in JSON format.

budget: Budget of a movie in dollars. Some rows contain 0, which means the budget is unknown.

genres: Contains all the Genres Name & TMDB Id in JSON Format.

homepage: Contains the official URL of a movie.

imdb_id: IMDB id of a movie (string).

original_language: Two-letter code of the original language in which the movie was made (e.g. `en`).

original_title: The original title of a movie in original_language.

overview: Brief description of the movie.

popularity: Popularity of the movie.

poster_path: Poster path of a movie. You can view the full poster image by appending this path to the following base URL → 

https://image.tmdb.org/t/p/original/

production_companies: All production company name and TMDB id in JSON format of a movie.

production_countries: Two-letter code and the full name of each production country in JSON format.

release_date: The release date of a movie in mm/dd/yy format.

runtime: Total runtime of a movie in minutes (Integer).

spoken_languages: Two-letter code and the full name of each spoken language.

status: Is the movie released or rumored?

tagline: Tagline of a movie

title: English title of a movie

Keywords: TMDB Id and name of all the keywords in JSON format.

cast: All cast TMDB id, name, character name, gender (1 = Female, 2 = Male) in JSON format

crew: Name, TMDB id, and profile path of crew members in various jobs, such as Director, Writer, Art, Sound, etc.

revenue: Total revenue earned by a movie in dollars.

**Data loading**

In [70]:
# Load the raw training set; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [71]:
# Work on an independent (deep) copy so `data` keeps the values as loaded.
df = data.copy(deep=True)

**Data inspecting**

In [85]:
# The 'revenue' column is our target column.
# info() lists each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     3000 non-null   int64  
 1   belongs_to_collection  604 non-null    object 
 2   budget                 3000 non-null   int64  
 3   genres                 2993 non-null   object 
 4   homepage               946 non-null    object 
 5   imdb_id                3000 non-null   object 
 6   original_language      3000 non-null   object 
 7   original_title         3000 non-null   object 
 8   overview               2992 non-null   object 
 9   popularity             3000 non-null   float64
 10  poster_path            2999 non-null   object 
 11  production_companies   2844 non-null   object 
 12  production_countries   2945 non-null   object 
 13  release_date           3000 non-null   object 
 14  runtime                2998 non-null   float64
 15  spok

In [73]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


In [74]:
# Summary statistics for every column (numeric and non-numeric alike),
# with the quartiles spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75], include='all')

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
count,3000.0,604,3000.0,2993,946,3000,3000,3000,2992,3000.0,...,3000,2998.0,2980,3000,2403,3000,2724,2987,2984,3000.0
unique,,422,,872,941,3000,36,2975,2992,,...,2398,,401,2,2400,2969,2648,2975,2984,
top,,"[{'id': 645, 'name': 'James Bond Collection', ...",,"[{'id': 18, 'name': 'Drama'}]",http://www.transformersmovie.com/,tt0099697,en,Priest,An airline pilot and his wife are forced to fa...,,...,9/10/15,,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Based on a true story.,Stolen,"[{'id': 10183, 'name': 'independent film'}]",[],"[{'credit_id': '5635ec3092514129fe00c2f5', 'de...",
freq,,16,,266,4,1,2575,2,1,,...,5,,1817,2996,3,2,27,13,1,
mean,1500.5,,22531330.0,,,,,,,8.463274,...,,107.856571,,,,,,,,66725850.0
std,866.169729,,37026090.0,,,,,,,12.104,...,,22.086434,,,,,,,,137532300.0
min,1.0,,0.0,,,,,,,1e-06,...,,0.0,,,,,,,,1.0
25%,750.75,,0.0,,,,,,,4.018053,...,,94.0,,,,,,,,2379808.0
50%,1500.5,,8000000.0,,,,,,,7.374861,...,,104.0,,,,,,,,16807070.0
75%,2250.25,,29000000.0,,,,,,,10.890983,...,,118.0,,,,,,,,68919200.0


**Missing data**

In [75]:
# Count missing values per column, most incomplete columns first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

belongs_to_collection    2396
homepage                 2054
tagline                   597
Keywords                  276
production_companies      156
production_countries       55
spoken_languages           20
crew                       16
cast                       13
overview                    8
genres                      7
runtime                     2
poster_path                 1
original_language           0
budget                      0
imdb_id                     0
revenue                     0
original_title              0
popularity                  0
release_date                0
status                      0
title                       0
id                          0
dtype: int64

**Correlation between target and numerical columns**

In [76]:
# Restrict to numeric columns before correlating: the frame also holds
# object (string / JSON-like) columns, and pandas >= 2.0 no longer drops
# them silently in DataFrame.corr() -- it raises on non-numeric data.
# Selecting numeric dtypes first works on old and new pandas alike.
df.select_dtypes(include='number').corr()

Unnamed: 0,id,budget,popularity,runtime,revenue
id,1.0,0.019732,-0.00747,0.01075,0.00061
budget,0.019732,1.0,0.342356,0.238373,0.752965
popularity,-0.00747,0.342356,1.0,0.13369,0.46146
runtime,0.01075,0.238373,0.13369,1.0,0.21638
revenue,0.00061,0.752965,0.46146,0.21638,1.0
