# Movie Ratings Capstone

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import prepare_stephen
import datetime

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

import warnings
# Suppress every warning category for the rest of the kernel session so the
# cell outputs stay uncluttered.
# NOTE(review): a blanket 'ignore' also hides deprecation and chained-assignment
# warnings raised by pandas/sklearn; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Get raw (dirty) data — source: [The Movies Dataset on Kaggle](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?select=links.csv)

In [2]:
# Load the uncleaned movies table through the project helper. On the recorded
# run it printed "Using cached CSV", i.e. the data came from a local cache.
# NOTE(review): the frame carries `Unnamed: 0`-style columns (see the head
# output), which usually means the cache was written with its index — confirm
# in prepare_stephen.
raw = prepare_stephen.get_dirty_data()

Using cached CSV


In [3]:
# Peek at the first two rows to see the column layout and value formats.
raw.head(2)

Unnamed: 0.1,Unnamed: 0,cast,crew,id,title,genres,budget,overview,popularity,production_companies,production_countries,revenue,runtime,vote_average,vote_count,keywords,release_date
0,0,TomHanks TimAllen DonRickles JimVarney Wallace...,JohnLasseter JossWhedon AndrewStanton JoelCohe...,862,Toy Story,Animation Comedy Family,30000000,"Led by Woody, Andy's toys live happily in his ...",21.946943,PixarAnimationStudios,UnitedStatesofAmerica,373554033.0,81.0,7.7,5415.0,jealousy toy boy friendship friends rivalry bo...,1995-10-30
1,1,RobinWilliams JonathanHyde KirstenDunst Bradle...,LarryJ.Franco JonathanHensleigh JamesHorner Jo...,8844,Jumanji,Adventure Fantasy Family,65000000,When siblings Judy and Peter discover an encha...,17.015539,TriStarPictures TeitlerFilm InterscopeCommunic...,UnitedStatesofAmerica,262797249.0,104.0,6.9,2413.0,board game disappearance based on children's b...,1995-12-15


In [4]:
# (rows, columns) of the raw frame before any cleaning.
raw.shape

(7457, 17)

In [5]:
# Subset of titles released strictly after 2000-01-01 (that date itself is
# excluded). `release_date` is compared as text here (the Prepare notes list it
# as object dtype), which orders correctly only for ISO `YYYY-MM-DD` strings.
after_2000 = raw.loc[
    raw['release_date'] > '2000-01-01'
]

In [6]:
# (rows, columns) of the post-2000 subset, for comparison with the full raw frame.
after_2000.shape

(4586, 17)

#### Initial Data Check

In [7]:
# Null counts per column of the raw frame. In the recorded output the `percent`
# column is a fraction of rows (e.g. 34 / 7457 ≈ 0.004559 for `cast`), not a
# 0-100 percentage.

prepare_stephen.nulls_by_columns(raw)

Unnamed: 0,count,percent
Unnamed: 0,0,0.0
cast,34,0.004559
crew,13,0.001743
id,0,0.0
title,0,0.0
genres,23,0.003084
budget,0,0.0
overview,22,0.00295
popularity,0,0.0
production_companies,418,0.056055


In [8]:
# Number of rows grouped by how many values they are missing. In the recorded
# output `percent_missing` is a fraction of the 17 columns (1 / 17 ≈ 0.058824),
# not a 0-100 percentage.

prepare_stephen.nulls_by_rows(raw)

n_missing  percent_missing
0          0.000000           6502
1          0.058824            684
2          0.117647            175
3          0.176471             73
4          0.235294             16
5          0.294118              3
6          0.352941              4
dtype: int64

In [9]:
# Summary of the raw frame: the helper prints the shape followed by a
# `DataFrame.info()`-style listing of non-null counts and dtypes.

prepare_stephen.data_summary(raw)

--- Shape: (7457, 17)
--- Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7457 entries, 0 to 7456
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            7457 non-null   int64  
 1   cast                  7423 non-null   object 
 2   crew                  7444 non-null   object 
 3   id                    7457 non-null   int64  
 4   title                 7457 non-null   object 
 5   genres                7434 non-null   object 
 6   budget                7457 non-null   int64  
 7   overview              7435 non-null   object 
 8   popularity            7457 non-null   float64
 9   production_companies  7039 non-null   object 
 10  production_countries  7301 non-null   object 
 11  revenue               7457 non-null   float64
 12  runtime               7451 non-null   float64
 13  vote_average          7457 non-null   float64
 14  vote_count            7457 non-null   flo

# Prepare Data

> ### Get prepared data
> - Handle missing values (drop, or fill with the mean / mode)
> - Encode categorical data
> - Columns whose content needs to be split:
>     - cast
>     - crew
>     - production_companies
>     - production_countries
> - Rename columns to ease readability
> - Drop duplicates & unnecessary columns
> - Convert the `release_date` column from object dtype to datetime and set it as the index
> - Data scaling
> - Feature engineering
> - Split the data into train, validate, test
> - 
> - 

> ### Split the data

In [10]:
# Clean the raw frame and split it into three DataFrames in one helper call.
# On the recorded run the splits have 2568 / 1101 / 918 rows and 15 columns,
# indexed by `release_date`.
# NOTE(review): raw has 7457 rows but the splits total 4587, so rows are dropped
# inside the helper — presumably the date filter plus null handling; confirm.
# NOTE(review): no seed is visible from this cell — confirm the split is
# reproducible across kernel restarts.
train, validate, test = prepare_stephen.wrangle_movies_data(raw)

In [11]:
# Report the (rows, columns) shape of each split: a header line followed by one
# tab-indented line listing train, validate and test.
print(
    'DataFrame Shapes',
    f'\tTrain {train.shape}, Validate: {validate.shape}, Test: {test.shape}',
    sep='\n',
)

DataFrame Shapes
	Train (2568, 15), Validate: (1101, 15), Test: (918, 15)


# Explore Data

> ### Use train set

In [12]:
# First two rows of the training split; exploration uses the train set only.
train.head(2)

Unnamed: 0_level_0,id,cast,crew,title,genres,budget,overview,popularity,production_companies,production_countries,revenue,runtime,vote_average,vote_count,keywords
release_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-06-01,268531,kevinhart edhelms thomasmiddleditch nickkroll ...,nicholasstoller davpilkey davidsoren robletterman,captain underpants: the first epic movie,action animation comedy family,38000000,two mischievous kids hypnotize their mean elem...,88.561239,dreamworksanimation scholasticentertainment,unitedstatesofamerica,110824373.0,89.0,6.5,159.0,underwear school principal grade school
2016-10-21,374473,davejohns hayleysquires dylanmckiernan brianas...,kenloach robbieryan jonathanmorris philippelog...,"i, daniel blake",drama,0,"a middle aged carpenter, who requires state we...",18.577504,whynotproductions wildbunch lesfilmsdufleuve b...,belgium france unitedkingdom,260354.0,100.0,7.7,264.0,heart attack single mother compassion carpente...


In [13]:
# Inspect the country field: in the recorded output each value is a
# space-separated string of lower-cased country names, and a film may list
# more than one.
train['production_countries']

release_date
2017-06-01                   unitedstatesofamerica
2016-10-21            belgium france unitedkingdom
2012-07-03                                 germany
2009-09-09                   unitedstatesofamerica
2013-03-01                   unitedstatesofamerica
                              ...                 
2008-04-30                   unitedstatesofamerica
2013-01-18                   unitedstatesofamerica
2009-10-15    hongkong japan unitedstatesofamerica
2004-10-22     unitedkingdom unitedstatesofamerica
2014-08-29                                   india
Name: production_countries, Length: 2568, dtype: object

> ### Univariate Exploration

> ### Bi-Variate Exploration

> ### Multi-Variate Exploration

> ### Visualizations

> ### Hypothesis testing

> ### Key exploration take-aways

# Modeling

### Test Model

## Conclusion & Recommendations