In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.offline as plt

# pyo.init_notebook_model()
import plotly.express as px

%matplotlib inline

In [7]:
df = pd.read_csv("movie streams.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


Rotten Tomatoes is an American review-aggregation website for film and television.

In [8]:
cols = df.columns.tolist()
cols

['Unnamed: 0',
 'ID',
 'Title',
 'Year',
 'Age',
 'IMDb',
 'Rotten Tomatoes',
 'Netflix',
 'Hulu',
 'Prime Video',
 'Disney+',
 'Type',
 'Directors',
 'Genres',
 'Country',
 'Language',
 'Runtime']

In [21]:
df.drop(['Unnamed: 0','ID',], axis=1, inplace = True)
cols = df.columns.tolist()
cols

KeyError: "['Unnamed: 0', 'ID'] not found in axis"

In [22]:
df.isna().sum()

Title                  0
Year                   0
Age                 9390
IMDb                 571
Rotten Tomatoes    11586
Netflix                0
Hulu                   0
Prime Video            0
Disney+                0
Type                   0
Directors            726
Genres               275
Country              435
Language             599
Runtime              592
dtype: int64

As we can see, there are missing values in the some of the columns. We will not drop these coplumns for now.

NOTE: Dropping a value in your dataset should be the last option to consider. There are better ways to deal with missing values in your dataset.

There is a detail and well explained tutorial on dealing with missing values and other featurung engineering techniques.

#### Let's remove the '+' sign attached to the values in the Age column.

In [24]:
df['Age']

0        13+
1        18+
2        13+
3         7+
4        18+
        ... 
16739    NaN
16740     7+
16741    NaN
16742    NaN
16743    NaN
Name: Age, Length: 16744, dtype: object

In [25]:
age_map = {'18+' : 18, '7+' : 7, '13': 13, 'all' : 0, '16+' : 16 }
df['AgeCopy'] = df['Age'].map(age_map)
df['AgeCopy']

0         NaN
1        18.0
2         NaN
3         7.0
4        18.0
         ... 
16739     NaN
16740     7.0
16741     NaN
16742     NaN
16743     NaN
Name: AgeCopy, Length: 16744, dtype: float64

#### Let's remove the '%' sign attached to the values in the Rotten Tomatoes column.

In [27]:
df['New_Rotten_Tomamtoes'] = df['Rotten Tomatoes'].str.replace("%", "")
for i in df['New_Rotten_Tomamtoes']:
    if i == str:
        i.astype(int)
df['New_Rotten_Tomamtoes']

0         87
1         87
2         84
3         96
4         97
        ... 
16739    NaN
16740    NaN
16741    NaN
16742    NaN
16743    NaN
Name: New_Rotten_Tomamtoes, Length: 16744, dtype: object

# **Visualisations**

### What is the number if movies for each age group

In [28]:
pip install plotly==4.14.1

Collecting plotly==4.14.1
  Downloading plotly-4.14.1-py2.py3-none-any.whl (13.2 MB)
                                              0.0/13.2 MB ? eta -:--:--
                                              0.0/13.2 MB 1.3 MB/s eta 0:00:11
                                              0.0/13.2 MB 1.3 MB/s eta 0:00:11
                                             0.1/13.2 MB 930.9 kB/s eta 0:00:15
                                             0.1/13.2 MB 798.9 kB/s eta 0:00:17
                                             0.2/13.2 MB 702.7 kB/s eta 0:00:19
                                             0.2/13.2 MB 731.4 kB/s eta 0:00:18
                                             0.2/13.2 MB 687.0 kB/s eta 0:00:19
                                             0.3/13.2 MB 952.6 kB/s eta 0:00:14
     -                                       0.4/13.2 MB 890.4 kB/s eta 0:00:15
     -                                       0.4/13.2 MB 834.5 kB/s eta 0:00:16
     -                                       


[notice] A new release of pip is available: 23.1.2 -> 23.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
!pip install "notebook>=5.3" "ipywidgets>=7.2"


Collecting notebook>=5.3
  Downloading notebook-6.5.4-py3-none-any.whl (529 kB)
                                              0.0/529.8 kB ? eta -:--:--
     --                                       30.7/529.8 kB ? eta -:--:--
     -----                                 71.7/529.8 kB 787.7 kB/s eta 0:00:01
     -----                                 71.7/529.8 kB 787.7 kB/s eta 0:00:01
     -------------                          194.6/529.8 kB 1.1 MB/s eta 0:00:01
     -------------                          194.6/529.8 kB 1.1 MB/s eta 0:00:01
     ----------------------                 307.2/529.8 kB 1.1 MB/s eta 0:00:01
     ----------------------                 307.2/529.8 kB 1.1 MB/s eta 0:00:01
     ---------------------------            389.1/529.8 kB 1.1 MB/s eta 0:00:01
     ---------------------------            389.1/529.8 kB 1.1 MB/s eta 0:00:01
     ---------------------------            389.1/529.8 kB 1.1 MB/s eta 0:00:01
     ---------------------------            389.1/529

In [36]:
!pip install jupyterlab "ipywidgets>=7.5"


Collecting jupyterlab
  Downloading jupyterlab-4.0.3-py3-none-any.whl (9.2 MB)
                                              0.0/9.2 MB ? eta -:--:--
                                              0.0/9.2 MB 1.4 MB/s eta 0:00:07
                                              0.1/9.2 MB 1.1 MB/s eta 0:00:09
                                              0.1/9.2 MB 950.9 kB/s eta 0:00:10
     -                                        0.2/9.2 MB 1.2 MB/s eta 0:00:08
     -                                        0.3/9.2 MB 1.3 MB/s eta 0:00:07
     -                                        0.4/9.2 MB 1.4 MB/s eta 0:00:07
     --                                       0.5/9.2 MB 1.5 MB/s eta 0:00:06
     --                                       0.6/9.2 MB 1.5 MB/s eta 0:00:06
     --                                       0.7/9.2 MB 1.6 MB/s eta 0:00:06
     ---                                      0.8/9.2 MB 1.6 MB/s eta 0:00:06
     ---                                      0.9/9.2 MB 1.7 MB/s et