# 1. Ingest and Access Data

In [1]:
import pandas as pd
import numpy as np

# Location of the source CSV, named once so it is easy to repoint.
RAW_TITLES_CSV = 'disney_plus_titles.csv'

# Load the source file into the DataFrame that the following cells inspect.
df = pd.read_csv(RAW_TITLES_CSV)

Take a quick look at the headings and data. Pull column names if desired.

In [2]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",,"November 26, 2021",2016,TV-G,23 min,"Animation, Family",Join Mickey and the gang as they duck the halls!
1,s2,Movie,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale",,"November 26, 2021",1988,PG,91 min,Comedy,Santa Claus passes his magic bag to a new St. ...
2,s3,Movie,Ice Age: A Mammoth Christmas,Karen Disher,"Raymond Albert Romano, John Leguizamo, Denis L...",United States,"November 26, 2021",2011,TV-G,23 min,"Animation, Comedy, Family",Sid the Sloth is on Santa's naughty list.
3,s4,Movie,The Queen Family Singalong,Hamish Hamilton,"Darren Criss, Adam Lambert, Derek Hough, Alexa...",,"November 26, 2021",2021,TV-PG,41 min,Musical,"This is real life, not just fantasy!"
4,s5,TV Show,The Beatles: Get Back,,"John Lennon, Paul McCartney, George Harrison, ...",,"November 25, 2021",2021,,1 Season,"Docuseries, Historical, Music",A three-part documentary from Peter Jackson ca...


In [3]:
# Keep the column labels as a plain Python list and display them.
col_list = list(df.columns)
col_list

['show_id',
 'type',
 'title',
 'director',
 'cast',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in',
 'description']

Evaluate the information in the dataframe. Start making a plan of what to do next.

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       1450 non-null   object
 1   type          1450 non-null   object
 2   title         1450 non-null   object
 3   director      977 non-null    object
 4   cast          1260 non-null   object
 5   country       1231 non-null   object
 6   date_added    1447 non-null   object
 7   release_year  1450 non-null   int64 
 8   rating        1447 non-null   object
 9   duration      1450 non-null   object
 10  listed_in     1450 non-null   object
 11  description   1450 non-null   object
dtypes: int64(1), object(11)
memory usage: 136.1+ KB


In [5]:
df.describe()

Unnamed: 0,release_year
count,1450.0
mean,2003.091724
std,21.860162
min,1928.0
25%,1999.0
50%,2011.0
75%,2018.0
max,2021.0


Are there nulls or duplicates?

In [6]:
df.isna().sum()

show_id           0
type              0
title             0
director        473
cast            190
country         219
date_added        3
release_year      0
rating            3
duration          0
listed_in         0
description       0
dtype: int64

In [7]:
df.duplicated().sum()

0

# 2. Clean Data
Establish another dataframe to work from.

In [8]:
# Build the working frame by dropping the columns that are not needed here.
# `drop` returns a new DataFrame, so the raw `df` stays untouched.
unneeded_columns = ['director', 'cast', 'country', 'description', 'show_id']
df_clean = df.drop(columns=unneeded_columns)
df_clean.head()

Unnamed: 0,type,title,date_added,release_year,rating,duration,listed_in
0,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"November 26, 2021",2016,TV-G,23 min,"Animation, Family"
1,Movie,Ernest Saves Christmas,"November 26, 2021",1988,PG,91 min,Comedy
2,Movie,Ice Age: A Mammoth Christmas,"November 26, 2021",2011,TV-G,23 min,"Animation, Comedy, Family"
3,Movie,The Queen Family Singalong,"November 26, 2021",2021,TV-PG,41 min,Musical
4,TV Show,The Beatles: Get Back,"November 25, 2021",2021,,1 Season,"Docuseries, Historical, Music"


## Adding a new 'streaming_platform' column

In [9]:
# Tag every row with the platform it came from.
# `assign` appends the column as the last column of a new frame and overwrites
# it if the cell is run again, so re-running never produces a duplicate
# 'streaming_platform' column and no column position has to be hard-coded.
df_clean = df_clean.assign(streaming_platform='DisneyPlus')
df_clean.head()

Unnamed: 0,type,title,date_added,release_year,rating,duration,listed_in,streaming_platform
0,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"November 26, 2021",2016,TV-G,23 min,"Animation, Family",DisneyPlus
1,Movie,Ernest Saves Christmas,"November 26, 2021",1988,PG,91 min,Comedy,DisneyPlus
2,Movie,Ice Age: A Mammoth Christmas,"November 26, 2021",2011,TV-G,23 min,"Animation, Comedy, Family",DisneyPlus
3,Movie,The Queen Family Singalong,"November 26, 2021",2021,TV-PG,41 min,Musical,DisneyPlus
4,TV Show,The Beatles: Get Back,"November 25, 2021",2021,,1 Season,"Docuseries, Historical, Music",DisneyPlus


In [10]:
#checking for null values
df_clean.isna().sum()

type                  0
title                 0
date_added            3
release_year          0
rating                3
duration              0
listed_in             0
streaming_platform    0
dtype: int64

## Separate into two DataFrames: movies and TV shows

In [11]:
# Dataframe 1: rows whose type is 'Movie'.
# Take an explicit copy so that the column additions/removals done later act on
# an independent frame rather than on a filtered slice of df_clean (this avoids
# pandas' SettingWithCopyWarning on subsequent assignments).
is_movie = df_clean['type'] == 'Movie'
disneyplus_movies = df_clean.loc[is_movie].copy()
disneyplus_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1052 entries, 0 to 1449
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   type                1052 non-null   object
 1   title               1052 non-null   object
 2   date_added          1052 non-null   object
 3   release_year        1052 non-null   int64 
 4   rating              1051 non-null   object
 5   duration            1052 non-null   object
 6   listed_in           1052 non-null   object
 7   streaming_platform  1052 non-null   object
dtypes: int64(1), object(7)
memory usage: 74.0+ KB


In [12]:
disneyplus_movies.isna().sum()

type                  0
title                 0
date_added            0
release_year          0
rating                1
duration              0
listed_in             0
streaming_platform    0
dtype: int64

In [13]:
# Remove every row that has a missing value in any column.
disneyplus_movies = disneyplus_movies.dropna(axis='index', how='any')

In [14]:
disneyplus_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1051 entries, 0 to 1449
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   type                1051 non-null   object
 1   title               1051 non-null   object
 2   date_added          1051 non-null   object
 3   release_year        1051 non-null   int64 
 4   rating              1051 non-null   object
 5   duration            1051 non-null   object
 6   listed_in           1051 non-null   object
 7   streaming_platform  1051 non-null   object
dtypes: int64(1), object(7)
memory usage: 73.9+ KB


In [15]:
# Dataframe 2: rows whose type is 'TV Show'.
# Take an explicit copy so that the imputation and column changes done later act
# on an independent frame rather than on a filtered slice of df_clean (this
# avoids pandas' SettingWithCopyWarning on subsequent assignments).
is_tv_show = df_clean['type'] == 'TV Show'
disneyplus_tv = df_clean.loc[is_tv_show].copy()
disneyplus_tv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 398 entries, 4 to 1441
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   type                398 non-null    object
 1   title               398 non-null    object
 2   date_added          395 non-null    object
 3   release_year        398 non-null    int64 
 4   rating              396 non-null    object
 5   duration            398 non-null    object
 6   listed_in           398 non-null    object
 7   streaming_platform  398 non-null    object
dtypes: int64(1), object(7)
memory usage: 28.0+ KB


In [16]:
disneyplus_tv.isna().sum()

type                  0
title                 0
date_added            3
release_year          0
rating                2
duration              0
listed_in             0
streaming_platform    0
dtype: int64

Impute the missing `date_added` values with the most frequent value in that column.

In [17]:
from sklearn.impute import SimpleImputer  # NOTE(review): consider moving this to the imports cell at the top

# Fill missing `date_added` entries with the column's most frequent value.
cat_imputer = SimpleImputer(strategy='most_frequent')
# The imputer works on 2-D input, so reshape the column to (n_rows, 1) and
# flatten the result back to one value per row.
date_added_2d = disneyplus_tv['date_added'].to_numpy().reshape(-1, 1)
date_added_filled = cat_imputer.fit_transform(date_added_2d).ravel()
# Rebind to a new frame instead of writing into disneyplus_tv in place: the
# frame was produced by filtering df_clean, and in-place writes on such a frame
# can raise SettingWithCopyWarning.
disneyplus_tv = disneyplus_tv.assign(date_added=date_added_filled)

In [19]:
disneyplus_tv.isna().sum()

type                  0
title                 0
date_added            0
release_year          0
rating                2
duration              0
listed_in             0
streaming_platform    0
dtype: int64

In [20]:
# Remove the rows that still have a missing value in any column.
disneyplus_tv = disneyplus_tv.dropna(axis='index', how='any')

In [21]:
disneyplus_tv.isna().sum()

type                  0
title                 0
date_added            0
release_year          0
rating                0
duration              0
listed_in             0
streaming_platform    0
dtype: int64

In [22]:
disneyplus_tv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 396 entries, 6 to 1441
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   type                396 non-null    object
 1   title               396 non-null    object
 2   date_added          396 non-null    object
 3   release_year        396 non-null    int64 
 4   rating              396 non-null    object
 5   duration            396 non-null    object
 6   listed_in           396 non-null    object
 7   streaming_platform  396 non-null    object
dtypes: int64(1), object(7)
memory usage: 27.8+ KB


### Data Structure (movies)
- Add any necessary calculated or reformatted columns for visualization/analysis

In [23]:
#split duration columns
def slice_min(min):
    """Convert a duration label such as '91 min' to its integer minute count.

    The last four characters of the label are discarded and the remainder is
    parsed with int(); int() raises ValueError if that remainder is not a
    valid integer.

    Note: the parameter name shadows the built-in min() inside this function.
    """
    minutes_text = min[:-4]
    return int(minutes_text)
# Parse every duration label and store the result as a numeric column.
movie_minutes = disneyplus_movies['duration'].apply(slice_min)
disneyplus_movies['duration_minutes'] = movie_minutes

In [24]:
# Drop the textual duration column from the movies frame.
disneyplus_movies = disneyplus_movies.drop(columns=['duration'])

In [25]:
disneyplus_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1051 entries, 0 to 1449
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   type                1051 non-null   object
 1   title               1051 non-null   object
 2   date_added          1051 non-null   object
 3   release_year        1051 non-null   int64 
 4   rating              1051 non-null   object
 5   listed_in           1051 non-null   object
 6   streaming_platform  1051 non-null   object
 7   duration_minutes    1051 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 73.9+ KB


### Checking for Erroneous Values (movies)


In [26]:
# Print the value distribution of each movie column, one column after another.
movie_columns = [
    'type',
    'title',
    'date_added',
    'release_year',
    'rating',
    'listed_in',
    'streaming_platform',
    'duration_minutes',
]
for column_name in movie_columns:
    print(disneyplus_movies[column_name].value_counts())

type
Movie    1051
Name: count, dtype: int64
title
Duck the Halls: A Mickey Mouse Christmas Special    1
Marvel Rising: Secret Warriors                      1
Marvel Studios: Expanding the Universe              1
Marvel Studios' Ant-Man                             1
Marvel Studios' Avengers: Age of Ultron             1
                                                   ..
Sea Scouts                                          1
The Boy Who Talked to Badgers                       1
The New Neighbor                                    1
The Small One                                       1
Captain Sparky vs. The Flying Saucers               1
Name: count, Length: 1051, dtype: int64
date_added
November 12, 2019    597
November 12, 2021     25
April 3, 2020         24
January 1, 2020       20
May 1, 2020           10
                    ... 
March 5, 2020          1
March 4, 2020          1
May 21, 2021           1
February 16, 2020      1
May 2, 2020            1
Name: count, Length: 134, dty

### Data Structure (TV shows)
Add any necessary calculated or reformatted columns for visualization/analysis

In [27]:
def slice_season(season):
    """Return the season count from a label such as '1 Season' or '2 Seasons'.

    The count is read from the leading integer of the label, so the result
    does not depend on the label's total length or on whether the unit is
    singular or plural (e.g. '12 Season' yields 12).

    Parameters
    ----------
    season : str
        Duration label that starts with the number of seasons.

    Returns
    -------
    int
        The number of seasons.

    Raises
    ------
    ValueError
        If the label does not start with an integer.
    """
    import re  # local import keeps this helper self-contained

    # Optional surrounding whitespace and sign, then the digits of the count.
    leading_number = re.match(r'\s*([+-]?\d+)', season)
    if leading_number is None:
        raise ValueError(f"no leading season count in {season!r}")
    return int(leading_number.group(1))
# Parse every duration label and store the result as a numeric column.
season_counts = disneyplus_tv['duration'].apply(slice_season)
disneyplus_tv['duration_season'] = season_counts

In [28]:
# Drop the textual duration column from the TV frame.
disneyplus_tv = disneyplus_tv.drop(columns=['duration'])

In [29]:
disneyplus_tv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 396 entries, 6 to 1441
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   type                396 non-null    object
 1   title               396 non-null    object
 2   date_added          396 non-null    object
 3   release_year        396 non-null    int64 
 4   rating              396 non-null    object
 5   listed_in           396 non-null    object
 6   streaming_platform  396 non-null    object
 7   duration_season     396 non-null    int64 
dtypes: int64(2), object(6)
memory usage: 27.8+ KB


### Checking for Erroneous Values (TV shows)

In [30]:
# Print the value distribution of each TV column, one column after another.
tv_columns = [
    'type',
    'title',
    'date_added',
    'release_year',
    'rating',
    'listed_in',
    'streaming_platform',
    'duration_season',
]
for column_name in tv_columns:
    print(disneyplus_tv[column_name].value_counts())

type
TV Show    396
Name: count, dtype: int64
title
Hawkeye                                           1
Boy Meets World                                   1
Big Hero 6 The Series (Shorts)                    1
Big City Greens (Shorts)                          1
Best Friends Whenever                             1
                                                 ..
Wicked Tuna: Outer Banks                          1
Marvel Battleworld: Mystery of the Thanostones    1
The Muppet Show                                   1
Secrets of Sulphur Springs                        1
Imagination Movers                                1
Name: count, Length: 396, dtype: int64
date_added
November 12, 2019     128
May 1, 2020             9
September 18, 2020      8
August 14, 2020         8
January 1, 2020         8
                     ... 
July 14, 2021           1
July 16, 2021           1
July 10, 2020           1
July 3, 2020            1
October 1, 2019         1
Name: count, Length: 106, dtype: int64
r

## Export to .csv and complete in another program

In [31]:
# Write both cleaned frames to CSV for further work outside this notebook.
# DataFrame.to_csv returns None when it is given a file path, so the result is
# deliberately not bound to a name (a variable holding it would be None, not a
# DataFrame).
# index=True writes the row labels as the first CSV column.
disneyplus_tv.to_csv('disneyplus_tv.csv', index=True)
disneyplus_movies.to_csv('disneyplus_movies.csv', index=True)