<h2><b>Netflix - TV Shows and Movies</b></h2>

This Jupyter Notebook was created as part of an initial Data Science study cycle.
Its main objective is to analyze the 'Netflix - TV Shows and Movies' dataset
using Python, pandas, and Plotly.

In [176]:
#Import libraries >>>
import pandas as pd
import plotly.express as px

In [177]:
# Load the two CSV files of the dataset (titles first, then credits) >>>
df_titles, df_credits = (
    pd.read_csv(csv_path)
    for csv_path in ('archive/titles.csv', 'archive/credits.csv')
)

In [178]:
# Preview the first three rows of the titles dataset
titles_preview = df_titles.head(3)
display(titles_preview)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3


In [179]:
# Render the credits dataset (the whole frame is handed to display)
credits_table = df_credits
display(credits_table)

Unnamed: 0,person_id,id,name,character,role
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR
...,...,...,...,...,...
77796,736339,tm1059008,Adelaida Buscato,María Paz,ACTOR
77797,399499,tm1059008,Luz Stella Luengas,Karen Bayona,ACTOR
77798,373198,tm1059008,Inés Prieto,Fanny,ACTOR
77799,378132,tm1059008,Isabel Gaona,Cacica,ACTOR


In [180]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric title columns
titles_stats = df_titles.describe()
display(titles_stats)

Unnamed: 0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
count,5850.0,5850.0,2106.0,5368.0,5352.0,5759.0,5539.0
mean,2016.417094,76.888889,2.162868,6.510861,23439.38,22.637925,6.829175
std,6.937726,39.002509,2.689041,1.163826,95820.47,81.680263,1.170391
min,1945.0,0.0,1.0,1.5,5.0,0.009442,0.5
25%,2016.0,44.0,1.0,5.8,516.75,2.7285,6.1
50%,2018.0,83.0,1.0,6.6,2233.5,6.821,6.9
75%,2020.0,104.0,2.0,7.3,9494.0,16.59,7.5375
max,2022.0,240.0,42.0,9.6,2294231.0,2274.044,10.0


<b>Relationship Between Genre and Score (TMDB and IMDB)</b>

<p>Objective: Discover if there is a correlation between genre and the perceived quality of the content.</p>

In [181]:
# The genres column can hold more than one genre per movie / TV series, stored
# as text such as "['drama', 'crime']". To analyse genres individually:

# Drop the surrounding brackets and the quotes, then split the text on commas
df_titles['genres'] = df_titles['genres'].str.strip('[]').str.replace("'", "").str.split(',')

# Give every genre of a title its own row
df_titles = df_titles.explode('genres')

# Remove the blank that follows each comma. If a title has an empty list, the
# steps above leave an empty string; turn it into a missing value so that
# groupby('genres') skips it instead of creating a group with a blank label.
genres_clean = df_titles['genres'].str.strip()
df_titles['genres'] = genres_clean.mask(genres_clean == '')

In [182]:
# Mean score per genre for each rating source, sorted ascending and rounded to 2 decimals
mean_scores_t = df_titles.groupby('genres')['tmdb_score'].mean().sort_values().round(2)
mean_scores_i = df_titles.groupby('genres')['imdb_score'].mean().sort_values().round(2)

# x / y: genre labels and TMDB means, in TMDB order
x = mean_scores_t.index
y = mean_scores_t.values
# The two series are sorted independently, so their genre orders differ.
# Look the IMDB means up by genre label so that z[i] belongs to x[i].
z = mean_scores_i.reindex(x).values

In [183]:
# Bar charts of the mean score per genre, one figure per rating source.
# Each figure takes its genre labels and its scores from the same series:
# mean_scores_t and mean_scores_i are sorted independently, so sharing one list
# of genre labels between both figures would attach scores to the wrong genres.

fig = px.bar(
    x=mean_scores_t.index,
    y=mean_scores_t.values,
    text_auto=True,
    title='Relationship Between Genre and Score (TMDB)',
)
fig.update_traces(marker_color='#363636')
fig.update_layout(xaxis_title='Genres', yaxis_title='Mean Score (TMDB)')

fig_2 = px.bar(
    x=mean_scores_i.index,
    y=mean_scores_i.values,
    text_auto=True,
    title='Relationship Between Genre and Score (IMDB)',
)
fig_2.update_traces(marker_color='#DC143C')
fig_2.update_layout(xaxis_title='Genres', yaxis_title='Mean Score (IMDB)')

fig.show()
fig_2.show()
# The charts are interactive, so GitHub does not show them in the preview

In [184]:
# Grouped bar chart: TMDB and IMDB mean scores side by side for every genre

scores_by_genre = df_titles.groupby('genres')
mean_scores_t = scores_by_genre['tmdb_score'].mean().sort_values().round(2)
mean_scores_i = scores_by_genre['imdb_score'].mean().sort_values().round(2)

# Reshape to long format: one row per (genre, source) pair, with the mean in a
# shared 'score' column and the rating site in 'source'
df_tmdb = mean_scores_t.rename('score').reset_index().assign(source='TMDB')
df_imdb = mean_scores_i.rename('score').reset_index().assign(source='IMDB')

# Columns after stacking: genres, score, source (TMDB rows first, then IMDB rows)
combined_scores = pd.concat([df_tmdb, df_imdb])

fig = px.bar(
    combined_scores,
    x='genres',
    y='score',
    color='source',
    color_discrete_map={'TMDB': '#363636', 'IMDB': '#DC143C'},
    barmode='group',
    text_auto=True,
    title='Relationship Between Genre and Score (TMDB and IMDB)',
)

# Kept as the last expression so the notebook renders the figure
fig.update_layout(yaxis_title='Mean Score', xaxis_title='Genres')

# Interactive chart: the GitHub preview does not show it


<b>Analysis of Top 5 Content by imdb_score</b>

<p>Objective: Identify the highest-rated movies and series of all time.</p>


In [185]:
# df_titles holds one row per genre of a title at this point, so keep a single
# row per title name before ranking by IMDB score (highest first)
df_unique_titles = df_titles.drop_duplicates(subset=['title'], keep='first')
df_sorted = df_unique_titles.sort_values(by='imdb_score', ascending=False)
df_top_5 = df_sorted.iloc[:5]

# Bar chart of the five best-rated titles
fig = (
    px.bar(
        df_top_5,
        x='title',
        y='imdb_score',
        text_auto=True,
        title='Highest-rated movies and series of all time',
    )
    .update_traces(marker_color='#701010')
    .update_layout(xaxis_title='Title', yaxis_title='IMDB Score')
)
fig.show()

<b>Top 5 Countries by Film and TV Series Production</b>
<p>Objective: To analyze and rank the top 5 countries by film and TV series production.</p>

In [186]:
# Same technique as for the genres: a title can list several production
# countries in one cell, so split the text and give each country its own row
df_titles['production_countries'] = df_titles['production_countries'].str.strip('[]').str.replace("'", "").str.split(',')

df_titles = df_titles.explode('production_countries')

# Remove the blank that follows each comma. If a title has an empty list, the
# steps above leave an empty string; turn it into a missing value so that it is
# not treated as a country when grouping.
countries_clean = df_titles['production_countries'].str.strip()
df_titles['production_countries'] = countries_clean.mask(countries_clean == '')

In [187]:
# Pie chart of the five countries that produced the most titles.
# df_titles has one row per (title, genre, country) combination at this point,
# so counting rows would count a title once for every genre it has. Count the
# distinct title ids per country instead.
country_counts = df_titles.groupby('production_countries')['id'].nunique()

# A blank country label is not a country; keep it out of the ranking
country_counts = country_counts[country_counts.index != '']

df_new = country_counts.sort_values(ascending=False).head(5)

value = df_new.values
name = df_new.index

# `labels` in plotly express maps column names to display names, so it is not
# the place for the country index; `names` already supplies the slice labels.
fig = px.pie(values=value, names=name, title='Top 5 Countries by Film and TV Series Production')
fig.update_traces(textinfo='percent+label')
fig.show()

<b>Relationship Between Movie Runtime and Number of Votes</b>

<p>Objective: See if longer or shorter movies receive more audience ratings.</p>

In [188]:
# Generate the scatter chart.
# The question is about movies, so TV shows are left out. df_titles repeats a
# title on several rows (one per genre / country), so keep one row per title id
# to plot each movie once.
df_votes = df_titles[df_titles['type'] == 'MOVIE'].drop_duplicates(subset=['id'], keep='first')

# Plot the frame prepared in this cell rather than one left over from an earlier cell
fig = px.scatter(df_votes, x='runtime', y='imdb_votes', title='Relationship Between Movie Runtime and Number of Votes')
fig.update_layout(yaxis_title='IMDB Votes', xaxis_title='Runtime')
fig.update_traces(marker_color="#A74D89")
fig.show()

<b>TV-Show vs Movie Counter and Top 10 Actors - Producers and Actors Analysis</b>

<p>Objective: Understand the proportion of TV show to movie productions and which actors appear most often in the dataset.</p>

In [189]:
# Pie charts: share of movies vs. TV shows, and the ten most credited actors

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Ten names with the most acting credits. Only rows with the ACTOR role are
# counted, so any other kind of credit in the table is not treated as an
# acting appearance.
actor_credits = df_credits[df_credits['role'] == 'ACTOR']
df_actor = actor_credits.groupby('name')['id'].count().sort_values(ascending=False).head(10)
label = df_actor.index
value = df_actor.values

# Number of distinct titles per production type. df_titles can hold several
# rows for one title (one per genre / country), so count unique ids, not rows.
type_counts = df_titles.groupby('type')['id'].nunique()
values = type_counts.values


fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type':'domain'}, {'type':'domain'}]],
    subplot_titles=['TV-Show vs Movie Counter', 'Top 10 Actors'])


# Labels and values both come from the same grouped result, so each count is
# paired with its own type. The raw `type` column has one entry per row and is
# not in group order, so it cannot serve as the label list for grouped counts.
fig.add_trace(go.Pie(
    labels=type_counts.index,
    values=values,
    name="Production Type",
    textinfo='label+percent',
    legendgroup='group1',
    marker_colors = ["#6928BE",'#363636']
), 1, 1)

fig.add_trace(go.Pie(
    labels=label,
    values=value,
    name="Actors",
    legendgroup='group2'
), 1, 2)


# One shared legend, placed to the right of the plots and grouped per pie
fig.update_layout(
    title_text="Producers and Actors Analysis",
    showlegend=True,
    legend=dict(
        x=1,
        y=0.5,
        traceorder='grouped',
        title_text='Tv - Movies - Actors'
    )
)

fig.show()