<a href="https://colab.research.google.com/github/Meatrean/MKT-440/blob/main/Group_1_Python_Group_Project_Netflix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
#https://community.dataquest.io/t/introduction-to-plotly-data-viz-library-netflix-dataset/558642

In [31]:
# Raw CSV of the Netflix titles dataset, hosted in the course GitHub repository.
url = "https://raw.githubusercontent.com/Meatrean/MKT-440/main/netflix_titles.csv"
# Download and parse the CSV into the working DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer=url)

In [32]:
# Preview the first five rows to check that the file loaded as expected.
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [33]:
#Column guide:
#type: whether the title is a "Movie" or a "TV Show" (the only two values)
#title: the title of the Movie or TV Show
#director: the director(s) of the Movie or TV Show
#cast: the actors who appear in the Movie or TV Show
#release_year: the year the Movie or TV Show was originally released
#rating: the maturity rating of the Movie or TV Show (e.g. PG-13, TV-MA), i.e. the audience it is suitable for
#duration: the length of the title: minutes for a Movie (e.g. "90 min"), number of seasons for a TV Show (e.g. "2 Seasons")
#listed_in: the genre(s) the Movie or TV Show is listed under, separated by commas
#description: a short summary of the Movie or TV Show

In [34]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [35]:
# Keep only titles that list both a cast and a director: a row is removed
# as soon as either of those two columns is null.
df = df.dropna(subset=['cast', 'director'], how='any')

In [36]:
# Remove every remaining row that has a null value in any column.
df = df.dropna(axis='index', how='any')
#Bob

In [37]:
# Convert the textual "date_added" column into real timestamps and derive the
# year and month each title was added.
date_added = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added):
    # Strip stray whitespace before parsing: pandas >= 2.0 infers one format
    # from the first value and raises on any value that does not match it, so
    # a padded string such as " August 4, 2017" would abort the whole cell.
    # The dtype guard keeps the cell safe to re-run once the column is parsed.
    date_added = pd.to_datetime(date_added.str.strip())
df["date_added"] = date_added
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month

In [38]:
# "duration" mixes two units: "<n> min" for movies and "<n> Season(s)" for
# TV shows. Split it into two text columns that each hold only the leading
# number, using "" where the unit does not apply:
#   season_count -> number of seasons (rows whose duration mentions "Season")
#   duration     -> the leading number for every other row
# Vectorised string methods replace the two row-wise df.apply(axis=1) passes.
duration_text = df['duration']
mentions_season = duration_text.str.contains("Season", regex=False, na=False)
leading_number = duration_text.str.split(" ").str[0]
df['season_count'] = leading_number.where(mentions_season, "")
df['duration'] = leading_number.where(~mentions_season, "")

In [39]:
# "listed_in" holds a comma-separated list of genres. Rename the column to
# "genre" and keep only the first entry of each list.
df = df.rename(columns={"listed_in": "genre"})
df['genre'] = [genre_list.split(",")[0] for genre_list in df['genre']]

In [40]:
#In this part we could explain the training pieces the tutorial goes over.

In [41]:
# Donut chart of the catalogue split by content type. Only `names` is passed,
# so each slice is sized by the number of rows of that type.
fig_donut = px.pie(
    df,
    names='type',
    hole=0.7,
    height=300,
    width=600,
    title='Most watched on Netflix',
    color_discrete_sequence=['#b20710', '#221f1f'],
)
# Put "percent + label" text outside the ring and use the default hover text.
fig_donut.update_traces(
    hovertemplate=None,
    textinfo='percent+label',
    textposition='outside',
    rotation=90,
)
# Dark theme shared with the other figures in this notebook.
fig_donut.update_layout(
    margin=dict(t=100, b=30, l=0, r=0),
    showlegend=False,
    plot_bgcolor='#333',
    paper_bgcolor='#333',
    title_font=dict(size=45, color='#8a8d93', family="Lato, sans-serif"),
    font=dict(size=17, color='#8a8d93'),
    hoverlabel=dict(bgcolor="#444", font_size=13, font_family="Lato, sans-serif"),
)
# Takeaway from the original analysis: movies make up a larger share than TV shows.
# NOTE(review): the slices count titles in the dataset, not viewing figures.

In [42]:
def _yearly_counts(frame, year_col):
    """Count the rows of ``frame`` for each distinct value of ``year_col``.

    Returns a DataFrame with the columns ``year_col``, ``"count"`` and
    ``"percent"`` (share of all counted rows, 0-100), sorted by ``year_col``.

    ``rename_axis(...).reset_index(name=...)`` names both columns explicitly,
    so the result does not depend on the pandas version (pandas 2.0 changed
    the default column names produced by ``value_counts().reset_index()``).
    """
    counts = (
        frame[year_col]
        .value_counts()
        .rename_axis(year_col)
        .reset_index(name="count")
    )
    counts["percent"] = 100 * counts["count"] / counts["count"].sum()
    return counts.sort_values(year_col)


# Titles added per year, separately for TV shows and movies.
d1 = df[df["type"] == "TV Show"]
d2 = df[df["type"] == "Movie"]

col = "year_added"

vc1 = _yearly_counts(d1, col)
vc2 = _yearly_counts(d2, col)

trace1 = go.Scatter(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="orange"))
trace2 = go.Scatter(x=vc2[col], y=vc2["count"], name="Movies", marker=dict(color="#b20710"))
data = [trace1, trace2]
fig_line = go.Figure(data)

fig_line.update_traces(hovertemplate=None)
fig_line.update_xaxes(showgrid=False)
fig_line.update_yaxes(showgrid=False)

large_title_format = 'Tv Show and Movies impact over the Year'
# The subtitle is rendered through a <span>; it is closed here so the styling
# cannot leak into anything appended after it.
small_title_format = "<span style='font-size:13px; font-family:Tahoma'>Due to Covid creation of content is slowed.</span>"
fig_line.update_layout(title=large_title_format + "<br>" + small_title_format, height=400,
                  margin=dict(t=130, b=0, l=70, r=40),
                  hovermode="x unified",
                  xaxis_title=' ', yaxis_title=" ",
                  plot_bgcolor='#333', paper_bgcolor='#333',
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5))

# NOTE(review): the years quoted in the two annotations are hard-coded text,
# not computed from vc1/vc2 - confirm they still match the plotted data.
fig_line.add_annotation(dict(x=0.8, y=0.3, ax=0, ay=0,
                    xref = "paper", yref = "paper",
                    text= "Highest number of <b>Tv Shows</b><br> were released in <b>2019</b><br> followed by 2017."
                  ))
fig_line.add_annotation(dict(x=0.9, y=1, ax=0, ay=0,
                    xref = "paper", yref = "paper",
                    text= "Highest number of <b>Movies</b> were released<br> in <b>2019</b> followed by 2020"
                  ))
fig_line.show()

#Movies are dominating over TV shows in releases.

In [43]:
# Number of titles added in each calendar month, as columns "month" / "count".
# Both columns are named explicitly so the result is the same on every pandas
# version (pandas 2.0 changed the default names that value_counts() produces).
df_month = (
    df['month_added']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='count')
)

In [44]:
# Funnel chart of titles added per month, with months shown as numbers.
fig_month = px.funnel(
    df_month,
    x='count',
    y='month',
    title='Best month for releasing Content',
    height=350,
    width=600,
    color_discrete_sequence=['#b20710'],
)

In [57]:
# Number of titles added in each calendar month, as columns "month" / "count".
# Both columns are named explicitly so the result is the same on every pandas
# version (pandas 2.0 changed the default names that value_counts() produces).
df_month = (
    df['month_added']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='count')
)
# Human-readable month label used on the funnel's y axis.
month_labels = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'June',
                7: 'July', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
df_month['month_final'] = df_month['month'].map(month_labels)

# Rows stay in value_counts() order (largest count first), which gives the
# funnel its shape.
fig_month = px.funnel(df_month, x='count', y='month_final', title='Best month for releasing Content',
                      height=350, width=600, color_discrete_sequence=['#b20710'])
fig_month.update_xaxes(showgrid=False, ticksuffix=' ', showline=True)
fig_month.update_traces(hovertemplate=None, marker=dict(line=dict(width=0)))
fig_month.update_layout(margin=dict(t=60, b=20, l=70, r=40),
                        xaxis_title=' ', yaxis_title=" ",
                        plot_bgcolor='#333', paper_bgcolor='#333',
                        title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                        font=dict(color='#8a8d93'),
                        hoverlabel=dict(bgcolor="black", font_size=13, font_family="Lato, sans-serif"))
#Matt

In [58]:
# A "country" cell can list several countries separated by commas. Such a
# string is not a country name, so it never matches a location on the map and
# the title is silently left out. Split the lists so every title is counted
# once for each country it names.
country_rows = df[['year_added', 'country']].copy()
country_rows['country'] = country_rows['country'].str.split(',')
country_rows = country_rows.explode('country')
country_rows['country'] = country_rows['country'].str.strip()
# Splitting on a leading or trailing comma leaves an empty name; discard those.
country_rows = country_rows[country_rows['country'] != '']

# Titles per country for each year added: columns year_added / country / counts.
df_country = country_rows.groupby('year_added')['country'].value_counts().reset_index(name='counts')

# Animated world map with one frame per year. The colour scale is capped at
# 200 titles, so larger counts all share the darkest colour.
fig = px.choropleth(df_country, locations="country", color="counts",
                    locationmode='country names',
                    animation_frame='year_added',
                    title='Country Vs Year',
                    range_color=[0,200],
                    color_continuous_scale=px.colors.sequential.OrRd
                   )
fig.show()
#Jack

In [48]:
# A "country" cell can list several countries separated by commas. Such a
# string is not a country name, so it never matches a location on the map.
# Split the lists so every title is counted once for each country it names.
country_rows = df[['year_added', 'country']].copy()
country_rows['country'] = country_rows['country'].str.split(',')
country_rows = country_rows.explode('country')
country_rows['country'] = country_rows['country'].str.strip()
# Splitting on a leading or trailing comma leaves an empty name; discard those.
country_rows = country_rows[country_rows['country'] != '']

# Titles per country for each year added: columns year_added / country / counts.
df_country = country_rows.groupby('year_added')['country'].value_counts().reset_index(name='counts')

# This map has no animation frame, so it needs exactly one value per country.
# Sum the per-year counts; plotting df_country directly would repeat every
# country once per year instead of showing its total.
df_country_total = df_country.groupby('country', as_index=False)['counts'].sum()

# NOTE(review): the colour scale is still capped at 200; with all-year totals
# more countries may reach the cap - adjust range_color if that hides detail.
fig = px.choropleth(df_country_total, locations="country", color="counts",
                    locationmode='country names',
                    title='Country',
                    range_color=[0,200],
                    color_continuous_scale=px.colors.sequential.OrRd
                   )
fig.show()
#This chart shows the number of titles in the dataset listed for each country.
#Jack

In [49]:
# making a copy of df
dff = df.copy()

# One small frame per content type with the number of titles per rating.
# Column layout (kept from the original cell so later code keeps working):
#   df_movie:   'movie'   = rating label, 'rating' = number of movies
#   df_tv_show: 'tv_show' = rating label, 'rating' = number of TV shows
# rename_axis(...).reset_index(name=...) names both columns explicitly, so the
# layout does not depend on the pandas version (pandas 2.0 changed the default
# names produced by value_counts().reset_index()).
df_movie = (
    dff.loc[dff['type'] == 'Movie', 'rating']
    .value_counts()
    .rename_axis('movie')
    .reset_index(name='rating')
)
df_tv_show = (
    dff.loc[dff['type'] == 'TV Show', 'rating']
    .value_counts()
    .rename_axis('tv_show')
    .reset_index(name='rating')
)
# Keep the real (positive) counts for the bar labels ...
df_tv_show['rating_final'] = df_tv_show['rating']
# ... and negate the plotted values so the TV-show bars grow to the left,
# mirroring the movie bars in the right-hand panel.
df_tv_show['rating'] *= -1

# chart: two panels sharing the y axis (rating labels), no gap between them
fig = make_subplots(rows=1, cols=2, specs=[[{}, {}]], shared_yaxes=True, horizontal_spacing=0)
# bar plot for tv shows (add_trace replaces the deprecated append_trace)
fig.add_trace(go.Bar(x=df_tv_show.rating, y=df_tv_show.tv_show, orientation='h', showlegend=True,
                     text=df_tv_show.rating_final, name='TV Show', marker_color='#221f1f'),
              row=1, col=1)
# bar plot for movies
fig.add_trace(go.Bar(x=df_movie.rating, y=df_movie.movie, orientation='h', showlegend=True,
                     text=df_movie.rating, name='Movie', marker_color='#b20710'),
              row=1, col=2)
# styling the chart
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False, categoryorder='total ascending', ticksuffix=' ', showline=False)
fig.update_traces(hovertemplate=None, marker=dict(line=dict(width=0)))
fig.update_layout(title='Which has the highest rating TV shows or Movies?',
                  margin=dict(t=80, b=0, l=70, r=40),
                  hovermode="y unified",
                  xaxis_title=' ', yaxis_title=" ",
                  plot_bgcolor='#333', paper_bgcolor='#333',
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5),
                  hoverlabel=dict(bgcolor="black", font_size=13, font_family="Lato, sans-serif"))
fig.show()
#Jack

In [50]:
# Number of movies per genre, most frequent genre first.
# Column layout (kept from the original cell so later code keeps working):
#   'index' = genre name, 'genre' = number of movies in that genre.
# Both columns are named explicitly because pandas 2.0 changed the default
# names produced by value_counts().reset_index(); without this, the 'index'
# column used below does not exist on newer pandas.
df_m = (
    df.loc[df['type'] == 'Movie', 'genre']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='genre')
)

# Horizontal bars for the five most frequent genres, labelled with the genre name.
fig_bars = px.bar(df_m.head(5), x='genre', y='index', text='index',
                        title='Most preferred Genre for Movies',
                        color_discrete_sequence=['#b20710'])
fig_bars.update_traces(hovertemplate=None, marker=dict(line=dict(width=0)))
fig_bars.update_xaxes(visible=False)
fig_bars.update_yaxes(visible=False, categoryorder='total ascending')
fig_bars.update_layout(height=300,
                  margin=dict(t=100, b=20, l=70, r=40),
                  hovermode="y unified",
                  plot_bgcolor='#333', paper_bgcolor='#333',
                  title_font=dict(size=40, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93', size=13))
#Jack

In [51]:
# Number of TV shows per genre, most frequent genre first.
# Column layout (kept from the original cell so later code keeps working):
#   'index' = genre name, 'genre' = number of TV shows in that genre.
# Both columns are named explicitly because pandas 2.0 changed the default
# names produced by value_counts().reset_index(); without this, the 'index'
# column used below does not exist on newer pandas.
df_tv = (
    df.loc[df['type'] == 'TV Show', 'genre']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='genre')
)

# Horizontal bars for the five most frequent genres, labelled with the genre name.
fig_tv = px.bar(df_tv.head(5), x='genre', y='index', text='index',
                     title='Most preferred Genre for TV Shows',
                     color_discrete_sequence=['#b20710'])
fig_tv.update_traces(hovertemplate=None, marker=dict(line=dict(width=0)))
fig_tv.update_xaxes(visible=False)
fig_tv.update_yaxes(visible=False, categoryorder='total ascending')
fig_tv.update_layout(height=300,
                  margin=dict(t=100, b=20, l=70, r=40),
                  hovermode="y unified",
                  plot_bgcolor='#333', paper_bgcolor='#333',
                  title_font=dict(size=40, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93', size=13))

fig_tv.show()
#Jack

In [52]:
# Movies added per year, drawn as a waterfall chart.
d2 = df[df["type"] == "Movie"]
col = "year_added"

# Columns: year_added / count / percent, sorted by year. Both value_counts()
# columns are named explicitly so the layout is the same on every pandas
# version (pandas 2.0 changed the default names).
vc2 = d2[col].value_counts().rename_axis(col).reset_index(name="count")
vc2['percent'] = 100 * vc2['count'] / vc2['count'].sum()
vc2 = vc2.sort_values(col)

# Build the chart inputs from vc2 instead of hard-coded literals so the figure
# always reflects the data that was actually loaded.
movie_years = vc2[col].astype(int).astype(str).tolist()
movie_counts = vc2['count'].tolist()
# Same sign convention as the original literals: a year's count is plotted
# as-is when it did not fall below the previous year's count (red,
# "increasing") and negated when it did (orange, "decreasing").
movie_steps = [
    count if position == 0 or count >= movie_counts[position - 1] else -count
    for position, count in enumerate(movie_counts)
]

fig2 = go.Figure(go.Waterfall(
    name = "Movie", orientation = "v",
    x = movie_years,
    textposition = "auto",
    text = [str(count) for count in movie_counts],
    y = movie_steps,
    connector = {"line":{"color":"#b20710"}},
    increasing = {"marker":{"color":"#b20710"}},
    decreasing = {"marker":{"color":"orange"}},

))
fig2.update_xaxes(showgrid=False)
fig2.update_yaxes(showgrid=False, visible=False)
fig2.update_traces(hovertemplate=None)
fig2.update_layout(title='Produced Movies over time', height=350,
                   margin=dict(t=80, b=20, l=50, r=50),
                   hovermode="x unified",
                   xaxis_title=' ', yaxis_title=" ",
                   plot_bgcolor='#333', paper_bgcolor='#333',
                   title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                   font=dict(color='#8a8d93'))
#Jack