# Data Analysis & Data Presentation (Movies Dataset)

## Data Import and first Inspection

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.options.display.max_columns=30
pd.options.display.float_format='{:.2F}'.format

In [None]:
df=pd.read_csv('movies_complete.csv',parse_dates=['release_date'])


In [None]:
df

In [None]:
df.info()

In [None]:
df.genres[1]

In [None]:
df.cast[1]

In [None]:
df.describe()

In [None]:
df.hist(figsize=(20,12),bins=100)
plt.show()

In [None]:
df.budget_musd.value_counts(dropna=False).head(20)  # dropna=False also counts missing values (NaN) as their own entry

In [None]:
df.revenue_musd.value_counts(dropna=False).head(20)

In [None]:
df.vote_average.value_counts(dropna=False).head(20)

In [None]:
df.vote_count.value_counts(dropna=False)

In [None]:
df.describe(include='object')

In [None]:
df[df.title=="Cinderella"]

2. __Filter__ the Dataset and __find the best/worst n Movies__ with the

- __Highest Revenue__
- __Highest Budget__
- __Highest Profit (=Revenue - Budget)__
- __Lowest Profit (=Revenue - Budget)__
- __Highest Return on Investment (=Revenue / Budget) (only movies with Budget >= 10)__
- __Lowest Return on Investment (=Revenue / Budget) (only movies with Budget >= 10)__
- __Highest number of Votes__
- __Highest Rating (only movies with 10 or more Ratings)__
- __Lowest Rating (only movies with 10 or more Ratings)__
- __Highest Popularity__

__Define__ an appropriate __user-defined function__ to reuse code.

In [None]:
from IPython.display import HTML

In [None]:
df_best=df[["poster_path","title","budget_musd","revenue_musd","vote_count","vote_average","popularity"]].copy()

In [None]:
df_best

In [None]:
# Derive two metrics per movie: profit (revenue minus budget) and return (revenue divided by budget).
df_best["profit_musd"]=df_best["revenue_musd"]-df_best["budget_musd"]
df_best["return"]=df_best["revenue_musd"]/df_best["budget_musd"]

In [None]:
df_best

In [None]:
# Relabel the columns for presentation. Renaming by label (instead of overwriting
# df_best.columns with a positional list) cannot mislabel a column if the column
# order changes, and re-running the cell is harmless because labels that are no
# longer present are simply ignored.
df_best=df_best.rename(columns={
    "poster_path":"",
    "title":"Title",
    "budget_musd":"Budget",
    "revenue_musd":"Revenue",
    "vote_count":"Votes",
    "vote_average":"Average Rating",
    "popularity":"Popularity",
    "profit_musd":"Profit",
    "return":"ROI",
})

In [None]:
df_best


In [None]:
df_best.set_index("Title",inplace=True)

In [None]:
df_best

In [None]:
subset=df_best.iloc[:5,:2]
subset

In [None]:
HTML(subset.to_html(escape=False))

In [None]:
df_best.sort_values(by='Average Rating',ascending=False)

In [None]:
df_best.sort_values(by="ROI",ascending=False)

In [None]:
df_best.loc[df_best["Budget"]>=5].sort_values(by="ROI",ascending=False)

In [None]:
# Replace missing budgets with 0 (a comparison such as NaN >= 0 would otherwise be False).
# The result is assigned back to the column: calling fillna(..., inplace=True) on the
# attribute-accessed Series (df_best.Budget) is a chained in-place update that may only
# modify a temporary object and leave df_best unchanged.
df_best["Budget"]=df_best["Budget"].fillna(0)

In [None]:
# Replace missing revenues with 0. Assigned back to the column rather than using
# inplace=True on df_best.Revenue, because that chained in-place update may only
# modify a temporary object and leave df_best unchanged.
df_best["Revenue"]=df_best["Revenue"].fillna(0)

In [None]:
df_best

In [None]:
df_best.info()

In [None]:
def best_worst(n,by,ascending=False,min_bud=0,min_vote=0):
    """Render the first n rows of df_best, ranked by one column, as an HTML table.

    Parameters
    ----------
    n : number of rows to show.
    by : label of the df_best column to rank by; it is displayed next to the "" column.
    ascending : False (default) puts the largest values first, True the smallest.
    min_bud : only rows whose "Budget" is >= this value are considered.
    min_vote : only rows whose "Votes" is >= this value are considered.

    Returns
    -------
    An IPython.display.HTML object. The table is generated with escape=False,
    so any markup stored in the cells is emitted unescaped.
    """
    enough_budget=df_best["Budget"]>=min_bud
    enough_votes=df_best["Votes"]>=min_vote
    candidates=df_best.loc[enough_budget & enough_votes,["",by]]
    ranked=candidates.sort_values(by=by,ascending=ascending)
    top_rows=ranked.head(n).copy()
    return HTML(top_rows.to_html(escape=False))

__Movies Top 5 - Highest Revenue__

In [None]:
best_worst(n=5,by="Revenue")

__Movies Top 5 - Highest Budget__

In [None]:
best_worst(n=5,by="Budget")

__Movies Top 5 - Highest Profit__

In [None]:
best_worst(n=5,by="Profit")

__Movies Top 5 - Lowest Profit__

In [None]:
best_worst(n=5,by="Profit",ascending=True)

__Movies Top 5 - Highest ROI__

In [None]:
best_worst(n=5,by="ROI",min_bud=50)

__Movies Top 5 - Lowest ROI__

In [None]:
best_worst(n=5,by="ROI",ascending=True,min_bud=100)

__Movies Top 5 - Most Votes__

In [None]:
best_worst(n=5,by="Votes")

__Movies Top 5 - Highest Rating__

In [None]:
best_worst(n=5,by="Average Rating",min_vote=50)

__Movies Top 5 - Lowest Rating__

In [None]:
best_worst(n=5,by="Average Rating",ascending=True,min_vote=100)

In [None]:
best_worst(n=5,by="Average Rating",ascending=True,min_vote=20,min_bud=20)

__Movies Top 5 - Most Popular__

In [None]:
best_worst(n=5,by="Popularity")

## Find your next Movie

3. __Filter__ the Dataset for movies that meet the following conditions:

__Search 1: Science Fiction Action Movie with Bruce Willis (sorted from high to low Rating)__

In [None]:
df["genres"][0]

In [None]:
mask_genres=df["genres"].str.contains("Action") & df["genres"].str.contains("Science Fiction")
mask_genres

In [None]:
df["cast"][0]

In [None]:
mask_actor=df["cast"].str.contains("Bruce Willis")
mask_actor

In [None]:
df.loc[mask_actor & mask_genres,["title","vote_average"]].sort_values(by="vote_average",ascending=False)

In [None]:
bruce=df.loc[mask_actor & mask_genres,["title","poster_path","vote_average"]].sort_values(by="vote_average",ascending=False)
bruce

In [None]:
HTML(bruce.to_html(escape=False))

__Search 2: Movies with Uma Thurman and directed by Quentin Tarantino (sorted from short to long runtime)__

In [None]:
df["director"]

In [None]:
mask_director=df["director"]=="Quentin Tarantino"
mask_director

In [None]:
# "cast" stores all actors of a movie in a single string, so look for the name as a
# substring; an equality test would only match a movie whose entire cast string is
# exactly "Uma Thurman". na=False treats movies without cast information as non-matches.
mask_actor=df["cast"].str.contains("Uma Thurman",na=False)
mask_actor

In [None]:
quentin=df.loc[mask_director & mask_actor,["title","poster_path","runtime"]].sort_values(by="runtime").set_index("title")
quentin

In [None]:
HTML(quentin.to_html(escape=False))

__Search 3: Most Successful Pixar Studio Movies between 2010 and 2015 (sorted from high to low Revenue)__

In [None]:
df["production_companies"][1]

In [None]:
# na=False: a movie without production-company information must not count as a Pixar
# movie (filling the missing values with True would let all of those rows pass the filter).
mask_studio=df["production_companies"].str.contains("Pixar",na=False)
mask_studio

In [None]:
df["release_date"]

In [None]:
# "between 2010 and 2015": between() includes both bounds, so the upper bound has to be
# the last day of 2015 for releases from that year to be part of the selection.
mask_date=df["release_date"].between("2010-01-01","2015-12-31")
mask_date

In [None]:
pixer=df.loc[mask_studio & mask_date,["title","poster_path","revenue_musd","release_date"]].sort_values(by="revenue_musd",ascending=False).set_index("title")
pixer

In [None]:
HTML(pixer.to_html(escape=False))

__Search 4: Action or Thriller Movie with original language English and minimum Rating of 7.5 (most recent movies first)__

In [None]:
mask_genre=df["genres"].str.contains("Action") | df["genres"].str.contains("Thriller")
mask_genre

In [None]:
mask_lan=df["original_language"]=="en"
mask_lan

In [None]:
mask_vote_av=df["vote_average"]>=7.5
mask_vote_av

In [None]:
mask_vote_co=df["vote_count"]>=10
mask_vote_co

In [None]:
# Apply all four conditions at once, keep the columns to display, and list the
# 20 most recently released matches with the title as index.
next_mov=(
    df.loc[
        mask_genre & mask_lan & mask_vote_av & mask_vote_co,
        ["title","poster_path","genres","vote_average","vote_count","release_date"],
    ]
    .sort_values(by="release_date",ascending=False)
    .set_index("title")
    .head(20)
)
next_mov

In [None]:
HTML(next_mov.to_html(escape=False))

__What are the most common words in movie titles, overviews and taglines?__

In [None]:
# first install wordcloud in anaconda prompt
# pip install wordcloud
from wordcloud import WordCloud

In [None]:
df["tagline"][1]

In [None]:
df["overview"][1]

In [None]:
title=df.title.dropna() 
overview=df.overview.dropna()
tagline=df.tagline.dropna()

In [None]:
title

In [None]:
' '.join(title)

In [None]:
title_corpus=' '.join(title)
overview_corpus=' '.join(overview)
tagline_corpus=' '.join(tagline)

In [None]:
tagline_corpus

In [None]:
title_wordcloud=WordCloud(background_color="white",height=2000,width=4000,max_words=200).generate(title_corpus)
title_wordcloud

In [None]:
plt.figure(figsize=(16,8))
plt.imshow(title_wordcloud,interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
tagline_wordcloud=WordCloud(background_color='White',height=2000,width=4000).generate(tagline_corpus)
plt.figure(figsize=(16,8))
plt.imshow(tagline_wordcloud,interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
overview_wordcloud=WordCloud(background_color="White",height=2000,width=4000).generate(overview_corpus)
plt.figure(figsize=(16,8))
plt.imshow(overview_wordcloud,interpolation="bilinear")
plt.axis("off")
plt.show()

## Are Franchises more successful?

4. __Analyze__ the Dataset and __find out whether Franchises (Movies that belong to a collection) are more successful than stand-alone movies__ in terms of:

- mean revenue
- median Return on Investment
- mean budget raised
- mean popularity
- mean rating

In [None]:
df["belongs_to_collection"]

In [None]:
df["franchise"]=df["belongs_to_collection"].notna()  #finds whether movie belongs to collection or not

In [None]:
df["franchise"]

In [None]:
df["franchise"].value_counts() 
# False means movies belong to standalone(non-franchise), True means movies belong to franchise

__Franchise vs. Stand-alone: Average Revenue__

In [None]:
df.groupby("franchise").revenue_musd.mean()

__Franchise vs. Stand-alone: Return on Investment / Profitability (median)__

In [None]:
df["ROI"]=df.revenue_musd.div(df.budget_musd)

In [None]:
df.groupby("franchise").ROI.median()

__Franchise vs. Stand-alone: Average Budget__

In [None]:
df.groupby("franchise").budget_musd.mean()

__Franchise vs. Stand-alone: Average Popularity__

In [None]:
df.groupby("franchise").popularity.mean()

__Franchise vs. Stand-alone: Average Rating__

In [None]:
df.groupby("franchise").vote_average.mean()

In [None]:
df.groupby("franchise").agg({"budget_musd":"mean","revenue_musd":"mean","vote_average":"mean","popularity":"mean","ROI":"median","vote_count":"mean"})

## Most Successful Franchises

5. __Find__ the __most successful Franchises__ in terms of

- __total number of movies__
- __total & mean budget__
- __total & mean revenue__
- __mean rating__

In [None]:
df["belongs_to_collection"]

In [None]:
df["belongs_to_collection"].value_counts() #counts the no of movies in a collection

In [None]:
# One row per collection. Each column of the result is addressed by a
# (source column, aggregation) pair, e.g. ("budget_musd","sum").
franchises=df.groupby("belongs_to_collection").agg(
    {
        "title":"count",
        "budget_musd":["sum","mean"],
        "revenue_musd":["sum","mean"],
        "vote_average":"mean",
        "popularity":"mean",
        "ROI":"median",
        "vote_count":"mean",
    }
)


In [None]:
franchises

In [None]:
franchises.nlargest(20,("title","count"))  #finds collection which has title count value in top 20 


In [None]:
franchises.nlargest(20,("revenue_musd","sum"))

In [None]:
franchises.nlargest(20,("revenue_musd","mean"))

In [None]:
franchises.nlargest(20,("budget_musd","sum"))

In [None]:
franchises.nlargest(20,("budget_musd","mean"))

In [None]:
franchises[franchises[("vote_count","mean")]>=1000].nlargest(20,("vote_average","mean"))

## Most Successful Directors

6. __Find__ the __most successful Directors__ in terms of

- __total number of movies__
- __total revenue__
- __mean rating__

In [None]:
df["director"]

In [None]:
df.director.value_counts().head(20)  #total no of movies for each and every director 

In [None]:
plt.figure(figsize=(12,8))
df.director.value_counts().head(20).plot(kind="bar",fontsize=15)
plt.title("Most Active Directors",fontsize=20)
plt.ylabel("No of movies",fontsize=15)
plt.show()

In [None]:
df.groupby("director").revenue_musd.sum().nlargest(20) #group director and find revenue for each director grouped

In [None]:
plt.figure(figsize=(12,8))
df.groupby("director").revenue_musd.sum().nlargest(20).plot(kind="bar",fontsize=20)
plt.title("Total Revenue",fontsize=20)
plt.ylabel("Revenue (in musd)",fontsize=20)
plt.show()

In [None]:
dire=df.groupby("director").agg({"title":"count","vote_average":"mean","vote_count":"sum"})

In [None]:
dire

In [None]:
dire[(dire.vote_count>=10000) & (dire.title>=10)].nlargest(20,"vote_average")

In [None]:
df["genres"]=df["genres"].astype(str) #converts all elements including NaN to string

In [None]:
df.loc[df.genres.str.contains("Horror")] #filter all horror movies

In [None]:
# Find the most successful directors within one genre (here: Horror):
# keep only the horror movies, group them by director and sum the revenue per director;
# nlargest(20) then returns the 20 directors with the highest totals.
df.loc[df.genres.str.contains("Horror")].groupby("director").revenue_musd.sum().nlargest(20)

## Most successful actors

In [None]:
df.cast

In [None]:
# Use the movie id as the index of df. The step is guarded and reassigned (instead of
# inplace=True) so that running this cell a second time is a no-op rather than a
# KeyError for the "id" column that no longer exists.
if df.index.name!="id":
    df=df.set_index("id")

In [None]:
df.info()

In [None]:
df.cast.str.split("|")

In [None]:
# expand=True returns a DataFrame in which every piece of the split cast string
# (one actor name per piece) is placed in its own column
act=df.cast.str.split("|",expand=True)
act
# NOTE(review): the original note says one movie has 312 actors and therefore 313 columns - verify against the displayed shape

In [None]:
# stack() moves the column labels into the index
act.stack()  # the result has a MultiIndex: the movie id plus the former column label of each actor

In [None]:
# reset_index(level=1,drop=True) discards the inner index level (position 1) created by stack();
# the movie id at level position 0 stays as the index
act.stack().reset_index(level=1,drop=True)

In [None]:
# to_frame() converts the Series into a one-column DataFrame
act.stack().reset_index(level=1,drop=True).to_frame()

In [None]:
act=act.stack().reset_index(level=1,drop=True).to_frame()
act  #on average, each and every movie has more than 10 actors

In [None]:
#rename the column label from 0 to actor
act.columns=["Actor"]

In [None]:
act

In [None]:
# Left join on the index (similar to VLOOKUP in Excel): the movie id is the index of both
# act and df, hence left_index=True and right_index=True. how="left" keeps every row of act
# and attaches the title, revenue, rating and popularity of the corresponding movie.
act=act.merge(df[["title","revenue_musd","vote_average","popularity"]],how="left",left_index=True,right_index=True)

In [None]:
act
#for each and every actor, we have one row belonging to that actor and movie id 
# previously all actors were places in single row of that movie id 

## Most successful actors

In [None]:
act

In [None]:
act.Actor.nunique()

In [None]:
act.Actor.unique()  #create an array of all unique actors

In [None]:
# most active actors
act.Actor.value_counts().head(20)

In [None]:
plt.figure(figsize=(12,6))
act.Actor.value_counts().head(20).plot(kind="bar",fontsize=15)
plt.title("Most active actors",fontsize=20)
plt.ylabel("Number of movies",fontsize=15)
plt.show()

In [None]:
# Per-actor summary: each keyword names an output column and maps it to a
# (source column, aggregation) pair.
agg=act.groupby("Actor").agg(
    Total_Revenue=("revenue_musd","sum"),
    Mean_Revenue=("revenue_musd","mean"),
    Mean_Rating=("vote_average","mean"),
    Mean_Pop=("popularity","mean"),
    Total_Movies=("Actor","count"),
)

In [None]:
agg

In [None]:
agg.nlargest(10,"Total_Movies")  #10 most active actors

In [None]:
#actors with highest total revenue 
agg.nlargest(10,"Total_Revenue")

In [None]:
plt.figure(figsize=(12,8))
agg["Total_Revenue"].nlargest(10).plot(kind="bar",fontsize=15)
plt.title("Total Revenue",fontsize=20)
plt.ylabel("Revenue(in MUSD)",fontsize=15)
plt.show()

In [None]:
agg["Mean_Revenue"].nlargest(20)

In [None]:
act[act["Actor"]=="Ashley Jeffery"]

In [None]:
# keep only actors who appeared in at least 10 movies, then rank them by mean revenue
agg[agg["Total_Movies"]>=10].nlargest(10,"Mean_Revenue")

In [None]:
agg[agg["Total_Movies"]>=10].nlargest(10,"Mean_Rating")

In [None]:
agg[agg["Total_Movies"]>=10].nlargest(10,"Mean_Pop")