In [None]:
#I import the libraries that I am going to use
import os

import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns

In [None]:
#I define the function to connect to my local movieDatabase server
def connectDF():
    """Open a new psycopg2 connection to the movie database.

    Connection settings are read from the PGHOST, PGDATABASE, PGUSER and
    PGPASSWORD environment variables. Any variable that is not set falls
    back to the original local value, so calling this function without any
    environment configuration connects exactly as before.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller
        is responsible for closing it.
    """
    return psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        database=os.environ.get("PGDATABASE", "movieDatabase"),
        user=os.environ.get("PGUSER", "postgres"),
        # NOTE(review): the fallback password is kept only for backward
        # compatibility; prefer setting PGPASSWORD so that no secret has to
        # live in the notebook.
        password=os.environ.get("PGPASSWORD", "cat"),
    )

In [None]:
#I establish the connection with my server by calling connectDF(); the resulting `conn` is passed to the pd.read_sql calls in the cells below
conn = connectDF()
#This line is only reached when connectDF() returned without raising an exception
print("Connection success.")

In [None]:
#Create a DataFrame from the box office revenue (in millions) for the actor: 'Leonardo DiCaprio'
#Dividing by 1000000.0 (instead of 1000000) prevents PostgreSQL from truncating to whole millions should total_box_office be an integer column.
#Rows with a box office of 0 are skipped, matching the filter that the 'other actors' query applies, so both groups are compared on the same basis.
#The actor name is passed as a bound parameter (psycopg2 uses the %(name)s style) instead of being written into the SQL text.
df1 = pd.read_sql('''
    SELECT s.total_box_office / 1000000.0 AS box_office, aid.actors
    FROM sales AS s
    INNER JOIN movie_id AS mid ON s.newurl = mid.movie_name
    INNER JOIN has_actor AS ha ON mid.movie_id = ha.movie_id
    INNER JOIN actor_id AS aid ON ha.actor_id = aid.actor_id
    WHERE aid.actors = %(actor)s
      AND s.total_box_office != 0
    ''', conn, params={'actor': 'Leonardo DiCaprio'})

In [None]:
#Create a DataFrame from the box office revenue (in millions) for the actor: 'Tom Cruise'
#Dividing by 1000000.0 (instead of 1000000) prevents PostgreSQL from truncating to whole millions should total_box_office be an integer column.
#Rows with a box office of 0 are skipped, matching the filter that the 'other actors' query applies, so both groups are compared on the same basis.
#The actor name is passed as a bound parameter (psycopg2 uses the %(name)s style) instead of being written into the SQL text.
df2 = pd.read_sql('''
    SELECT s.total_box_office / 1000000.0 AS box_office, aid.actors
    FROM sales AS s
    INNER JOIN movie_id AS mid ON s.newurl = mid.movie_name
    INNER JOIN has_actor AS ha ON mid.movie_id = ha.movie_id
    INNER JOIN actor_id AS aid ON ha.actor_id = aid.actor_id
    WHERE aid.actors = %(actor)s
      AND s.total_box_office != 0
    ''', conn, params={'actor': 'Tom Cruise'})

In [None]:
#Create a DataFrame from the box office revenue (in millions) for the actor: 'Scarlett Johansson'
#Dividing by 1000000.0 (instead of 1000000) prevents PostgreSQL from truncating to whole millions should total_box_office be an integer column.
#Rows with a box office of 0 are skipped, matching the filter that the 'other actors' query applies, so both groups are compared on the same basis.
#The actor name is passed as a bound parameter (psycopg2 uses the %(name)s style) instead of being written into the SQL text.
df3 = pd.read_sql('''
    SELECT s.total_box_office / 1000000.0 AS box_office, aid.actors
    FROM sales AS s
    INNER JOIN movie_id AS mid ON s.newurl = mid.movie_name
    INNER JOIN has_actor AS ha ON mid.movie_id = ha.movie_id
    INNER JOIN actor_id AS aid ON ha.actor_id = aid.actor_id
    WHERE aid.actors = %(actor)s
      AND s.total_box_office != 0
    ''', conn, params={'actor': 'Scarlett Johansson'})

In [None]:
#Create a DataFrame from the box office revenue (in millions) for the actor: 'Dwayne Johnson'
#Dividing by 1000000.0 (instead of 1000000) prevents PostgreSQL from truncating to whole millions should total_box_office be an integer column.
#Rows with a box office of 0 are skipped, matching the filter that the 'other actors' query applies, so both groups are compared on the same basis.
#The actor name is passed as a bound parameter (psycopg2 uses the %(name)s style) instead of being written into the SQL text.
df4 = pd.read_sql('''
    SELECT s.total_box_office / 1000000.0 AS box_office, aid.actors
    FROM sales AS s
    INNER JOIN movie_id AS mid ON s.newurl = mid.movie_name
    INNER JOIN has_actor AS ha ON mid.movie_id = ha.movie_id
    INNER JOIN actor_id AS aid ON ha.actor_id = aid.actor_id
    WHERE aid.actors = %(actor)s
      AND s.total_box_office != 0
    ''', conn, params={'actor': 'Dwayne Johnson'})

In [None]:
#Create a DataFrame from the box office revenue (in millions) for the actor: 'Jennifer Lawrence'
#Dividing by 1000000.0 (instead of 1000000) prevents PostgreSQL from truncating to whole millions should total_box_office be an integer column.
#Rows with a box office of 0 are skipped, matching the filter that the 'other actors' query applies, so both groups are compared on the same basis.
#The actor name is passed as a bound parameter (psycopg2 uses the %(name)s style) instead of being written into the SQL text.
df5 = pd.read_sql('''
    SELECT s.total_box_office / 1000000.0 AS box_office, aid.actors
    FROM sales AS s
    INNER JOIN movie_id AS mid ON s.newurl = mid.movie_name
    INNER JOIN has_actor AS ha ON mid.movie_id = ha.movie_id
    INNER JOIN actor_id AS aid ON ha.actor_id = aid.actor_id
    WHERE aid.actors = %(actor)s
      AND s.total_box_office != 0
    ''', conn, params={'actor': 'Jennifer Lawrence'})

In [None]:
#Create a DataFrame with one box office value (in millions) per movie for all other actors combined, excluding the actors that already have their own DataFrame
#Dividing by 1000000.0 (instead of 1000000) prevents PostgreSQL from truncating each value to whole millions before averaging, should total_box_office be an integer column.
#NOTE(review): the NOT IN filter works on cast rows, so a movie that features one of the five named actors still ends up in this group as long as it has at least one other cast member - confirm that this is intended.
df6 = pd.read_sql('''
    SELECT AVG(s.total_box_office / 1000000.0) AS box_office, 'other actors' AS actors
    FROM sales AS s
    INNER JOIN movie_id AS mid ON s.newurl = mid.movie_name
    INNER JOIN has_actor AS ha ON mid.movie_id = ha.movie_id
    INNER JOIN actor_id AS aid ON ha.actor_id = aid.actor_id
    WHERE aid.actors NOT IN ('Leonardo DiCaprio', 'Tom Cruise', 'Scarlett Johansson', 'Dwayne Johnson', 'Jennifer Lawrence')
      AND s.total_box_office != 0
    GROUP BY mid.movie_id
    ''', conn)

#This is the last data I needed from the postgres server so I close the connection.
conn.close()
print("Connection closed.")

In [None]:
#I merge all the DataFrames of each actor into one combined DataFrame
#ignore_index=True builds a fresh 0..n-1 index; without it every source frame keeps its own row labels and the combined frame would contain duplicate index labels.
cdf = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)

In [None]:
#One boxplot per group in the 'actors' column (each named actor plus 'other actors'), with the box office on the Y-axis
plt.figure(figsize=(12, 6))
sns.boxplot(data=cdf, x="actors", y="box_office")
#Title and axis labels are set after the plot is drawn so they replace the default column-name labels
plt.title('Hypothesis 6: The star power of featuring actors has a positive influence on the box office revenue of a movie.')
plt.ylabel('Box office revenue in millions')
plt.xlabel('Actors')
#Rotate the actor names so they do not overlap
plt.xticks(rotation=45)
plt.show()

In [None]:
#The many outliers in the chart above make the boxes very hard to read, so this version draws the same boxplots with the outlier markers hidden.
#showfliers=False only affects what is drawn; the data in cdf is not changed.
plt.figure(figsize=(12, 6))
sns.boxplot(data=cdf, x="actors", y="box_office", showfliers=False)
#Title and axis labels are set after the plot is drawn so they replace the default column-name labels
plt.title('Hypothesis 6: The star power of featuring actors has a positive influence on the box office revenue of a movie.')
plt.ylabel('Box office revenue in millions')
plt.xlabel('Actors')
#Rotate the actor names so they do not overlap
plt.xticks(rotation=45)
plt.show()