# Data Analysis and Visualizations

## Import Packages and Read in Database as Pandas DataFrame
We take the cleaned database and read it as a dataframe by merging our three tables together on the link variable. We also expand on columns which originally contained lists by seperating the strings by our delimiter "%%%".

In [1]:
import os
import sqlite3
import pandas as pd

In [2]:
DATA_FOLDER = os.path.join('../data/clean/')
conn = sqlite3.connect(os.path.join(DATA_FOLDER, 'moviedatabase.db'))

In [3]:
top_df = (
    pd.read_sql('SELECT * FROM movies', conn)
        .merge(pd.read_sql('SELECT * FROM technicalities', conn), 
                left_on='link', 
                right_on='link', 
                how='left')
        .merge(pd.read_sql('SELECT * FROM post_release', conn), 
                left_on='link', 
                right_on='link', 
                how='left')
)

In [4]:
def crypted_to_list(list_value):
    return list_value.split("%%%")[1:]

In [5]:
columns_to_convert = ['Aspect Ratio', 'Director', 'Distributor', 'Genre', 
                      'Producer', 'Production Co', 'Rating', 'Screenwriter', 'Sound Mix']
for column in columns_to_convert:
    top_df[column] = top_df[column].apply(lambda x: crypted_to_list(x) if isinstance(x, str) else [])

In [6]:
top_df.head()

Unnamed: 0,link,title,Director,Genre,language,date-streaming,date-theater,Runtime,Rating,Aspect Ratio,Distributor,Sound Mix,Producer,Production Co,Screenwriter,rank,revenue,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,[Curtis Hanson],"[Crime, Drama]",English,"Dec 12, 2015",1997-09-19,136,[R],"[35mm, Scope (2.35:1)]","[Warner Home Vídeo, Warner Bros.]","[Surround, DTS, Dolby Digital]","[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]","[Curtis Hanson, James Ellroy, Brian Helgeland]",1,64600000.0,94,99
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,[Francis Ford Coppola],"[Crime, Drama]",English,"Aug 1, 2013",1972-03-15,177,[R],[Flat (1.85:1)],[Paramount Pictures],[Mono],[Albert S. Ruddy],[Paramount Pictures],"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",2,134800000.0,98,97
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,[Michael Curtiz],[Drama],English,"Aug 15, 2008",1943-01-23,102,[PG],[Flat (1.37:1)],[Warner Bros. Pictures],[Mono],[Hal B. Wallis],[Warner Brothers],"[Murray Burnett, Joan Alison, Julius J. Epstei...",3,,95,99
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,[Akira Kurosawa],[Action],Japanese,"Nov 29, 2011",1956-11-19,208,[],[Flat (1.37:1)],[Columbia Pictures],[Mono],[Sojiro Motoki],[Toho Company],"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",4,192900.0,97,100
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,[Bong Joon Ho],"[Comedy, Mystery & Thriller, Drama]",Korean,"Oct 11, 2019",2019-11-01,132,"[R , Sexual Content, Language, Some Violence]",[Scope (2.35:1)],[Neon],"[Dolby Atmos, Dolby Digital]","[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],"[Bong Joon Ho, Han Jinwon]",5,53400000.0,90,99


## Early Data Analysis
Play around and see any interesting trends or pieces of information

In [7]:
top_df.explode('Director')['Director'].value_counts()

Alfred Hitchcock    7
Akira Kurosawa      6
Billy Wilder        6
Steven Spielberg    5
Martin Scorsese     4
                   ..
Guy Hamilton        1
George Lucas        1
John Crowley        1
Jon Watts           1
Robert Wiene        1
Name: Director, Length: 228, dtype: int64

In [8]:
top_df.explode('Screenwriter')['Screenwriter'].value_counts()

Andrew Stanton        8
Pete Docter           6
Akira Kurosawa        6
Billy Wilder          6
Charlie Chaplin       4
                     ..
Guillermo del Toro    1
Robert Shaw           1
Howard Sackler        1
Carl Gottlieb         1
Hans Janowitz         1
Name: Screenwriter, Length: 528, dtype: int64

In [9]:
top_df.explode('Genre')['Genre'].value_counts()

Drama                 159
Comedy                 88
Adventure              61
Mystery & Thriller     51
Fantasy                41
Action                 38
Crime                  34
Kids & Family          29
Romance                27
Animation              26
History                22
Sci-Fi                 22
Horror                 15
War                    13
Biography              13
Musical                10
Western                 8
Holiday                 5
LGBTQ+                  5
Music                   4
Anime                   3
Sports                  1
Documentary             1
Name: Genre, dtype: int64

In [10]:
top_df.explode('Genre').groupby('Genre')['revenue'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Action,30.0,242369800.0,240015400.0,123.0,59600000.0,188650000.0,309225000.0,858400000.0
Adventure,46.0,234194000.0,215360000.0,123.0,33075000.0,218750000.0,317600000.0,858400000.0
Animation,21.0,176336400.0,134441400.0,494000.0,48000000.0,206400000.0,245900000.0,434000000.0
Anime,0.0,,,,,,,
Biography,13.0,69061050.0,90493910.0,6400.0,7900000.0,38900000.0,96900000.0,329700000.0
Comedy,57.0,106799400.0,153498500.0,17800.0,1800000.0,38400000.0,185500000.0,814100000.0
Crime,17.0,59304160.0,56860520.0,29600.0,7600000.0,42600000.0,124100000.0,164400000.0
Documentary,0.0,,,,,,,
Drama,100.0,47280930.0,63086110.0,6400.0,849700.0,20400000.0,62425000.0,329700000.0
Fantasy,36.0,224422300.0,230116700.0,123.0,5050000.0,218200000.0,335925000.0,858400000.0


In [11]:
top_df['critics_score'].describe()

count    300.000000
mean      96.546667
std        2.306271
min       89.000000
25%       95.000000
50%       97.000000
75%       98.000000
max      100.000000
Name: critics_score, dtype: float64

In [12]:
top_df['audience_score'].describe()

count    300.000000
mean      91.066667
std        3.711442
min       79.000000
25%       89.000000
50%       92.000000
75%       94.000000
max       99.000000
Name: audience_score, dtype: float64

In [13]:
# critics score more homogenous...

In [14]:
# wht types of movies are long?
top_df['Runtime'].describe()

count    300.000000
mean     117.700000
std       27.236914
min       60.000000
25%       97.750000
50%      113.000000
75%      131.000000
max      250.000000
Name: Runtime, dtype: float64

In [15]:
# For plotting purposes
import numpy as np
from lets_plot import *
LetsPlot.setup_html()

In [16]:
# This configures what shows up when you hover your mouse over the plot.
tooltip_setup = (
    layer_tooltips()
        .line('@category')
        .line('[@Q1 -- @median -- @Q3]')
        .format('@Q1', '£ {.2f}')
        .format('@median', '£ {.2f}')
        .format('@Q3', '£ {.2f}')
)

g = (
    # Maps the columns to the aesthetics of the plot.
    ggplot(plot_df, aes(y='category', x='median', xmin='Q1', xmax='Q3', fill='category')) +

    # GEOMS

    # Add a line range that 'listens to' columns informed in `ymin` and `ymax` aesthetics
    geom_linerange(size=1, alpha=0.75, tooltips=tooltip_setup) +

    # Add points to the plot (listen to `x` and `y` and fill aesthetics)
    geom_point(size=3, stroke=1, shape=21, tooltips=tooltip_setup) +

    # SCALES

    # Remove the legend (we can already read the categories from the y-axis)
    scale_fill_discrete(guide='none') +

    # Specify names for the axes
    scale_y_continuous(name="Discount Rate Categories\n(from smallest to largest rates)", expand=[0.05, 0.05]) +
    scale_x_log10(name="Price (£)", expand=[0., 0.05], format='£ {.2f}', breaks=np.arange(0, 20, 2.5)) +

    # LABELS
    labs(title='Products with lower prices tend to get bigger discounts',
         subtitle="The highest discounted products are the products with lower median prices.") +
    theme(axis_text_x=element_text(size=15),
        axis_text_y=element_text(size=17),
        axis_title_x=element_text(size=20),
        axis_title_y=element_text(size=20),
        plot_title=element_text(size=19, face='bold'),
        plot_subtitle=element_text(size=18),
        legend_position='none') +
    ggsize(1000, 500)

)

g


NameError: name 'plot_df' is not defined

In [None]:
scatter_plot = (
    ggplot(top_df, aes(x='audience_score', y='critics_score')) +
    geom_point() +
    scale_x_continuous(name='Audience Score', limits=[75, 100]) +
    scale_y_continuous(name='Critics Score', limits=[75, 100]) +
    labs(title='Scatter Plot of Audience vs Critics Score',
         x='Audience Score',
         y='Critics Score') +
    theme(
        axis_text_x=element_text(size=12),
        axis_text_y=element_text(size=12),
        axis_title_x=element_text(size=14),
        axis_title_y=element_text(size=14),
        plot_title=element_text(size=16, face='bold')
        
    ) + geom_jitter()
)

scatter_plot