In [1]:
import os
import sqlite3
import pandas as pd

In [2]:
# Folder holding the cleaned data, and the SQLite database stored inside it.
DATA_FOLDER = '../data/clean/'
DB_PATH = os.path.join(DATA_FOLDER, 'moviedatabase.db')

# sqlite3.connect() creates a brand-new, empty database when the file is
# missing, which would only surface later as a confusing "no such table"
# error. Fail here instead, with the resolved path in the message.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(f"SQLite database not found: {os.path.abspath(DB_PATH)}")

conn = sqlite3.connect(DB_PATH)

In [3]:
# Build one wide frame: every row of `movies`, enriched with the matching
# `technicalities` and `post_release` rows. Both joins are left joins on the
# shared `link` column, so movies without a match are kept.
top_df = (
    pd.read_sql('SELECT * FROM movies', conn)
    .merge(
        pd.read_sql('SELECT * FROM technicalities', conn),
        how='left',
        on='link',
    )
    .merge(
        pd.read_sql('SELECT * FROM post_release', conn),
        how='left',
        on='link',
    )
)

In [4]:
def crypted_to_list(list_value):
    """Split a ``%%%``-delimited string into a list of its items.

    The text before the first ``%%%`` is always discarded, so a string that
    contains no delimiter at all yields an empty list.

    Example: ``"%%%Crime%%%Drama"`` -> ``["Crime", "Drama"]``.
    """
    # str.split always returns at least one token, so the unpacking is safe.
    _leading, *items = list_value.split("%%%")
    return items

In [5]:
# Columns whose values are stored as "%%%"-delimited strings and need to be
# turned into Python lists.
columns_to_convert = ['Aspect Ratio', 'Director', 'Distributor', 'Genre', 
                      'Producer', 'Production Co', 'Rating', 'Screenwriter', 'Sound Mix']


def _decode_list_cell(value):
    """Return `value` as a list: decode strings, keep lists, map anything else to []."""
    # Already decoded (the cell was run before): keep it, so that re-running
    # the cell does not replace every list with an empty one.
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return crypted_to_list(value)
    # Missing values (None / NaN) become an empty list.
    return []


for column in columns_to_convert:
    top_df[column] = top_df[column].apply(_decode_list_cell)

In [6]:
# Preview the first rows of the merged frame (after the list-column conversion).
top_df.head()

Unnamed: 0,link,title,Director,Genre,language,date-streaming,date-theater,Runtime,Rating,Aspect Ratio,Distributor,Sound Mix,Producer,Production Co,Screenwriter,rank,revenue,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,[Curtis Hanson],"[Crime, Drama]",English,"Dec 12, 2015",1997-09-19,136,[R],"[35mm, Scope (2.35:1)]","[Warner Home Vídeo, Warner Bros.]","[Surround, DTS, Dolby Digital]","[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]","[Curtis Hanson, James Ellroy, Brian Helgeland]",1,64600000.0,94.0,99.0
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,[Francis Ford Coppola],"[Crime, Drama]",English,"Aug 1, 2013",1972-03-15,177,[R],[Flat (1.85:1)],[Paramount Pictures],[Mono],[Albert S. Ruddy],[Paramount Pictures],"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",2,134800000.0,98.0,97.0
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,[Michael Curtiz],[Drama],English,"Aug 15, 2008",1943-01-23,102,[PG],[Flat (1.37:1)],[Warner Bros. Pictures],[Mono],[Hal B. Wallis],[Warner Brothers],"[Murray Burnett, Joan Alison, Julius J. Epstei...",3,,95.0,99.0
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,[Akira Kurosawa],[Action],Japanese,"Nov 29, 2011",1956-11-19,208,[],[Flat (1.37:1)],[Columbia Pictures],[Mono],[Sojiro Motoki],[Toho Company],"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",4,192900.0,97.0,100.0
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,[Bong Joon Ho],"[Comedy, Mystery & Thriller, Drama]",Korean,"Oct 11, 2019",2019-11-01,132,"[R , Sexual Content, Language, Some Violence]",[Scope (2.35:1)],[Neon],"[Dolby Atmos, Dolby Digital]","[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],"[Bong Joon Ho, Han Jinwon]",5,53400000.0,90.0,99.0


In [218]:
# Plotting dependencies.
# The star import is what brings ggplot, aes, the geom_* / scale_* functions,
# theme, etc. into scope for the plotting cells below.
import numpy as np
from lets_plot import *
# Set up lets-plot's HTML output so plots are displayed in the notebook.
LetsPlot.setup_html()

In [219]:

# Revenue statistics per language. Movies without a revenue figure are
# excluded before grouping.
summary_stats = (
    top_df.loc[top_df['revenue'].notna()]
    .groupby('language')['revenue']
    .describe(percentiles=[.25, .5, .75])
)

# Plot-ready copy: quartile columns get readable names, and the language is
# repeated as a regular `category` column (the index still holds it as well).
plot_df = summary_stats.rename(columns={'25%': 'Q1', '50%': 'median', '75%': 'Q3'})
plot_df.insert(0, 'category', summary_stats.index)

# Force the statistics to float and order the languages by median revenue.
numeric_cols = ['mean', 'std', 'min', 'Q1', 'median', 'Q3', 'max']
plot_df = (
    plot_df
    .astype({col: float for col in numeric_cols})
    .sort_values(by='median')
)

# Text dump of the table for a quick check.
print(plot_df)

                                  category  count          mean           std  \
language                                                                        
Arabic                              Arabic    1.0  5.590000e+04           NaN   
Canadian French            Canadian French    6.0  1.411833e+05  1.885353e+05   
French (France)            French (France)   10.0  5.994700e+05  1.161195e+06   
German                              German    3.0  4.501333e+05  6.510026e+05   
Russian                            Russian    2.0  1.547000e+05  1.466539e+05   
Japanese                          Japanese    5.0  2.185240e+06  4.370933e+06   
Italian                            Italian    4.0  3.526250e+05  3.314485e+05   
Swedish                            Swedish    1.0  2.100000e+06           NaN   
Australian English      Australian English    1.0  5.200000e+06           NaN   
Brazilian Portuguese  Brazilian Portuguese    1.0  7.600000e+06           NaN   
British English            B

In [220]:
# Hover tooltip: the language, then [Q1 -- median -- Q3] of its revenue.
# Numbers are shown with thousands separators and no currency symbol, because
# the table does not record a currency for `revenue`
# (presumably USD -- TODO confirm against the data source).
tooltip_setup = (
    layer_tooltips()
        .line('@category')
        .line('[@Q1 -- @median -- @Q3]')
        .format('@Q1', '{,.0f}')
        .format('@median', '{,.0f}')
        .format('@Q3', '{,.0f}')
)

g = (
    # One row of `plot_df` per language: the point sits at the median revenue
    # and the horizontal bar spans Q1..Q3.
    ggplot(plot_df, aes(y='category', x='median', xmin='Q1', xmax='Q3', fill='category')) +

    # GEOMS

    # Horizontal range driven by the `xmin` / `xmax` aesthetics
    geom_linerange(size=1, alpha=0.75, tooltips=tooltip_setup) +

    # Median marker (uses the `x`, `y` and `fill` aesthetics)
    geom_point(size=3, stroke=1, shape=21, tooltips=tooltip_setup) +

    # SCALES

    # No legend: the languages can already be read from the y-axis
    scale_fill_discrete(guide='none') +

    # `category` holds language names, so the y-axis must be discrete
    scale_y_discrete(name="Language") +

    # Revenue spans several orders of magnitude, hence the log axis. Breaks are
    # left to lets-plot; labels use SI prefixes (k, M, ...) to stay short.
    scale_x_log10(name="Revenue (log scale)", expand=[0., 0.05], format='{.2s}') +

    # LABELS
    labs(title='Revenue by language',
         subtitle="Median revenue with the interquartile range (Q1 to Q3) for each language.") +
    theme(axis_text_x=element_text(size=15),
        axis_text_y=element_text(size=17),
        axis_title_x=element_text(size=20),
        axis_title_y=element_text(size=20),
        plot_title=element_text(size=19, face='bold'),
        plot_subtitle=element_text(size=18),
        legend_position='none') +
    ggsize(1000, 500)
)

g


In [221]:
# Audience score against critics score, one point per movie.
scatter_plot = (
    ggplot(top_df, aes(x='audience_score', y='critics_score')) +

    # A single jittered point layer. Adding geom_point() as well would draw
    # every movie twice (once at its exact position, once displaced).
    # The seed keeps the jitter identical between runs.
    geom_jitter(seed=42) +

    # The scale names double as axis titles; both axes show the 75-100 range.
    scale_x_continuous(name='Audience Score', limits=[75, 100]) +
    scale_y_continuous(name='Critics Score', limits=[75, 100]) +

    labs(title='Scatter Plot of Audience vs Critics Score') +
    theme(
        axis_text_x=element_text(size=12),
        axis_text_y=element_text(size=12),
        axis_title_x=element_text(size=14),
        axis_title_y=element_text(size=14),
        plot_title=element_text(size=16, face='bold')
    )
)

scatter_plot

In [222]:
# One row per (movie, genre) pair: each list in `Genre` is expanded into
# separate rows that keep the original row's index label.
# The result is only displayed, not assigned, so `top_df` is left unchanged.
top_df.explode('Genre')

Unnamed: 0,link,title,Director,Genre,language,date-streaming,date-theater,Runtime,Rating,Aspect Ratio,Distributor,Sound Mix,Producer,Production Co,Screenwriter,rank,revenue,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,[Curtis Hanson],Crime,English,"Dec 12, 2015",1997-09-19,136,[R],"[35mm, Scope (2.35:1)]","[Warner Home Vídeo, Warner Bros.]","[Surround, DTS, Dolby Digital]","[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]","[Curtis Hanson, James Ellroy, Brian Helgeland]",1,64600000.0,94.0,99.0
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,[Curtis Hanson],Drama,English,"Dec 12, 2015",1997-09-19,136,[R],"[35mm, Scope (2.35:1)]","[Warner Home Vídeo, Warner Bros.]","[Surround, DTS, Dolby Digital]","[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]","[Curtis Hanson, James Ellroy, Brian Helgeland]",1,64600000.0,94.0,99.0
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,[Francis Ford Coppola],Crime,English,"Aug 1, 2013",1972-03-15,177,[R],[Flat (1.85:1)],[Paramount Pictures],[Mono],[Albert S. Ruddy],[Paramount Pictures],"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",2,134800000.0,98.0,97.0
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,[Francis Ford Coppola],Drama,English,"Aug 1, 2013",1972-03-15,177,[R],[Flat (1.85:1)],[Paramount Pictures],[Mono],[Albert S. Ruddy],[Paramount Pictures],"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",2,134800000.0,98.0,97.0
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,[Michael Curtiz],Drama,English,"Aug 15, 2008",1943-01-23,102,"[P, G]",[Flat (1.37:1)],[Warner Bros. Pictures],[Mono],[Hal B. Wallis],[Warner Brothers],"[Murray Burnett, Joan Alison, Julius J. Epstei...",3,,95.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,https://www.rottentomatoes.com/m/eyes_without_...,Eyes Without a Face,[Georges Franju],Horror,Canadian French,"Oct 29, 2016",1962-10-31,90,[],[],"[United Artists, Lopert Pictures Corp., Rialto...",[],[Jules Borkon],"[Champs-Élysées Production, Lux Film S.p.a.]","[Pierre Boileau, Pierre Gascar, Thomas Narceja...",299,52700.0,87.0,97.0
298,https://www.rottentomatoes.com/m/eyes_without_...,Eyes Without a Face,[Georges Franju],Drama,Canadian French,"Oct 29, 2016",1962-10-31,90,[],[],"[United Artists, Lopert Pictures Corp., Rialto...",[],[Jules Borkon],"[Champs-Élysées Production, Lux Film S.p.a.]","[Pierre Boileau, Pierre Gascar, Thomas Narceja...",299,52700.0,87.0,97.0
299,https://www.rottentomatoes.com/m/the_cabinet_o...,The Cabinet of Dr. Caligari,[Robert Wiene],Horror,German,"Mar 22, 2016",1921-03-19,69,[],[],"[Grapevine Video, Kino Video]",[],"[Rudolf Meinert, Erich Pommer]",[Decla-Bioscop AG],"[Hans Janowitz, Carl Mayer]",300,,89.0,96.0
299,https://www.rottentomatoes.com/m/the_cabinet_o...,The Cabinet of Dr. Caligari,[Robert Wiene],Fantasy,German,"Mar 22, 2016",1921-03-19,69,[],[],"[Grapevine Video, Kino Video]",[],"[Rudolf Meinert, Erich Pommer]",[Decla-Bioscop AG],"[Hans Janowitz, Carl Mayer]",300,,89.0,96.0
