# Cleaning the Dataframe & Building a SQLite Database

## Import Necessary Packages & Read the JSON File

In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# load the raw movie records; the path is relative to the notebook's working directory
top_df = pd.read_json("../data/raw/movies.json")

# preview the raw frame: most detail columns hold lists rather than scalars
top_df.head() 

Unnamed: 0,link,title,rank,Aspect Ratio,Box Office (Gross USA),Director,Distributor,Genre,Original Language,Producer,Production Co,Rating,Release Date (Streaming),Release Date (Theaters),Runtime,Screenwriter,Sound Mix,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1,"[35mm, Scope (2.35:1)]",[$64.6M],[Curtis Hanson],"[Warner Home Vídeo, Warner Bros.]","[Crime, Drama]",[English],"[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]",[R],"[Dec 12, 2015]","[Sep 19, 1997, Original]",[2h 16m],"[Curtis Hanson, James Ellroy, Brian Helgeland]","[Surround, DTS, Dolby Digital]",[94%],[99%]
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2,[Flat (1.85:1)],[$134.8M],[Francis Ford Coppola],[Paramount Pictures],"[Crime, Drama]",[English],[Albert S. Ruddy],[Paramount Pictures],[R],"[Aug 1, 2013]","[Mar 15, 1972, Wide]",[2h 57m],"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",[Mono],[98%],[97%]
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3,[Flat (1.37:1)],,[Michael Curtiz],[Warner Bros. Pictures],[Drama],[English],[Hal B. Wallis],[Warner Brothers],[PG],"[Aug 15, 2008]","[Jan 23, 1943, Wide]",[1h 42m],"[Murray Burnett, Joan Alison, Julius J. Epstei...",[Mono],[95%],[99%]
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4,[Flat (1.37:1)],[$192.9K],[Akira Kurosawa],[Columbia Pictures],[Action],[Japanese],[Sojiro Motoki],[Toho Company],,"[Nov 29, 2011]","[Nov 19, 1956, Wide]",[3h 28m],"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",[Mono],[97%],[100%]
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5,[Scope (2.35:1)],[$53.4M],[Bong Joon Ho],[Neon],"[Comedy, Mystery & Thriller, Drama]",[Korean],"[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],[R (Sexual Content|Language|Some Violence)],"[Oct 11, 2019]","[Nov 1, 2019, Wide]",[2h 12m],"[Bong Joon Ho, Han Jinwon]","[Dolby Atmos, Dolby Digital]",[90%],[99%]


## Cleaning the Dataframe
We undo the list structure associated with many columns and convert them to the appropriate datatypes. Lastly, we format them so that they can be stored in a database.

In [3]:
# Although these columns are stored as lists, the preview shows a single value per
# movie, so each list is unwrapped into a plain scalar with DataFrame.explode.
# NOTE(review): explode emits one row per list element, so a list holding more than
# one element would duplicate that movie's row — assumes this never happens; confirm.
columns_to_explode = [
    'Original Language', 'Runtime', 'Release Date (Streaming)', 'Rating',
    'Release Date (Theaters)', 'Box Office (Gross USA)', 'audience_score', 'critics_score'
]

# explode one column at a time, rebinding top_df to the result of each call
for column in columns_to_explode:
    top_df = top_df.explode(column)


In [4]:
# swap the verbose source headers for short, descriptive column names
column_renames = {
    'Box Office (Gross USA)': 'revenue',
    'Original Language': 'language',
    'Release Date (Streaming)': 'date-streaming',
    'Release Date (Theaters)': 'date-theater',
}
top_df = top_df.rename(columns=column_renames)

In [5]:
# preview after unwrapping and renaming: the exploded columns now hold scalars
top_df.head()

Unnamed: 0,link,title,rank,Aspect Ratio,revenue,Director,Distributor,Genre,language,Producer,Production Co,Rating,date-streaming,date-theater,Runtime,Screenwriter,Sound Mix,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1,"[35mm, Scope (2.35:1)]",$64.6M,[Curtis Hanson],"[Warner Home Vídeo, Warner Bros.]","[Crime, Drama]",English,"[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]",R,"Dec 12, 2015","Sep 19, 1997, Original",2h 16m,"[Curtis Hanson, James Ellroy, Brian Helgeland]","[Surround, DTS, Dolby Digital]",94%,99%
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2,[Flat (1.85:1)],$134.8M,[Francis Ford Coppola],[Paramount Pictures],"[Crime, Drama]",English,[Albert S. Ruddy],[Paramount Pictures],R,"Aug 1, 2013","Mar 15, 1972, Wide",2h 57m,"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",[Mono],98%,97%
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3,[Flat (1.37:1)],,[Michael Curtiz],[Warner Bros. Pictures],[Drama],English,[Hal B. Wallis],[Warner Brothers],PG,"Aug 15, 2008","Jan 23, 1943, Wide",1h 42m,"[Murray Burnett, Joan Alison, Julius J. Epstei...",[Mono],95%,99%
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4,[Flat (1.37:1)],$192.9K,[Akira Kurosawa],[Columbia Pictures],[Action],Japanese,[Sojiro Motoki],[Toho Company],,"Nov 29, 2011","Nov 19, 1956, Wide",3h 28m,"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",[Mono],97%,100%
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5,[Scope (2.35:1)],$53.4M,[Bong Joon Ho],[Neon],"[Comedy, Mystery & Thriller, Drama]",Korean,"[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],R (Sexual Content|Language|Some Violence),"Oct 11, 2019","Nov 1, 2019, Wide",2h 12m,"[Bong Joon Ho, Han Jinwon]","[Dolby Atmos, Dolby Digital]",90%,99%


In [6]:
# normalise rank to an integer: stringify, drop any trailing '.', then parse
top_df['rank'] = (
    top_df['rank']
    .astype('str')
    .str.rstrip('.')
    .apply(int)
)

In [7]:
def clean_revenue(revenue):
    """Convert a revenue string such as '$64.6M' into a float amount.

    Every '$' is removed first. If the last remaining character is 'K', 'M'
    or 'B', the preceding number is scaled by a thousand, a million or a
    billion respectively; otherwise the whole remaining text is parsed as-is.

    Parameters
    ----------
    revenue : str
        Non-empty revenue text, e.g. '$192.9K' or '$500'.

    Returns
    -------
    float
        The revenue as a plain number.
    """
    # multiplier for each recognised magnitude suffix
    suffix_scale = {'K': 1000, 'M': 1000000, 'B': 1000000000}

    amount = revenue.replace('$', '')
    suffix = amount[-1]

    if suffix in suffix_scale:
        return float(amount[:-1]) * suffix_scale[suffix]

    # no magnitude suffix: the text is already the full amount
    return float(amount)


In [8]:
# Convert revenue strings to floats; anything that is not a non-empty string stays missing.
# The isinstance check matters because NaN (what explode yields for an empty list) is
# truthy, so a bare `if x` would hand it to clean_revenue, which calls str methods on it.
top_df['revenue'] = top_df['revenue'].apply(
    lambda x: clean_revenue(x) if isinstance(x, str) and x else None
)

In [9]:
# preview: revenue should now be numeric, with missing values where no figure was given
top_df.head()

Unnamed: 0,link,title,rank,Aspect Ratio,revenue,Director,Distributor,Genre,language,Producer,Production Co,Rating,date-streaming,date-theater,Runtime,Screenwriter,Sound Mix,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1,"[35mm, Scope (2.35:1)]",64600000.0,[Curtis Hanson],"[Warner Home Vídeo, Warner Bros.]","[Crime, Drama]",English,"[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]",R,"Dec 12, 2015","Sep 19, 1997, Original",2h 16m,"[Curtis Hanson, James Ellroy, Brian Helgeland]","[Surround, DTS, Dolby Digital]",94%,99%
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2,[Flat (1.85:1)],134800000.0,[Francis Ford Coppola],[Paramount Pictures],"[Crime, Drama]",English,[Albert S. Ruddy],[Paramount Pictures],R,"Aug 1, 2013","Mar 15, 1972, Wide",2h 57m,"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",[Mono],98%,97%
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3,[Flat (1.37:1)],,[Michael Curtiz],[Warner Bros. Pictures],[Drama],English,[Hal B. Wallis],[Warner Brothers],PG,"Aug 15, 2008","Jan 23, 1943, Wide",1h 42m,"[Murray Burnett, Joan Alison, Julius J. Epstei...",[Mono],95%,99%
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4,[Flat (1.37:1)],192900.0,[Akira Kurosawa],[Columbia Pictures],[Action],Japanese,[Sojiro Motoki],[Toho Company],,"Nov 29, 2011","Nov 19, 1956, Wide",3h 28m,"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",[Mono],97%,100%
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5,[Scope (2.35:1)],53400000.0,[Bong Joon Ho],[Neon],"[Comedy, Mystery & Thriller, Drama]",Korean,"[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],R (Sexual Content|Language|Some Violence),"Oct 11, 2019","Nov 1, 2019, Wide",2h 12m,"[Bong Joon Ho, Han Jinwon]","[Dolby Atmos, Dolby Digital]",90%,99%


In [10]:
# NOTE(review): this repeats the preview in the previous cell; no code runs in between
top_df.head()

Unnamed: 0,link,title,rank,Aspect Ratio,revenue,Director,Distributor,Genre,language,Producer,Production Co,Rating,date-streaming,date-theater,Runtime,Screenwriter,Sound Mix,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1,"[35mm, Scope (2.35:1)]",64600000.0,[Curtis Hanson],"[Warner Home Vídeo, Warner Bros.]","[Crime, Drama]",English,"[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]",R,"Dec 12, 2015","Sep 19, 1997, Original",2h 16m,"[Curtis Hanson, James Ellroy, Brian Helgeland]","[Surround, DTS, Dolby Digital]",94%,99%
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2,[Flat (1.85:1)],134800000.0,[Francis Ford Coppola],[Paramount Pictures],"[Crime, Drama]",English,[Albert S. Ruddy],[Paramount Pictures],R,"Aug 1, 2013","Mar 15, 1972, Wide",2h 57m,"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",[Mono],98%,97%
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3,[Flat (1.37:1)],,[Michael Curtiz],[Warner Bros. Pictures],[Drama],English,[Hal B. Wallis],[Warner Brothers],PG,"Aug 15, 2008","Jan 23, 1943, Wide",1h 42m,"[Murray Burnett, Joan Alison, Julius J. Epstei...",[Mono],95%,99%
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4,[Flat (1.37:1)],192900.0,[Akira Kurosawa],[Columbia Pictures],[Action],Japanese,[Sojiro Motoki],[Toho Company],,"Nov 29, 2011","Nov 19, 1956, Wide",3h 28m,"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",[Mono],97%,100%
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5,[Scope (2.35:1)],53400000.0,[Bong Joon Ho],[Neon],"[Comedy, Mystery & Thriller, Drama]",Korean,"[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],R (Sexual Content|Language|Some Violence),"Oct 11, 2019","Nov 1, 2019, Wide",2h 12m,"[Bong Joon Ho, Han Jinwon]","[Dolby Atmos, Dolby Digital]",90%,99%


In [11]:
def time_to_minutes(runtime):
    """Convert a runtime string such as '2h 16m' into a whole number of minutes.

    Parameters
    ----------
    runtime : str
        Whitespace-separated components, each a number followed by 'h'
        (hours) or 'm' (minutes), e.g. '2h 16m', '45m' or '2h'.

    Returns
    -------
    int or None
        Total minutes, or None when `runtime` is not a string or contains
        no hour/minute component.

    Raises
    ------
    ValueError
        If a component containing 'h' or 'm' is not otherwise numeric.
    """
    # missing values (None / NaN) have no runtime to parse
    if not isinstance(runtime, str):
        return None

    total_minutes = 0
    found_component = False

    # Handle each component on its own so hours-only ('2h') and minutes-only
    # ('45m') values work; the earlier `'h' and 'm' in runtime` test only
    # checked for 'm', so hours-only runtimes came back as None.
    for part in runtime.split():
        if 'h' in part:
            total_minutes += 60 * int(part.replace('h', ''))
            found_component = True
        elif 'm' in part:
            total_minutes += int(part.replace('m', ''))
            found_component = True

    return total_minutes if found_component else None
        


In [12]:
# replace each runtime string with the value returned by time_to_minutes
top_df['Runtime'] = top_df['Runtime'].apply(time_to_minutes)

In [13]:
def clean_date(dateStr):
    """Parse a date string such as 'Sep 19, 1997, Original' into a date.

    Only the first two comma-separated pieces (month/day and year) are used;
    anything after the second comma is ignored.

    Parameters
    ----------
    dateStr : str
        Text in the form '<Mon> <day>, <year>[, <extra>]'.

    Returns
    -------
    datetime.date
        The parsed calendar date.
    """
    pieces = dateStr.split(',')

    # the year piece keeps its leading space, which lines up with the
    # space between %d and %Y in the format below
    month_day_year = str(pieces[0]) + str(pieces[1])

    return datetime.strptime(month_day_year, '%b %d %Y').date()



In [14]:
# Parse theatrical release dates; anything that is not a non-empty string stays missing.
# The isinstance check keeps NaN (truthy, produced by explode for an empty list) away
# from clean_date, which calls str.split on its argument.
top_df['date-theater'] = top_df['date-theater'].apply(
    lambda x: clean_date(x) if isinstance(x, str) and x else None
)

In [15]:
# preview the first ten rows to check the converted Runtime and date-theater columns
top_df.head(10)

Unnamed: 0,link,title,rank,Aspect Ratio,revenue,Director,Distributor,Genre,language,Producer,Production Co,Rating,date-streaming,date-theater,Runtime,Screenwriter,Sound Mix,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1,"[35mm, Scope (2.35:1)]",64600000.0,[Curtis Hanson],"[Warner Home Vídeo, Warner Bros.]","[Crime, Drama]",English,"[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]",R,"Dec 12, 2015",1997-09-19,136,"[Curtis Hanson, James Ellroy, Brian Helgeland]","[Surround, DTS, Dolby Digital]",94%,99%
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2,[Flat (1.85:1)],134800000.0,[Francis Ford Coppola],[Paramount Pictures],"[Crime, Drama]",English,[Albert S. Ruddy],[Paramount Pictures],R,"Aug 1, 2013",1972-03-15,177,"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",[Mono],98%,97%
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3,[Flat (1.37:1)],,[Michael Curtiz],[Warner Bros. Pictures],[Drama],English,[Hal B. Wallis],[Warner Brothers],PG,"Aug 15, 2008",1943-01-23,102,"[Murray Burnett, Joan Alison, Julius J. Epstei...",[Mono],95%,99%
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4,[Flat (1.37:1)],192900.0,[Akira Kurosawa],[Columbia Pictures],[Action],Japanese,[Sojiro Motoki],[Toho Company],,"Nov 29, 2011",1956-11-19,208,"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",[Mono],97%,100%
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5,[Scope (2.35:1)],53400000.0,[Bong Joon Ho],[Neon],"[Comedy, Mystery & Thriller, Drama]",Korean,"[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],R (Sexual Content|Language|Some Violence),"Oct 11, 2019",2019-11-01,132,"[Bong Joon Ho, Han Jinwon]","[Dolby Atmos, Dolby Digital]",90%,99%
5,https://www.rottentomatoes.com/m/schindlers_list,Schindler's List,6,,96600000.0,[Steven Spielberg],[Universal Pictures],"[History, Drama]",English,"[Branko Lustig, Gerald R. Molen, Steven Spielb...","[Universal Pictures, Amblin Entertainment]",R (Language|Actuality Violence|Some Sexuality),"Mar 5, 2013",1993-12-15,195,[Steven Zaillian],"[Dolby Atmos, Stereo, Dolby Digital, DTS, Surr...",97%,98%
6,https://www.rottentomatoes.com/m/top_gun_maverick,Top Gun: Maverick,7,[Scope (2.35:1)],718500000.0,[Joseph Kosinski],[Paramount Pictures],"[Action, Adventure]",English,"[Jerry Bruckheimer, Tom Cruise, David Ellison,...","[Don Simpson/Jerry Bruckheimer Films, Paramoun...",PG-13 (Some Strong Language|Sequences of Inten...,"Aug 22, 2022",2022-05-27,131,"[Ehren Kruger, Eric Warren Singer, Christopher...",[Dolby Atmos],99%,96%
7,https://www.rottentomatoes.com/m/toy_story_2,Toy Story 2,8,"[Digital Projection, Flat (1.85:1)]",245900000.0,"[Ash Brannon, John Lasseter, Lee Unkrich]",[Walt Disney Pictures],"[Kids & Family, Comedy, Adventure, Fantasy, An...",English,"[Karen Robert Jackson, Helene Plotkin]","[Walt Disney Pictures, Pixar Animation Studios]",G,"Jan 1, 2014",1999-11-24,92,"[John Lasseter, Pete Docter, Ash Brannon, Andr...","[Dolby SR, Dolby Stereo, Surround, SDDS, DTS, ...",87%,100%
8,https://www.rottentomatoes.com/m/chinatown,Chinatown,9,[Scope (2.35:1)],,[Roman Polanski],[Paramount Pictures],"[Crime, Drama]",English,[Robert Evans],[Paramount Pictures],R,"Aug 1, 2013",1974-06-20,131,"[Robert Towne, Roman Polanski]",[Mono],93%,98%
9,https://www.rottentomatoes.com/m/on_the_waterf...,On the Waterfront,10,[35mm],,[Elia Kazan],[Columbia Pictures],[Drama],English,[Sam Spiegel],[Columbia Pictures Corporation],,"Jan 14, 2014",1954-07-28,108,"[Budd Schulberg, Budd Schulberg]",[Mono],95%,99%


In [16]:
def clean_rating(ratingText):
    """Split a rating string into the rating and its content descriptors.

    Parameters
    ----------
    ratingText : str
        Either a bare rating ('R', 'PG-13') or a rating followed by
        '|'-separated descriptors in parentheses, e.g.
        'R (Sexual Content|Language|Some Violence)'.

    Returns
    -------
    list of str
        The rating first, then one entry per descriptor, all stripped of
        surrounding whitespace.
    """
    if "(" in ratingText:
        # everything before the first '(' is the rating; the rest holds descriptors
        rating, _, descriptorText = ratingText.partition("(")
        subRatings = [
            subRating.strip()
            for subRating in descriptorText.rstrip(")").split("|")
            if subRating.strip()
        ]
        # strip the rating itself so 'R (…)' yields 'R' rather than 'R '
        return [rating.strip()] + subRatings

    # Wrap the whole string in a list; list(ratingText) would split a
    # multi-character rating such as 'PG-13' into single characters.
    return [ratingText.strip()]

In [17]:
# Split each rating into a list; anything that is not a non-empty string stays missing.
# The isinstance check keeps NaN (truthy, produced by explode for an empty list) away
# from clean_rating, which uses the `in` operator and str methods on its argument.
top_df['Rating'] = top_df['Rating'].apply(
    lambda x: clean_rating(x) if isinstance(x, str) and x else None
)

In [18]:
# preview: Rating should now hold a list (rating first, then any descriptors) or be missing
top_df.head()

Unnamed: 0,link,title,rank,Aspect Ratio,revenue,Director,Distributor,Genre,language,Producer,Production Co,Rating,date-streaming,date-theater,Runtime,Screenwriter,Sound Mix,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1,"[35mm, Scope (2.35:1)]",64600000.0,[Curtis Hanson],"[Warner Home Vídeo, Warner Bros.]","[Crime, Drama]",English,"[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]",[R],"Dec 12, 2015",1997-09-19,136,"[Curtis Hanson, James Ellroy, Brian Helgeland]","[Surround, DTS, Dolby Digital]",94%,99%
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2,[Flat (1.85:1)],134800000.0,[Francis Ford Coppola],[Paramount Pictures],"[Crime, Drama]",English,[Albert S. Ruddy],[Paramount Pictures],[R],"Aug 1, 2013",1972-03-15,177,"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",[Mono],98%,97%
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3,[Flat (1.37:1)],,[Michael Curtiz],[Warner Bros. Pictures],[Drama],English,[Hal B. Wallis],[Warner Brothers],[PG],"Aug 15, 2008",1943-01-23,102,"[Murray Burnett, Joan Alison, Julius J. Epstei...",[Mono],95%,99%
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4,[Flat (1.37:1)],192900.0,[Akira Kurosawa],[Columbia Pictures],[Action],Japanese,[Sojiro Motoki],[Toho Company],,"Nov 29, 2011",1956-11-19,208,"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",[Mono],97%,100%
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5,[Scope (2.35:1)],53400000.0,[Bong Joon Ho],[Neon],"[Comedy, Mystery & Thriller, Drama]",Korean,"[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],"[R , Sexual Content, Language, Some Violence]","Oct 11, 2019",2019-11-01,132,"[Bong Joon Ho, Han Jinwon]","[Dolby Atmos, Dolby Digital]",90%,99%


In [19]:
# Convert the percentage strings (e.g. '94%') into floats.
# The vectorised .str accessor passes missing values through as NaN, whereas calling
# x.replace inside apply would raise on a missing score; the loop also removes the
# per-column copy-paste.
for score_column in ['audience_score', 'critics_score']:
    top_df[score_column] = (
        top_df[score_column]
        .str.replace('%', '', regex=False)
        .astype(float)
    )

In [20]:
# preview: audience_score and critics_score should now be floats without the '%' sign
top_df.head()

Unnamed: 0,link,title,rank,Aspect Ratio,revenue,Director,Distributor,Genre,language,Producer,Production Co,Rating,date-streaming,date-theater,Runtime,Screenwriter,Sound Mix,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1,"[35mm, Scope (2.35:1)]",64600000.0,[Curtis Hanson],"[Warner Home Vídeo, Warner Bros.]","[Crime, Drama]",English,"[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]",[R],"Dec 12, 2015",1997-09-19,136,"[Curtis Hanson, James Ellroy, Brian Helgeland]","[Surround, DTS, Dolby Digital]",94.0,99.0
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2,[Flat (1.85:1)],134800000.0,[Francis Ford Coppola],[Paramount Pictures],"[Crime, Drama]",English,[Albert S. Ruddy],[Paramount Pictures],[R],"Aug 1, 2013",1972-03-15,177,"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",[Mono],98.0,97.0
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3,[Flat (1.37:1)],,[Michael Curtiz],[Warner Bros. Pictures],[Drama],English,[Hal B. Wallis],[Warner Brothers],[PG],"Aug 15, 2008",1943-01-23,102,"[Murray Burnett, Joan Alison, Julius J. Epstei...",[Mono],95.0,99.0
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4,[Flat (1.37:1)],192900.0,[Akira Kurosawa],[Columbia Pictures],[Action],Japanese,[Sojiro Motoki],[Toho Company],,"Nov 29, 2011",1956-11-19,208,"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",[Mono],97.0,100.0
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5,[Scope (2.35:1)],53400000.0,[Bong Joon Ho],[Neon],"[Comedy, Mystery & Thriller, Drama]",Korean,"[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],"[R , Sexual Content, Language, Some Violence]","Oct 11, 2019",2019-11-01,132,"[Bong Joon Ho, Han Jinwon]","[Dolby Atmos, Dolby Digital]",90.0,99.0


In [21]:
# Flatten a list into a single string in which every element is prefixed by "%%%".
def list_to_string_converter(list_from_df):
    """Join the elements of a list into one "%%%"-prefixed string.

    Parameters
    ----------
    list_from_df : object
        The cell value; only genuine `list` instances are converted.

    Returns
    -------
    str
        "%%%" + str(element) for every element, concatenated in order, or
        "" when the value is not a list (or is an empty list).
    """
    if isinstance(list_from_df, list):
        return "".join("%%%" + str(entry) for entry in list_from_df)
    return ""

In [22]:
# The SQL tables cannot hold Python lists, so every list-valued column is flattened
# into a "%%%"-delimited string; list_to_string_converter itself maps non-list cells to "".
columns_to_convert = [
    'Aspect Ratio', 'Director', 'Distributor', 'Genre', 'Producer',
    'Production Co', 'Rating', 'Screenwriter', 'Sound Mix',
]
for column in columns_to_convert:
    top_df[column] = top_df[column].apply(list_to_string_converter)

In [23]:
# display the cleaned frame; list columns are now "%%%"-delimited strings
top_df

Unnamed: 0,link,title,rank,Aspect Ratio,revenue,Director,Distributor,Genre,language,Producer,Production Co,Rating,date-streaming,date-theater,Runtime,Screenwriter,Sound Mix,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1,%%%35mm%%%Scope (2.35:1),64600000.0,%%%Curtis Hanson,%%%Warner Home Vídeo%%%Warner Bros.,%%%Crime%%%Drama,English,%%%Michael G. Nathanson%%%Arnon Milchan%%%Curt...,%%%Warner Brothers%%%Regency Enterprises,%%%R,"Dec 12, 2015",1997-09-19,136,%%%Curtis Hanson%%%James Ellroy%%%Brian Helgeland,%%%Surround%%%DTS%%%Dolby Digital,94.0,99.0
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2,%%%Flat (1.85:1),134800000.0,%%%Francis Ford Coppola,%%%Paramount Pictures,%%%Crime%%%Drama,English,%%%Albert S. Ruddy,%%%Paramount Pictures,%%%R,"Aug 1, 2013",1972-03-15,177,%%%Francis Ford Coppola%%%Mario Puzo%%%Mario Puzo,%%%Mono,98.0,97.0
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3,%%%Flat (1.37:1),,%%%Michael Curtiz,%%%Warner Bros. Pictures,%%%Drama,English,%%%Hal B. Wallis,%%%Warner Brothers,%%%PG,"Aug 15, 2008",1943-01-23,102,%%%Murray Burnett%%%Joan Alison%%%Julius J. Ep...,%%%Mono,95.0,99.0
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4,%%%Flat (1.37:1),192900.0,%%%Akira Kurosawa,%%%Columbia Pictures,%%%Action,Japanese,%%%Sojiro Motoki,%%%Toho Company,,"Nov 29, 2011",1956-11-19,208,%%%Shinobu Hashimoto%%%Akira Kurosawa%%%Hideo ...,%%%Mono,97.0,100.0
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5,%%%Scope (2.35:1),53400000.0,%%%Bong Joon Ho,%%%Neon,%%%Comedy%%%Mystery & Thriller%%%Drama,Korean,%%%Kwak Sin-ae%%%Moon Yanggwon,%%%Barunson E&A,%%%R %%%Sexual Content%%%Language%%%Some Violence,"Oct 11, 2019",2019-11-01,132,%%%Bong Joon Ho%%%Han Jinwon,%%%Dolby Atmos%%%Dolby Digital,90.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,https://www.rottentomatoes.com/m/1001902-beaut...,Beauty and the Beast,296,%%%Flat (1.37:1),138200.0,%%%Jean Cocteau,,%%%Fantasy,Canadian French,,,,"Jan 14, 2017",1947-01-01,95,%%%Jean Cocteau%%%Jeanne-Marie Leprince de Bea...,%%%Mono,90.0,96.0
296,https://www.rottentomatoes.com/m/killing,The Killing,297,,,%%%Stanley Kubrick,%%%United Artists%%%Criterion Collection,%%%Crime%%%Drama,English,%%%James B. Harris,%%%Harris-Kubrick Productions,,"Mar 5, 2016",1956-05-20,83,%%%Stanley Kubrick%%%Jim Thompson%%%Lionel White,,92.0,96.0
297,https://www.rottentomatoes.com/m/the_rules_of_...,The Rules of the Game,298,%%%35mm%%%Flat (1.37:1),,%%%Jean Renoir,%%%Criterion Collection%%%Cine Classics,%%%Comedy%%%Drama,French (France),%%%Claude Renoir,%%%Nouvelles Éditions de Films (NEF),,"Jul 21, 2009",1939-07-08,110,%%%Carl Koch%%%Jean Renoir,,89.0,97.0
298,https://www.rottentomatoes.com/m/eyes_without_...,Eyes Without a Face,299,,52700.0,%%%Georges Franju,%%%United Artists%%%Lopert Pictures Corp.%%%Ri...,%%%Horror%%%Drama,Canadian French,%%%Jules Borkon,%%%Champs-Élysées Production%%%Lux Film S.p.a.,,"Oct 29, 2016",1962-10-31,90,%%%Pierre Boileau%%%Pierre Gascar%%%Thomas Nar...,,87.0,97.0


In [24]:
# check the dtypes and non-null counts after cleaning
# NOTE(review): date-streaming is never parsed in the cells above, so it still holds
# raw strings such as "Dec 12, 2015" — confirm whether that is intended
top_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 0 to 299
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   link            300 non-null    object 
 1   title           300 non-null    object 
 2   rank            300 non-null    int64  
 3   Aspect Ratio    300 non-null    object 
 4   revenue         187 non-null    float64
 5   Director        300 non-null    object 
 6   Distributor     300 non-null    object 
 7   Genre           300 non-null    object 
 8   language        296 non-null    object 
 9   Producer        300 non-null    object 
 10  Production Co   300 non-null    object 
 11  Rating          300 non-null    object 
 12  date-streaming  298 non-null    object 
 13  date-theater    296 non-null    object 
 14  Runtime         300 non-null    int64  
 15  Screenwriter    300 non-null    object 
 16  Sound Mix       300 non-null    object 
 17  audience_score  300 non-null    flo

## Creating the SQL Database
We create three tables: one for general movie information, one for technical details, and one for reception-related statistics. Finally, we read the tables back and merge them with pandas on the shared `link` column.

In [25]:
# packages for sql database creation
import os
import sqlite3

In [26]:
# folder for the cleaned outputs, relative to the notebook's working directory
# (os.path.join with a single argument returns that argument unchanged)
DATA_FOLDER = os.path.join('../data/clean/')

In [27]:
# open a connection to the SQLite database file inside DATA_FOLDER
# NOTE(review): no conn.close() is visible in this part of the notebook — confirm it is closed later
conn = sqlite3.connect(os.path.join(DATA_FOLDER, 'moviedatabase.db'))

In [28]:
# columns describing the film itself; they make up the `movies` table
selected_columns = [
    'link', 'title', 'Director', 'Genre', 'language',
    'date-streaming', 'date-theater', 'Runtime', "Rating",
]

# column selection and drop_duplicates each return a new frame, so top_df is untouched
movies = top_df[selected_columns].drop_duplicates()

In [29]:
# write the frame to the `movies` table, replacing any existing table; the index is not stored
movies.to_sql('movies', conn, if_exists='replace', index=False)

300

In [30]:
# read the first five rows back from the database to check the `movies` table
pd.read_sql('SELECT * FROM movies LIMIT 5', conn)


Unnamed: 0,link,title,Director,Genre,language,date-streaming,date-theater,Runtime,Rating
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,%%%Curtis Hanson,%%%Crime%%%Drama,English,"Dec 12, 2015",1997-09-19,136,%%%R
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,%%%Francis Ford Coppola,%%%Crime%%%Drama,English,"Aug 1, 2013",1972-03-15,177,%%%R
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,%%%Michael Curtiz,%%%Drama,English,"Aug 15, 2008",1943-01-23,102,%%%PG
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,%%%Akira Kurosawa,%%%Action,Japanese,"Nov 29, 2011",1956-11-19,208,
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,%%%Bong Joon Ho,%%%Comedy%%%Mystery & Thriller%%%Drama,Korean,"Oct 11, 2019",2019-11-01,132,%%%R %%%Sexual Content%%%Language%%%Some Violence


In [31]:
# production and technical details, keyed by `link`
technical_columns = [
    'link', 'Aspect Ratio', 'Distributor', 'Sound Mix',
    'Producer', 'Production Co', 'Screenwriter',
]
technicalities = top_df[technical_columns].drop_duplicates()

In [32]:
# write the frame to the `technicalities` table, replacing any existing table; the index is not stored
technicalities.to_sql('technicalities', conn, if_exists='replace', index=False)

300

In [33]:
# ranking, box-office and score figures, keyed by `link`
post_release_columns = ['link', 'rank', 'revenue', 'audience_score', 'critics_score']
post_release = top_df[post_release_columns].drop_duplicates()

In [34]:
# write the frame to the `post_release` table, replacing any existing table; the index is not stored
post_release.to_sql('post_release', conn, if_exists='replace', index=False)

300

In [35]:
# read the first five rows back from the database to check the `post_release` table
pd.read_sql('SELECT * FROM post_release LIMIT 5', conn)


Unnamed: 0,link,rank,revenue,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,1,64600000.0,94.0,99.0
1,https://www.rottentomatoes.com/m/the_godfather,2,134800000.0,98.0,97.0
2,https://www.rottentomatoes.com/m/1003707-casab...,3,,95.0,99.0
3,https://www.rottentomatoes.com/m/seven_samurai...,4,192900.0,97.0,100.0
4,https://www.rottentomatoes.com/m/parasite_2019,5,53400000.0,90.0,99.0


In [36]:
# reassemble one wide frame: start from `movies` and left-join the other two tables
# on their shared `link` key
(
    pd.read_sql('SELECT * FROM movies', conn)
        .merge(pd.read_sql('SELECT * FROM technicalities', conn), on='link', how='left')
        .merge(pd.read_sql('SELECT * FROM post_release', conn), on='link', how='left')
)

Unnamed: 0,link,title,Director,Genre,language,date-streaming,date-theater,Runtime,Rating,Aspect Ratio,Distributor,Sound Mix,Producer,Production Co,Screenwriter,rank,revenue,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,%%%Curtis Hanson,%%%Crime%%%Drama,English,"Dec 12, 2015",1997-09-19,136,%%%R,%%%35mm%%%Scope (2.35:1),%%%Warner Home Vídeo%%%Warner Bros.,%%%Surround%%%DTS%%%Dolby Digital,%%%Michael G. Nathanson%%%Arnon Milchan%%%Curt...,%%%Warner Brothers%%%Regency Enterprises,%%%Curtis Hanson%%%James Ellroy%%%Brian Helgeland,1,64600000.0,94.0,99.0
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,%%%Francis Ford Coppola,%%%Crime%%%Drama,English,"Aug 1, 2013",1972-03-15,177,%%%R,%%%Flat (1.85:1),%%%Paramount Pictures,%%%Mono,%%%Albert S. Ruddy,%%%Paramount Pictures,%%%Francis Ford Coppola%%%Mario Puzo%%%Mario Puzo,2,134800000.0,98.0,97.0
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,%%%Michael Curtiz,%%%Drama,English,"Aug 15, 2008",1943-01-23,102,%%%PG,%%%Flat (1.37:1),%%%Warner Bros. Pictures,%%%Mono,%%%Hal B. Wallis,%%%Warner Brothers,%%%Murray Burnett%%%Joan Alison%%%Julius J. Ep...,3,,95.0,99.0
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,%%%Akira Kurosawa,%%%Action,Japanese,"Nov 29, 2011",1956-11-19,208,,%%%Flat (1.37:1),%%%Columbia Pictures,%%%Mono,%%%Sojiro Motoki,%%%Toho Company,%%%Shinobu Hashimoto%%%Akira Kurosawa%%%Hideo ...,4,192900.0,97.0,100.0
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,%%%Bong Joon Ho,%%%Comedy%%%Mystery & Thriller%%%Drama,Korean,"Oct 11, 2019",2019-11-01,132,%%%R %%%Sexual Content%%%Language%%%Some Violence,%%%Scope (2.35:1),%%%Neon,%%%Dolby Atmos%%%Dolby Digital,%%%Kwak Sin-ae%%%Moon Yanggwon,%%%Barunson E&A,%%%Bong Joon Ho%%%Han Jinwon,5,53400000.0,90.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,https://www.rottentomatoes.com/m/1001902-beaut...,Beauty and the Beast,%%%Jean Cocteau,%%%Fantasy,Canadian French,"Jan 14, 2017",1947-01-01,95,,%%%Flat (1.37:1),,%%%Mono,,,%%%Jean Cocteau%%%Jeanne-Marie Leprince de Bea...,296,138200.0,90.0,96.0
296,https://www.rottentomatoes.com/m/killing,The Killing,%%%Stanley Kubrick,%%%Crime%%%Drama,English,"Mar 5, 2016",1956-05-20,83,,,%%%United Artists%%%Criterion Collection,,%%%James B. Harris,%%%Harris-Kubrick Productions,%%%Stanley Kubrick%%%Jim Thompson%%%Lionel White,297,,92.0,96.0
297,https://www.rottentomatoes.com/m/the_rules_of_...,The Rules of the Game,%%%Jean Renoir,%%%Comedy%%%Drama,French (France),"Jul 21, 2009",1939-07-08,110,,%%%35mm%%%Flat (1.37:1),%%%Criterion Collection%%%Cine Classics,,%%%Claude Renoir,%%%Nouvelles Éditions de Films (NEF),%%%Carl Koch%%%Jean Renoir,298,,89.0,97.0
298,https://www.rottentomatoes.com/m/eyes_without_...,Eyes Without a Face,%%%Georges Franju,%%%Horror%%%Drama,Canadian French,"Oct 29, 2016",1962-10-31,90,,,%%%United Artists%%%Lopert Pictures Corp.%%%Ri...,,%%%Jules Borkon,%%%Champs-Élysées Production%%%Lux Film S.p.a.,%%%Pierre Boileau%%%Pierre Gascar%%%Thomas Nar...,299,52700.0,87.0,97.0
