# Movies

In [1]:
import pandas as pd
import numpy as np
import re

from tqdm import tqdm

tqdm.pandas()

pd.set_option("display.max_colwidth", None)

In [2]:
df = pd.read_csv("data/movies.csv")
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\r\nAction, Horror, Thriller",6.1,\r\nA woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlantic overnight flight.,"\r\n Director:\r\nPeter Thorwarth\r\n| \r\n Stars:\r\nPeri Baumeister, \r\nCarl Anton Koch, \r\nAlexander Scheer, \r\nKais Setti\r\n",21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\r\nAnimation, Action, Adventure",5.0,\r\nThe war for Eternia begins again in what may be the final battle between He-Man and Skeletor. A new animated series from writer-director Kevin Smith.,"\r\n \r\n Stars:\r\nChris Wood, \r\nSarah Michelle Gellar, \r\nLena Headey, \r\nMark Hamill\r\n",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\r\nDrama, Horror, Thriller",8.2,\r\nSheriff Deputy Rick Grimes wakes up from a coma to learn the world is in ruins and must lead a group of survivors to stay alive.,"\r\n \r\n Stars:\r\nAndrew Lincoln, \r\nNorman Reedus, \r\nMelissa McBride, \r\nLauren Cohan\r\n",885805.0,44.0,
3,Rick and Morty,(2013– ),"\r\nAnimation, Adventure, Comedy",9.2,\r\nAn animated series that follows the exploits of a super scientist and his not-so-bright grandson.,"\r\n \r\n Stars:\r\nJustin Roiland, \r\nChris Parnell, \r\nSpencer Grammer, \r\nSarah Chalke\r\n",414849.0,23.0,
4,Army of Thieves,(2021),"\r\nAction, Crime, Horror",,"\r\nA prequel, set before the events of Army of the Dead, which focuses on German safecracker Ludwig Dieter leading a group of aspiring thieves on a top secret heist during the early stages of the zombie apocalypse.","\r\n Director:\r\nMatthias Schweighöfer\r\n| \r\n Stars:\r\nMatthias Schweighöfer, \r\nNathalie Emmanuel, \r\nRuby O. Fee, \r\nStuart Martin\r\n",,,


In [3]:
df.shape

(9999, 9)

In [4]:
df_missing = df.isna().sum().to_frame()
df_missing["percentage"] = (df_missing[0] / df.shape[0]) * 100
df_missing

Unnamed: 0,0,percentage
MOVIES,0,0.0
YEAR,644,6.440644
GENRE,80,0.80008
RATING,1820,18.20182
ONE-LINE,0,0.0
STARS,0,0.0
VOTES,1820,18.20182
RunTime,2958,29.582958
Gross,9539,95.39954


## Data Cleaning

### Year

In [5]:
df["YEAR"] = df["YEAR"].str.replace("(", "", regex=False)
df["YEAR"] = df["YEAR"].str.replace(")", "", regex=False)
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,"\r\nAction, Horror, Thriller",6.1,\r\nA woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlantic overnight flight.,"\r\n Director:\r\nPeter Thorwarth\r\n| \r\n Stars:\r\nPeri Baumeister, \r\nCarl Anton Koch, \r\nAlexander Scheer, \r\nKais Setti\r\n",21062.0,121.0,
1,Masters of the Universe: Revelation,2021–,"\r\nAnimation, Action, Adventure",5.0,\r\nThe war for Eternia begins again in what may be the final battle between He-Man and Skeletor. A new animated series from writer-director Kevin Smith.,"\r\n \r\n Stars:\r\nChris Wood, \r\nSarah Michelle Gellar, \r\nLena Headey, \r\nMark Hamill\r\n",17870.0,25.0,
2,The Walking Dead,2010–2022,"\r\nDrama, Horror, Thriller",8.2,\r\nSheriff Deputy Rick Grimes wakes up from a coma to learn the world is in ruins and must lead a group of survivors to stay alive.,"\r\n \r\n Stars:\r\nAndrew Lincoln, \r\nNorman Reedus, \r\nMelissa McBride, \r\nLauren Cohan\r\n",885805.0,44.0,
3,Rick and Morty,2013–,"\r\nAnimation, Adventure, Comedy",9.2,\r\nAn animated series that follows the exploits of a super scientist and his not-so-bright grandson.,"\r\n \r\n Stars:\r\nJustin Roiland, \r\nChris Parnell, \r\nSpencer Grammer, \r\nSarah Chalke\r\n",414849.0,23.0,
4,Army of Thieves,2021,"\r\nAction, Crime, Horror",,"\r\nA prequel, set before the events of Army of the Dead, which focuses on German safecracker Ludwig Dieter leading a group of aspiring thieves on a top secret heist during the early stages of the zombie apocalypse.","\r\n Director:\r\nMatthias Schweighöfer\r\n| \r\n Stars:\r\nMatthias Schweighöfer, \r\nNathalie Emmanuel, \r\nRuby O. Fee, \r\nStuart Martin\r\n",,,


In [6]:
df["TYPE"] = np.where(df["YEAR"].str.contains("–"), "Series", "Movie")
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE
0,Blood Red Sky,2021,"\r\nAction, Horror, Thriller",6.1,\r\nA woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlantic overnight flight.,"\r\n Director:\r\nPeter Thorwarth\r\n| \r\n Stars:\r\nPeri Baumeister, \r\nCarl Anton Koch, \r\nAlexander Scheer, \r\nKais Setti\r\n",21062.0,121.0,,Movie
1,Masters of the Universe: Revelation,2021–,"\r\nAnimation, Action, Adventure",5.0,\r\nThe war for Eternia begins again in what may be the final battle between He-Man and Skeletor. A new animated series from writer-director Kevin Smith.,"\r\n \r\n Stars:\r\nChris Wood, \r\nSarah Michelle Gellar, \r\nLena Headey, \r\nMark Hamill\r\n",17870.0,25.0,,Series
2,The Walking Dead,2010–2022,"\r\nDrama, Horror, Thriller",8.2,\r\nSheriff Deputy Rick Grimes wakes up from a coma to learn the world is in ruins and must lead a group of survivors to stay alive.,"\r\n \r\n Stars:\r\nAndrew Lincoln, \r\nNorman Reedus, \r\nMelissa McBride, \r\nLauren Cohan\r\n",885805.0,44.0,,Series
3,Rick and Morty,2013–,"\r\nAnimation, Adventure, Comedy",9.2,\r\nAn animated series that follows the exploits of a super scientist and his not-so-bright grandson.,"\r\n \r\n Stars:\r\nJustin Roiland, \r\nChris Parnell, \r\nSpencer Grammer, \r\nSarah Chalke\r\n",414849.0,23.0,,Series
4,Army of Thieves,2021,"\r\nAction, Crime, Horror",,"\r\nA prequel, set before the events of Army of the Dead, which focuses on German safecracker Ludwig Dieter leading a group of aspiring thieves on a top secret heist during the early stages of the zombie apocalypse.","\r\n Director:\r\nMatthias Schweighöfer\r\n| \r\n Stars:\r\nMatthias Schweighöfer, \r\nNathalie Emmanuel, \r\nRuby O. Fee, \r\nStuart Martin\r\n",,,,Movie


In [7]:
def extract_from(row):
    x = row["YEAR"]
    if pd.isna(x):
        return pd.NA

    if "–" not in x:
        year = x
    else:
        # [2010, 2022] = 2010–2022
        # [2010, ] = 2010–
        year = x.split("–")[0]

    year = re.sub("[^0-9]", "", year) # not within the range of 0 to 9
    if len(year) > 0: # some years might become an empty string e.g. index = 1165
        year = int(year)
    else:
        year = pd.NA
        
    return year


def extract_to(row):
    x = row["YEAR"]
    if pd.isna(x):
        return pd.NA

    if "–" not in x:
        year = ""
    else:
        # [2010, 2022] = 2010–2022
        # [2010, ] = 2010–
        year = x.split("–")[1]

    year = re.sub("[^0-9]", "", year) # not within the range of 0 to 9
    if len(year) > 0: # some years might become an empty string e.g. index = 1165
        year = int(year)
    else:
        year = pd.NA
        
    return year

df["YEAR_FROM"] = df.apply(extract_from, axis=1)
df["YEAR_TO"] = df.apply(extract_to, axis=1)

df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,YEAR_FROM,YEAR_TO
0,Blood Red Sky,2021,"\r\nAction, Horror, Thriller",6.1,\r\nA woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlantic overnight flight.,"\r\n Director:\r\nPeter Thorwarth\r\n| \r\n Stars:\r\nPeri Baumeister, \r\nCarl Anton Koch, \r\nAlexander Scheer, \r\nKais Setti\r\n",21062.0,121.0,,Movie,2021,
1,Masters of the Universe: Revelation,2021–,"\r\nAnimation, Action, Adventure",5.0,\r\nThe war for Eternia begins again in what may be the final battle between He-Man and Skeletor. A new animated series from writer-director Kevin Smith.,"\r\n \r\n Stars:\r\nChris Wood, \r\nSarah Michelle Gellar, \r\nLena Headey, \r\nMark Hamill\r\n",17870.0,25.0,,Series,2021,
2,The Walking Dead,2010–2022,"\r\nDrama, Horror, Thriller",8.2,\r\nSheriff Deputy Rick Grimes wakes up from a coma to learn the world is in ruins and must lead a group of survivors to stay alive.,"\r\n \r\n Stars:\r\nAndrew Lincoln, \r\nNorman Reedus, \r\nMelissa McBride, \r\nLauren Cohan\r\n",885805.0,44.0,,Series,2010,2022.0
3,Rick and Morty,2013–,"\r\nAnimation, Adventure, Comedy",9.2,\r\nAn animated series that follows the exploits of a super scientist and his not-so-bright grandson.,"\r\n \r\n Stars:\r\nJustin Roiland, \r\nChris Parnell, \r\nSpencer Grammer, \r\nSarah Chalke\r\n",414849.0,23.0,,Series,2013,
4,Army of Thieves,2021,"\r\nAction, Crime, Horror",,"\r\nA prequel, set before the events of Army of the Dead, which focuses on German safecracker Ludwig Dieter leading a group of aspiring thieves on a top secret heist during the early stages of the zombie apocalypse.","\r\n Director:\r\nMatthias Schweighöfer\r\n| \r\n Stars:\r\nMatthias Schweighöfer, \r\nNathalie Emmanuel, \r\nRuby O. Fee, \r\nStuart Martin\r\n",,,,Movie,2021,


In [8]:
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,YEAR_FROM,YEAR_TO
8740,The Woods,2020,"\r\nCrime, Drama, Mystery",6.8,"\r\nA student's essay spooks Laura, and terrifying memories of the woods come rushing back. Pawel presses Inspector Jork to speak with Artur's parents.","\r\n Director:\r\nLeszek Dawid\r\n| \r\n Stars:\r\nGrzegorz Damiecki, \r\nAgnieszka Grochowska, \r\nHubert Milkowski, \r\nWiktoria Filus\r\n",160.0,,,Movie,2020,
8887,Transformers: War for Cybertron,2020–2021,"\r\nAnimation, Action, Adventure",6.6,\r\nMegatron's continued use of the Golden Disk to guide his actions threatens to derail Optimus Prime's attempt to obtain the Allspark.,"\r\n Directors:\r\nTakashi Kamei, \r\nKazuma Shimizu\r\n| \r\n Stars:\r\nJason Marnocha, \r\nJake Foushee, \r\nJustin Luther, \r\nFrank Todaro\r\n",53.0,26.0,,Series,2020,2021.0
9235,Dive Club,2021–,"\r\nDrama, Family, Mystery",,\r\nThe truth is pieced together as one of the girls is forced to choose between friends and family.,"\r\n Director:\r\nRhiannon Bannenberg\r\n| \r\n Stars:\r\nMiah Madden, \r\nGeorgia-May Davis, \r\nSana'a Shaik, \r\nAubri Ibrag\r\n",,,,Series,2021,
6697,Never Have I Ever,2020–,\r\nComedy,8.2,"\r\nAfter another fallout with her friends, Devi grapples with unresolved trauma. A party at Ben's turns out to be equal parts awkward, awful and amazing.","\r\n Director:\r\nAnu Valia\r\n| \r\n Stars:\r\nMaitreyi Ramakrishnan, \r\nRicha Moorjani, \r\nJaren Lewison, \r\nDarren Barnet\r\n",674.0,28.0,,Series,2020,
7639,BoJack Horseman,2014–2020,"\r\nAnimation, Comedy, Drama",9.3,"\r\nAfter the Sarah Lynn story breaks, BoJack gives a live interview on TV. Diane meets Guy's teenage son.","\r\n Director:\r\nAaron Long\r\n| \r\n Stars:\r\nWill Arnett, \r\nAmy Sedaris, \r\nAlison Brie, \r\nPaul F. Tompkins\r\n",2578.0,26.0,,Series,2014,2020.0
1316,Rocco,2016,\r\nDocumentary,5.7,\r\nA behind-the-scene account of the porn world and its stars as they've never been seen before - and the no-holds-barred portrait of a true giant.,"\r\n Directors:\r\nThierry Demaizière, \r\nAlban Teurlai\r\n| \r\n Stars:\r\nRocco Siffredi, \r\nAnikka Albrite, \r\nVeronica Avluv, \r\nEva Berger\r\n",3700.0,105.0,,Movie,2016,
748,Spartacus: Gods of the Arena,2011,"\r\nAction, Adventure, Biography",8.5,"\r\nIn the time before the arrival of Spartacus, the House of Batiatus faces many challenges from competitors, and within its own household.","\r\n \r\n Stars:\r\nJohn Hannah, \r\nManu Bennett, \r\nPeter Mensah, \r\nDustin Clare\r\n",130057.0,331.0,,Movie,2011,
5624,Mr. Peabody & Sherman: A Journey WABAC,2014,"\r\nDocumentary, Short, Family",7.7,\r\nGo WABAC in time to 1959 for a look at the groundbreaking cartoon Peabody's Improbable History and get a sneak peek at the film Mr. Peabody & Sherman.,"\r\n \r\n Stars:\r\nTy Burrell, \r\nMax Charles, \r\nStephen Colbert, \r\nLeslie Mann\r\n",106.0,,,Movie,2014,
3562,Pete Davidson: Alive from New York,2020 TV Special,\r\nComedy,6.1,\r\nComedian Pete Davidson's intimate and candid stand-up special shot live in New York City.,"\r\n Director:\r\nJason Orley\r\n| \r\n Stars:\r\nPete Davidson, \r\nElliot Schiff\r\n",3278.0,49.0,,Movie,2020,
6799,Dexter,2006–2013,"\r\nCrime, Drama, Mystery",8.7,"\r\nDexter discovers that the man is still alive who murdered his mother in front of him as a boy, and confronts the killer as part of his recovery from addiction - but he also discovers that old impulses die hard.","\r\n Director:\r\nKeith Gordon\r\n| \r\n Stars:\r\nMichael C. Hall, \r\nJulie Benz, \r\nJennifer Carpenter, \r\nErik King\r\n",4236.0,56.0,,Series,2006,2013.0


In [9]:
df_test = pd.DataFrame(data = {"test": [None, np.nan, pd.NA]})
df_test

Unnamed: 0,test
0,
1,
2,


In [10]:
df_test.isna()

Unnamed: 0,test
0,True
1,True
2,True


In [11]:
for index, row in df_test.iterrows():
    obj = row["test"]
    print(obj, type(obj), obj is None, pd.isna(obj))

None <class 'NoneType'> True True
nan <class 'float'> False True
<NA> <class 'pandas._libs.missing.NAType'> False True


In [12]:
df[df.index == 1165]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,YEAR_FROM,YEAR_TO
1165,Hustle,III,"\r\nComedy, Drama, Sport",,\r\nA washed-up basketball scout discovers a phenomenal street ball player while in China and sees the prospect as his opportunity to get back into the NBA.,"\r\n Director:\r\nJeremiah Zagar\r\n| \r\n Stars:\r\nAdam Sandler, \r\nBen Foster, \r\nRobert Duvall, \r\nQueen Latifah\r\n",,,,Movie,,


In [13]:
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,YEAR_FROM,YEAR_TO
0,Blood Red Sky,2021,"\r\nAction, Horror, Thriller",6.1,\r\nA woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlantic overnight flight.,"\r\n Director:\r\nPeter Thorwarth\r\n| \r\n Stars:\r\nPeri Baumeister, \r\nCarl Anton Koch, \r\nAlexander Scheer, \r\nKais Setti\r\n",21062.0,121.0,,Movie,2021,
1,Masters of the Universe: Revelation,2021–,"\r\nAnimation, Action, Adventure",5.0,\r\nThe war for Eternia begins again in what may be the final battle between He-Man and Skeletor. A new animated series from writer-director Kevin Smith.,"\r\n \r\n Stars:\r\nChris Wood, \r\nSarah Michelle Gellar, \r\nLena Headey, \r\nMark Hamill\r\n",17870.0,25.0,,Series,2021,
2,The Walking Dead,2010–2022,"\r\nDrama, Horror, Thriller",8.2,\r\nSheriff Deputy Rick Grimes wakes up from a coma to learn the world is in ruins and must lead a group of survivors to stay alive.,"\r\n \r\n Stars:\r\nAndrew Lincoln, \r\nNorman Reedus, \r\nMelissa McBride, \r\nLauren Cohan\r\n",885805.0,44.0,,Series,2010,2022.0
3,Rick and Morty,2013–,"\r\nAnimation, Adventure, Comedy",9.2,\r\nAn animated series that follows the exploits of a super scientist and his not-so-bright grandson.,"\r\n \r\n Stars:\r\nJustin Roiland, \r\nChris Parnell, \r\nSpencer Grammer, \r\nSarah Chalke\r\n",414849.0,23.0,,Series,2013,
4,Army of Thieves,2021,"\r\nAction, Crime, Horror",,"\r\nA prequel, set before the events of Army of the Dead, which focuses on German safecracker Ludwig Dieter leading a group of aspiring thieves on a top secret heist during the early stages of the zombie apocalypse.","\r\n Director:\r\nMatthias Schweighöfer\r\n| \r\n Stars:\r\nMatthias Schweighöfer, \r\nNathalie Emmanuel, \r\nRuby O. Fee, \r\nStuart Martin\r\n",,,,Movie,2021,


In [14]:
df = df.astype({"YEAR_FROM": "Int32"})
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,YEAR_FROM,YEAR_TO
0,Blood Red Sky,2021,"\r\nAction, Horror, Thriller",6.1,\r\nA woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlantic overnight flight.,"\r\n Director:\r\nPeter Thorwarth\r\n| \r\n Stars:\r\nPeri Baumeister, \r\nCarl Anton Koch, \r\nAlexander Scheer, \r\nKais Setti\r\n",21062.0,121.0,,Movie,2021,
1,Masters of the Universe: Revelation,2021–,"\r\nAnimation, Action, Adventure",5.0,\r\nThe war for Eternia begins again in what may be the final battle between He-Man and Skeletor. A new animated series from writer-director Kevin Smith.,"\r\n \r\n Stars:\r\nChris Wood, \r\nSarah Michelle Gellar, \r\nLena Headey, \r\nMark Hamill\r\n",17870.0,25.0,,Series,2021,
2,The Walking Dead,2010–2022,"\r\nDrama, Horror, Thriller",8.2,\r\nSheriff Deputy Rick Grimes wakes up from a coma to learn the world is in ruins and must lead a group of survivors to stay alive.,"\r\n \r\n Stars:\r\nAndrew Lincoln, \r\nNorman Reedus, \r\nMelissa McBride, \r\nLauren Cohan\r\n",885805.0,44.0,,Series,2010,2022.0
3,Rick and Morty,2013–,"\r\nAnimation, Adventure, Comedy",9.2,\r\nAn animated series that follows the exploits of a super scientist and his not-so-bright grandson.,"\r\n \r\n Stars:\r\nJustin Roiland, \r\nChris Parnell, \r\nSpencer Grammer, \r\nSarah Chalke\r\n",414849.0,23.0,,Series,2013,
4,Army of Thieves,2021,"\r\nAction, Crime, Horror",,"\r\nA prequel, set before the events of Army of the Dead, which focuses on German safecracker Ludwig Dieter leading a group of aspiring thieves on a top secret heist during the early stages of the zombie apocalypse.","\r\n Director:\r\nMatthias Schweighöfer\r\n| \r\n Stars:\r\nMatthias Schweighöfer, \r\nNathalie Emmanuel, \r\nRuby O. Fee, \r\nStuart Martin\r\n",,,,Movie,2021,


### Genre

In [15]:
df["GENRE"] = df["GENRE"].str.replace("\r\n", "", regex=False)
df.sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,YEAR_FROM,YEAR_TO
8147,Pandemic: How to Prevent an Outbreak,2020–,Documentary,6.5,"\r\nWorldwide, scientists test animals and their handlers for emerging viruses. In the U.S. and India, doctors work long hours caring for flu patients.","\r\n Directors:\r\nIsabel Castro, \r\nArianna LaPenne, \r\nDanni Mynard, \r\nDoug Shultz\r\n| \r\n Stars:\r\nHolly Goracke, \r\nDinesh Vijay, \r\nRaghu Sharma, \r\nDennis Carroll\r\n",123,,,Series,2020,
2013,Coco avant Chanel,2009,"Biography, Drama",6.7,\r\nThe story of Coco Chanel's rise from obscure beginnings to the heights of the fashion world.,"\r\n Director:\r\nAnne Fontaine\r\n| \r\n Stars:\r\nAudrey Tautou, \r\nBenoît Poelvoorde, \r\nAlessandro Nivola, \r\nMarie Gillain\r\n",39401,105.0,$6.11M,Movie,2009,
5034,Iron Ladies,2018–,Romance,5.9,"\r\nChou Kai-Ting, Ma Li-Sha and Wang Ching-Ching are women in their thirties, working as top managers in an e-commerce cosmetic website ""Love U shop"". They have beautiful appearance, perfect ... See full summary »\r\n","\r\n \r\n Stars:\r\nAviis Zhong, \r\nBen Wu, \r\nAda Pan, \r\nWills Sia\r\n",48,75.0,,Series,2018,
5698,Voltron 84,2017,"Animation, Action, Sci-Fi",8.2,\r\nAdd a Plot\r\n,"\r\n \r\n Stars:\r\nJeffrey Michael Adams, \r\nSteve Ahn, \r\nChristine Bian, \r\nKimberly Brooks\r\n",58,,,Movie,2017,
6789,Baby,2018–2020,Drama,7.9,"\r\nLudo gets infuriating news, Chiara and her parents start therapy, and Saverio invites the girls to a secret party, where a familiar face shows up.","\r\n Director:\r\nAndrea De Sica\r\n| \r\n Stars:\r\nBenedetta Porcaroli, \r\nAlice Pagani, \r\nRiccardo Mandolini, \r\nBrando Pacitto\r\n",117,41.0,,Series,2018,2020.0


In [16]:
genre = "Crime, Drama"
genre.split(",")

['Crime', ' Drama']

In [17]:
genre = "Drama, Family"
genre.split(",")

['Drama', ' Family']

In [18]:
df["GENRE"] = df["GENRE"].str.replace(" ", "", regex=False)
df.sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,YEAR_FROM,YEAR_TO
923,Devilman: Crybaby,2018,"Animation,Action,Fantasy",7.6,"\r\nWith demons reawakened and humanity in turmoil, a sensitive demon-boy is led into a brutal, degenerate war against evil by his mysterious friend, Ryo.","\r\n \r\n Stars:\r\nKoki Uchiyama, \r\nAyumu Murase, \r\nGriffin Burns, \r\nKyle McCarley\r\n",15076.0,25.0,,Movie,2018,
2586,Ni no kuni,2019,"Animation,Action,Adventure",6.1,"\r\nHigh schooler Yuu and his friend Haru get involved in a case involving his childhood friend Kotona, which forces them to go back and forth between another world that is different but is ... See full summary »\r\n","\r\n Director:\r\nYoshiyuki Momose\r\n| \r\n Stars:\r\nKento Yamazaki, \r\nTucker Chandler, \r\nRay Chase, \r\nGreg Chun\r\n",2909.0,106.0,,Movie,2019,
3041,Memorias de Idhún,2020–2021,"Animation,Action,Adventure",5.4,"\r\nThe day the three Suns and three Moons came together in alignment over Idhun, Ashran, the necromancer, seized power and the reign of the winged snakes started.","\r\n \r\n Stars:\r\nItzan Escamilla, \r\nMichelle Jenner, \r\nSergio Mur, \r\nNico Romero\r\n",475.0,25.0,,Series,2020,2021.0
5116,The Negotiators,2019–,Crime,,\r\nA deep dive into one of the world's most high-pressure professions: hostage negotiation.,\r\n,,,,Series,2019,
9723,Centaurworld,2021–,"Animation,Adventure,Comedy",8.0,"\r\nThe friends seek shelter in a cave during a storm - but they're not alone. While looking for an exit, they discover strange paintings on the walls.","\r\n \r\n Stars:\r\nKimiko Glenn, \r\nMegan Hilty, \r\nParvesh Cheena, \r\nChris Diamantopoulos\r\n",8.0,26.0,,Series,2021,


In [19]:
df_dummies = df["GENRE"].str.get_dummies(sep=",")
df_dummies = df_dummies.add_prefix("Genre_")
df_dummies.head()

Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df_dummies.shape

(9999, 27)

In [21]:
df = df.merge(df_dummies, how="inner", left_index=True, right_index=True)
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
0,Blood Red Sky,2021,"Action,Horror,Thriller",6.1,\r\nA woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlantic overnight flight.,"\r\n Director:\r\nPeter Thorwarth\r\n| \r\n Stars:\r\nPeri Baumeister, \r\nCarl Anton Koch, \r\nAlexander Scheer, \r\nKais Setti\r\n",21062.0,121.0,,Movie,...,0,0,0,0,0,0,0,1,0,0
1,Masters of the Universe: Revelation,2021–,"Animation,Action,Adventure",5.0,\r\nThe war for Eternia begins again in what may be the final battle between He-Man and Skeletor. A new animated series from writer-director Kevin Smith.,"\r\n \r\n Stars:\r\nChris Wood, \r\nSarah Michelle Gellar, \r\nLena Headey, \r\nMark Hamill\r\n",17870.0,25.0,,Series,...,0,0,0,0,0,0,0,0,0,0
2,The Walking Dead,2010–2022,"Drama,Horror,Thriller",8.2,\r\nSheriff Deputy Rick Grimes wakes up from a coma to learn the world is in ruins and must lead a group of survivors to stay alive.,"\r\n \r\n Stars:\r\nAndrew Lincoln, \r\nNorman Reedus, \r\nMelissa McBride, \r\nLauren Cohan\r\n",885805.0,44.0,,Series,...,0,0,0,0,0,0,0,1,0,0
3,Rick and Morty,2013–,"Animation,Adventure,Comedy",9.2,\r\nAn animated series that follows the exploits of a super scientist and his not-so-bright grandson.,"\r\n \r\n Stars:\r\nJustin Roiland, \r\nChris Parnell, \r\nSpencer Grammer, \r\nSarah Chalke\r\n",414849.0,23.0,,Series,...,0,0,0,0,0,0,0,0,0,0
4,Army of Thieves,2021,"Action,Crime,Horror",,"\r\nA prequel, set before the events of Army of the Dead, which focuses on German safecracker Ludwig Dieter leading a group of aspiring thieves on a top secret heist during the early stages of the zombie apocalypse.","\r\n Director:\r\nMatthias Schweighöfer\r\n| \r\n Stars:\r\nMatthias Schweighöfer, \r\nNathalie Emmanuel, \r\nRuby O. Fee, \r\nStuart Martin\r\n",,,,Movie,...,0,0,0,0,0,0,0,0,0,0


In [22]:
df.columns

Index(['MOVIES', 'YEAR', 'GENRE', 'RATING', 'ONE-LINE', 'STARS', 'VOTES',
       'RunTime', 'Gross', 'TYPE', 'YEAR_FROM', 'YEAR_TO', 'Genre_Action',
       'Genre_Adventure', 'Genre_Animation', 'Genre_Biography', 'Genre_Comedy',
       'Genre_Crime', 'Genre_Documentary', 'Genre_Drama', 'Genre_Family',
       'Genre_Fantasy', 'Genre_Film-Noir', 'Genre_Game-Show', 'Genre_History',
       'Genre_Horror', 'Genre_Music', 'Genre_Musical', 'Genre_Mystery',
       'Genre_News', 'Genre_Reality-TV', 'Genre_Romance', 'Genre_Sci-Fi',
       'Genre_Short', 'Genre_Sport', 'Genre_Talk-Show', 'Genre_Thriller',
       'Genre_War', 'Genre_Western'],
      dtype='object')

In [23]:
df.sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
9368,Gojira shingyura pointo,2021–,"Animation,Action,Adventure",7.5,"\r\nGodzilla continues to advance on Tokyo, covering the city in red dust. The Otaki crew rushes to rescue a person ensnared by spider-like monster.","\r\n Directors:\r\nDaisuke Chiba, \r\nTakuma Suzuki, \r\nAtsushi Takahashi\r\n| \r\n Star:\r\nCristina Valenzuela\r\n",51.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
2686,The Girls I've Been,,Thriller,,\r\nA con artist named Nora uses her powers of persuasion and impersonation to get herself and her friends out of dangerous situations.,\r\n \r\n Star:\r\nMillie Bobby Brown\r\n,,,,Series,...,0,0,0,0,0,0,0,1,0,0
1215,Noragami,2014–2016,"Animation,Action,Adventure",7.9,"\r\nA minor god seeking to gain widespread worship teams up with a human girl he saved to gain fame, recognition and at least one shrine dedicated to him.","\r\n \r\n Stars:\r\nHiroshi Kamiya, \r\nMaaya Uchida, \r\nYûki Kaji, \r\nBryn Apprill\r\n",12511.0,24.0,,Series,...,0,0,0,0,0,0,0,0,0,0
2624,Grizzy and the Lemmings,2007–,"Animation,Comedy,Family",7.0,"\r\nThe forest ranger's house is the only area of human civilization in the middle of untamed wilderness in a vast natural reserve in Canada. When the ranger is away, a bear named Grizzy feels ... See full summary »\r\n","\r\n \r\n Stars:\r\nPierre-Alain de Garrigues, \r\nJosselin Charier\r\n",445.0,7.0,,Series,...,0,0,0,0,0,0,0,0,0,0
5222,Ellen DeGeneres: Relatable,2018 TV Special,Comedy,6.5,\r\nComedian Ellen DeGeneres performs a live stand-up comedy set in Seattle.,"\r\n Directors:\r\nJoel Gallen, \r\nTig Notaro\r\n| \r\n Stars:\r\nLaura Dern, \r\nEllen DeGeneres, \r\nEddie Vedder, \r\nJill Vedder\r\n",4482.0,68.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


### Stars

In [24]:
df["STARS"] = df["STARS"].str.replace("\r\n", "", regex=False)
df.sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
5422,Thomas & Friends: Marvelous Machinery,2020,"Animation,Action,Adventure",6.0,\r\nAdd a Plot\r\n,"Director:Joey So| Stars:Laura Beaumont, Jules de Jongh, Teresa Gallagher, Bob Golding",9,45.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
4509,No Estoy Loca,2018,"Comedy,Drama",5.7,\r\nCarolina discovers that she can't have children the same day that her husband leaves her for her best friend. She attempts suicide and is hospitalized in a psychiatric clinic.,"Director:Nicolás López| Stars:Paz Bascuñán, Marcial Tagle, Antonia Zegers, Carolina Paulsen",894,116.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
1005,Zwartboek,2006,"Drama,Thriller,War",7.7,"\r\nIn the Nazi-occupied Netherlands during World War II, a Jewish singer infiltrates the regional Gestapo headquarters for the Dutch resistance.","Director:Paul Verhoeven| Stars:Carice van Houten, Sebastian Koch, Thom Hoffman, Halina Reijn",74139,145.0,$4.40M,Movie,...,0,0,0,0,0,0,0,1,1,0
5272,Sunrise,I 2014,"Crime,Drama,Mystery",6.3,"\r\nBefore sunrise, Joshi must catch the elusive figure that destroys the lives of children.","Director:Partho Sen-Gupta| Stars:Adil Hussain, Tannishtha Chatterjee, Gulnaaz Ansari, Komal Gupta",279,85.0,$0.00M,Movie,...,0,0,0,0,0,0,0,0,0,0
5002,Sugar High,2020 TV Special,Reality-TV,7.3,"\r\nContestants compete in two rounds of sugar sculpting challenges for a prize of $10,000.","Director:Ariel Boles| Stars:Hunter March, Jackie Sorkin, Rebecca DeAngelis, Stéphane Tréand",136,44.0,,Movie,...,0,1,0,0,0,0,0,0,0,0


In [25]:
df["Directors"] = None
df["Actors"] = None

def extract_directors(x):
    if "Director:" in x or "Directors:" in x:
        stars = x.split("|")
        if "Director:" in stars[0] or "Directors:" in stars[0]:
            return stars[0]
        else:
            return stars[1]
    else:
        return None

def extract_actors(x):
    if "Star:" in x or "Stars:" in x:
        stars = x.split("|")
        if "Star:" in stars[0] or "Stars:" in stars[0]:
            return stars[0]
        else:
            return stars[1]
    else:
        return None

df["Directors"] = df["STARS"].progress_apply(extract_directors)
df["Actors"] = df["STARS"].progress_apply(extract_actors)

df["Directors"] = df["Directors"].str.replace("Director:", "", regex=False)
df["Directors"] = df["Directors"].str.replace("Directors:", "", regex=False)
df["Actors"] = df["Actors"].str.replace("Star:", "", regex=False)
df["Actors"] = df["Actors"].str.replace("Stars:", "", regex=False)

df.sample(5)

100%|██████████████████████████████████████████████████████████████████████████| 9999/9999 [00:00<00:00, 664662.04it/s]
100%|██████████████████████████████████████████████████████████████████████████| 9999/9999 [00:00<00:00, 499028.40it/s]


Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western,Directors,Actors
8757,Floor Is Lava,2020–,"Game-Show,Reality-TV",6.4,\r\nAdd a Plot\r\n,Director:Brian Smith| Star:Rutledge Wood,63.0,26.0,,Series,...,0,0,0,0,0,0,0,0,Brian Smith,Rutledge Wood
71,Prison Break,2005–2017,"Action,Crime,Drama",8.3,"\r\nDue to a political conspiracy, an innocent man is sent to death row and his only hope is his brother, who makes it his mission to deliberately get himself sent to the same prison in order to break the both of them out, from the inside.","Stars:Dominic Purcell, Wentworth Miller, Amaury Nolasco, Robert Knepper",492438.0,44.0,,Series,...,0,0,0,0,0,0,0,0,,"Dominic Purcell, Wentworth Miller, Amaury Nolasco, Robert Knepper"
2757,The Silent Sea,2021–,"Adventure,Drama,Horror",,\r\nA precarious future where Earth is running out of water. It follows a group of elite scientists who set off for the moon to retrieve some unknown samples from an abandoned research station.,"Stars:Bae Doona, Joon Lee, Gong Yoo, Heo Sung-tae",,,,Series,...,0,0,0,0,0,0,0,0,,"Bae Doona, Joon Lee, Gong Yoo, Heo Sung-tae"
3267,The Trial,2010,"Crime,Drama,Mystery",5.6,"\r\nAfter the horrific death of his wife and two sons, suicide seems to be the only escape for a small town attorney, until he's assigned a capital punishment case that begins to transform his life.","Director:Gary Wheeler| Stars:Larry Bagby, Clare Carey, Nikki Deloach, David Dwyer",1462.0,101.0,$0.02M,Movie,...,0,0,0,0,0,0,0,0,Gary Wheeler,"Larry Bagby, Clare Carey, Nikki Deloach, David Dwyer"
1647,La noche de 12 años,2018,"Biography,Crime,Drama",7.5,"\r\n1973. Uruguay is governed by a military dictatorship. One autumn night, three Tupamaro prisoners are taken from their jail cells in a secret military operation. The order is precise: ""As we... See full summary »\r\n","Director:Álvaro Brechner| Stars:Antonio de la Torre, Chino Darín, Alfonso Tort, César Troncoso",9150.0,122.0,,Movie,...,0,0,0,0,0,0,0,0,Álvaro Brechner,"Antonio de la Torre, Chino Darín, Alfonso Tort, César Troncoso"


In [26]:
df_dummies = df["Directors"].str.get_dummies(sep=", ")
df_dummies = df_dummies.add_prefix("Director_")
df_dummies.head()

Unnamed: 0,Director_ Aadish Keluskar,Director_ Aaron Augenblick,Director_ Aaron Burns,Director_ Aaron Hann,Director_ Aaron Lieber,Director_ Aaron Long,Director_ Aaron Sorkin,Director_ Abbas Alibhai Burmawalla,Director_ Abdelhamid Bouchnak,Director_ Abhay Kumar,...,Director_Yoshinori Odaka,Director_Yoshitaka Takeuch,Director_Yoshiyuki Okada,Director_Young Kyun Park,Director_Yudai Yamaguchi,Director_Yue Song,Director_Zara Serabian-Arthur,Director_Zesung Kang,Director_Àlex Pastor,Director_Óscar Pedraza
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df = df.merge(df_dummies, how="inner", left_index=True, right_index=True)
df.sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Yoshinori Odaka,Director_Yoshitaka Takeuch,Director_Yoshiyuki Okada,Director_Young Kyun Park,Director_Yudai Yamaguchi,Director_Yue Song,Director_Zara Serabian-Arthur,Director_Zesung Kang,Director_Àlex Pastor,Director_Óscar Pedraza
8734,Kipo and the Age of Wonderbeasts,2020,"Animation,Action,Adventure",8.2,"\r\nKipo and company search for answers -and an anchor -in her parents' old quarter. But are they alone? Above, mysterious forces gather against a ruler.","Directors:Young Ki Yoon, Bridget Underwood| Stars:Karen Fukuhara, Sydney Mikayla, Deon Cole, Coy Stewart",99.0,,,Movie,...,0,0,0,0,0,0,0,0,0,0
7938,The Umbrella Academy,2019–,"Action,Adventure,Comedy",9.1,"\r\nReeling from the events at Dealey Plaza, the siblings head to the farm to help save Harlan - only to find themselves drawn into a deadly showdown.","Director:Jeremy Webb| Stars:Elliot Page, Tom Hopper, David Castañeda, Emmy Raver-Lampman",3928.0,49.0,,Series,...,0,0,0,0,0,0,0,0,0,0
9407,Caïd,2021–,"Action,Crime,Drama",7.6,\r\nAdd a Plot\r\n,Star:Jean-Toussaint Bernard,31.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
9009,Buddi,2020–,Animation,,"\r\nWith kindness, curiosity and childlike wonder, five best friends explore their colourful world and find the extraordinary in everyday things.","Directors:Robert Milne, Karen Ullmann, Jason Jameson| Stars:Greta Jameson, Ralf Jameson, Clementine Laikin, Felix Laikin",,,,Series,...,0,0,0,0,0,0,0,0,0,0
9800,Hagane no renkinjutsushi,2009–2012,"Animation,Action,Adventure",8.9,"\r\nWhen the conspiracy surrounding the Philosopher's Stone closes in on the highest ranks of the military, one of Mustang's most trusted and cherished allies is murdered.","Director:Yasuhiro Irie| Stars:Vic Mignogna, Maxey Whitehead, Sonny Strait, Travis Willingham",1358.0,30.0,,Series,...,0,0,0,0,0,0,0,0,0,0


In [28]:
df_dummies = df["Actors"].str.get_dummies(sep=", ")
df_dummies = df_dummies.add_prefix("Actor_")
df_dummies.head()

Unnamed: 0,Actor_ 2'Live Bre,Actor_ Aakash Gupta,Actor_ Aarnaa Sharma,Actor_ Aaron Irvin,Actor_ Aaron Stanford,Actor_ Aayush Ailawadi,Actor_ Abbi Jacobson,Actor_ Abby Bergman,Actor_ Abdramane Diakite,Actor_ Abdur-Rahman Muhammad,...,Actor_Özge Borak,Actor_Özge Özpirinçci,Actor_Özgür Emre Yildirim,Actor_Özgür Ozan,Actor_Özkan Ugur,Actor_Özz Nûjen,Actor_Úrsula Corberó,Actor_Úrsula Pruneda,Actor_Ülkü Duru,Actor_Þorsteinn Bachmann
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
df = df.merge(df_dummies, how="inner", left_index=True, right_index=True)
df.sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Actor_Özge Borak,Actor_Özge Özpirinçci,Actor_Özgür Emre Yildirim,Actor_Özgür Ozan,Actor_Özkan Ugur,Actor_Özz Nûjen,Actor_Úrsula Corberó,Actor_Úrsula Pruneda,Actor_Ülkü Duru,Actor_Þorsteinn Bachmann
3619,Milly & Mamet: Ini Bukan Cinta & Rangga,2018,"Comedy,Drama",7.4,"\r\nMamet obeyed Milly's request to stop working at a banker bank to raise their children, to work in his in-laws factory to meet Alexandra who again invited her to become a chef at the restaurant of her second dream while in college.","Director:Ernest Prakasa| Stars:Julie Estelle, Yoshi Sudarso, Adinia Wirasti, Dian Sastrowardoyo",684.0,101.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
3179,Bad Boy Billionaires: India,2020–,"Documentary,Biography,Crime",8.0,"\r\nThis investigative docuseries explores the greed, fraud and corruption that built up - and ultimately brought down - India's most infamous tycoons.",Star:Aayush Ailawadi,2687.0,59.0,,Series,...,0,0,0,0,0,0,0,0,0,0
7239,The Sandman,2021–,"Action,Adventure,Drama",,\r\nAdd a Plot\r\n,"Director:Jamie Childs| Stars:Jenna Coleman, David Thewlis, Patton Oswalt, Kirby Howell-Baptiste",,,,Series,...,0,0,0,0,0,0,0,0,0,0
3112,26 nyeon,2012,"Drama,Thriller",6.3,"\r\n26 years ago, state troops were ordered to open fire on civilians in the city of Gwangju who were demonstrating as apart of a democratic movement. Thousands of civilians were killed. Now, a... See full summary »\r\n","Director:Geun-hyun Cho| Stars:Jin Goo, Hye-jin Han, Soo-bin Bae, Seul-ong Im",1054.0,135.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
1490,Gekijouban Bishoujo Senshi Sailor Moon Eternal,2021,"Animation,Action,Adventure",7.1,"\r\nWhen a dark power enshrouds the Earth after a total solar eclipse, the scattered Sailor Guardians must reunite to bring light back into the world.","Director:Chiaki Kon| Stars:Kotono Mitsuishi, Stephanie Sheh, Kate Higgins, Cristina Valenzuela",932.0,160.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df.shape

(9999, 23832)

## Missing Values

In [42]:
column_names = ["MOVIES", "GENRE", "RATING", "ONE-LINE",
                "STARS", "VOTES", "RunTime", "Gross",
                "TYPE", "YEAR_FROM", "YEAR_TO"]
def check_missing():
    df_missing = df[column_names].isna().sum().to_frame()
    df_missing["percentage"] = (df_missing[0] / df.shape[0]) * 100
    return df_missing

check_missing()

Unnamed: 0,0,percentage
MOVIES,0,0.0
GENRE,80,0.80008
RATING,1820,18.20182
ONE-LINE,0,0.0
STARS,0,0.0
VOTES,1820,18.20182
RunTime,0,0.0
Gross,9539,95.39954
TYPE,0,0.0
YEAR_FROM,748,7.480748


### YEAR

In [36]:
df["YEAR_TO"] = np.where(df["YEAR_TO"].isna(), df["YEAR_FROM"], df["YEAR_TO"])
check_missing()

Unnamed: 0,0,percentage
MOVIES,0,0.0
GENRE,80,0.80008
RATING,1820,18.20182
ONE-LINE,0,0.0
STARS,0,0.0
VOTES,1820,18.20182
RunTime,2958,29.582958
Gross,9539,95.39954
TYPE,0,0.0
YEAR_FROM,748,7.480748


### RunTime

In [38]:
df[["RunTime"]].describe()

Unnamed: 0,RunTime
count,7041.0
mean,68.688539
std,47.258056
min,1.0
25%,36.0
50%,60.0
75%,95.0
max,853.0


In [39]:
df[["RunTime"]][df["TYPE"] == "Movie"].describe()

Unnamed: 0,RunTime
count,4137.0
mean,89.269761
std,46.489358
min,1.0
25%,64.0
50%,90.0
75%,105.0
max,573.0


In [40]:
df[["RunTime"]][df["TYPE"] == "Series"].describe()

Unnamed: 0,RunTime
count,2904.0
mean,39.368802
std,29.549575
min,1.0
25%,24.0
50%,38.0
75%,47.0
max,853.0


In [41]:
df.loc[(df["RunTime"].isna()) & (df["TYPE"] == "Movie"), "RunTime"] = 90
df.loc[(df["RunTime"].isna()) & (df["TYPE"] == "Series"), "RunTime"] = 40
check_missing()

Unnamed: 0,0,percentage
MOVIES,0,0.0
GENRE,80,0.80008
RATING,1820,18.20182
ONE-LINE,0,0.0
STARS,0,0.0
VOTES,1820,18.20182
RunTime,0,0.0
Gross,9539,95.39954
TYPE,0,0.0
YEAR_FROM,748,7.480748


### Missing Values in Row

In [48]:
df_missing = df[column_names].isna().sum(axis=1).to_frame()
df_missing.describe()

Unnamed: 0,0
count,9999.0
mean,1.475648
std,1.226247
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,6.0


In [50]:
df[df_missing[0] > 4].sample(3)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Actor_Özge Borak,Actor_Özge Özpirinçci,Actor_Özgür Emre Yildirim,Actor_Özgür Ozan,Actor_Özkan Ugur,Actor_Özz Nûjen,Actor_Úrsula Corberó,Actor_Úrsula Pruneda,Actor_Ülkü Duru,Actor_Þorsteinn Bachmann
9295,Archive 81,,"Drama,Horror,Sci-Fi",,\r\nAdd a Plot\r\n,"Directors:Rebecca Thomas, James Wan| Stars:Evan Jonigkeit, Dina Shihabi, Matt McGorry, Martin Donovan",,60.0,,Series,...,0,0,0,0,0,0,0,0,0,0
9540,First Kill,,"Drama,Horror,Mystery",,\r\nAdd a Plot\r\n,"Stars:Elizabeth Mitchell, Gracie Dzienny, Sarah Catherine Hook, Will Swenson",,40.0,,Series,...,0,0,0,0,0,0,0,0,0,0
6252,Yuba,,Western,,\r\nProspectors look for gold in the lawless region of the Yuba river valley.,,,40.0,,Series,...,0,0,0,0,0,0,0,0,0,0


### Drop Null Values

In [51]:
df = df.dropna(subset=["GENRE", "RATING", "YEAR_FROM", "YEAR_TO"])
check_missing()

Unnamed: 0,0,percentage
MOVIES,0,0.0
GENRE,0,0.0
RATING,0,0.0
ONE-LINE,0,0.0
STARS,0,0.0
VOTES,0,0.0
RunTime,0,0.0
Gross,7708,94.368266
TYPE,0,0.0
YEAR_FROM,0,0.0


### Dropping Columns

In [52]:
df = df.drop(columns=["MOVIES", "YEAR", "GENRE", "ONE-LINE", "STARS", "VOTES",
                      "Gross", "Directors", "Actors"])
df.shape

(8168, 23824)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8168 entries, 0 to 9979
Columns: 23823 entries, RATING to Actor_Þorsteinn Bachmann
dtypes: Int32(1), float64(2), int64(23818), object(2)
memory usage: 1.4+ GB


In [58]:
df.dtypes

RATING                      float64
RunTime                     float64
TYPE                         object
YEAR_FROM                     Int32
YEAR_TO                      object
                             ...   
Actor_Özz Nûjen               int64
Actor_Úrsula Corberó          int64
Actor_Úrsula Pruneda          int64
Actor_Ülkü Duru               int64
Actor_Þorsteinn Bachmann      int64
Length: 23823, dtype: object

In [59]:
df = df.astype({"YEAR_TO": "Int32"})
df.dtypes

RATING                      float64
RunTime                     float64
TYPE                         object
YEAR_FROM                     Int32
YEAR_TO                       Int32
                             ...   
Actor_Özz Nûjen               int64
Actor_Úrsula Corberó          int64
Actor_Úrsula Pruneda          int64
Actor_Ülkü Duru               int64
Actor_Þorsteinn Bachmann      int64
Length: 23823, dtype: object

## Saving

In [63]:
df.to_parquet("data/movies-clean.parquet")